In [5]:
import numpy as np
from collections import Counter
import pandas as pd
import random

In [6]:
class K_Nearest_Neighbors:
    """Brute-force k-nearest-neighbours classifier using Euclidean distance.

    Attributes:
        ds: Training data as a mapping ``{group label: [feature vector, ...]}``.
        k: Number of nearest training points that vote on a prediction.
        confidence: Share of the votes won by the most recently predicted
            group; ``None`` until ``predict`` has been called.
    """

    def __init__(self, data_set, k):
        """Store the training data and the neighbour count.

        Args:
            data_set: Mapping from group label to a list of feature vectors.
            k: Number of neighbours to consult; must be at least 1.

        Raises:
            ValueError: If ``k`` is smaller than 1.
        """
        if k < 1:
            raise ValueError("k must be at least 1, got {!r}".format(k))
        self.ds = data_set
        self.k = k
        # Defined up front so that reading it before the first predict()
        # call does not raise AttributeError.
        self.confidence = None

    def predict(self, feature_set):
        """Return the majority group among the k nearest training points.

        Also updates ``self.confidence`` with the fraction of the consulted
        neighbours that voted for the returned group.

        Args:
            feature_set: Feature vector to classify; it must have the same
                length as the training feature vectors.

        Returns:
            The label of the winning group.

        Raises:
            ValueError: If the training data contains no feature vectors.
        """
        # Convert the query once instead of once per training point.
        query = np.array(feature_set)
        distances = []
        for group, features in self.ds.items():
            for feature in features:
                e_d = np.linalg.norm(np.array(feature) - query)
                distances.append((e_d, group))
        if not distances:
            raise ValueError("cannot predict: the training data set is empty")

        # Pairs sort by distance first; equal distances fall back to
        # comparing the group labels.
        nearest = sorted(distances)[:self.k]
        votes = [group for _, group in nearest]
        feature_set_group, vote_count = Counter(votes).most_common(1)[0]
        # Divide by the number of neighbours actually consulted: with fewer
        # than k training points, dividing by k would understate the result.
        self.confidence = vote_count / len(nearest)
        return feature_set_group

    def test(self, test_data):
        """Classify every sample in ``test_data`` and report the accuracy.

        Args:
            test_data: Mapping from the true group label to a list of
                feature vectors belonging to that group.

        Returns:
            The fraction of samples whose predicted group equals the true
            group, or ``0.0`` when ``test_data`` contains no samples. The
            value is also printed as ``Accuracy = <value>``.
        """
        correct = 0
        total = 0
        for group, feature_sets in test_data.items():
            for feature_set in feature_sets:
                if self.predict(feature_set) == group:
                    correct += 1
                total += 1
        # Guard against an empty test set instead of dividing by zero.
        accuracy = correct / total if total else 0.0
        print("Accuracy =", accuracy)
        return accuracy


In [7]:
# DATA
# Column names passed to read_csv: the class label followed by the 16
# feature columns.
data_names = [
    'letter', 'x-box', 'y-box', 'width', 'height', 'onpix', 'x-bar',
    'y-bar', 'x2bar', 'y2bar', 'xybar', 'x2ybr', 'xy2br', 'x-ege', 'xegvy',
    'y-ege', 'yegvx'
]

SEED = 42             # fixed seed so the shuffle, and therefore the split, is reproducible
TRAIN_FRACTION = 0.8  # 80% train, 20% test
K = 5                 # number of neighbours that vote on each prediction

df = pd.read_csv("letter-recognition.data", names=data_names)
data_set = df.values.tolist()

# Shuffle the rows with a dedicated, seeded generator so that re-running the
# cell yields the same split without touching the global `random` state.
random.Random(SEED).shuffle(data_set)

# Split the rows into training and test sets, grouped by letter.
train_data = {chr(i): [] for i in range(65, 91)}  # A-Z
test_data = {chr(i): [] for i in range(65, 91)}   # A-Z

split_index = int(TRAIN_FRACTION * len(data_set))
# Each row is [letter, feature_1, ..., feature_16]: the first element selects
# the group and the remaining elements form the feature vector.
for row in data_set[:split_index]:
    train_data[row[0]].append(row[1:])
for row in data_set[split_index:]:
    test_data[row[0]].append(row[1:])

# Build the classifier from the training split and evaluate it on the test split.
knn = K_Nearest_Neighbors(train_data, k=K)
knn.test(test_data)

Accuracy = 0.95725
