In [None]:
import numpy as np
from collections import Counter
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd

# Euclidean distance between two points
def euclidean_distance(point1, point2):
    """Return the Euclidean (L2) distance between two points.

    Both arguments may be any array-like NumPy accepts (lists, tuples,
    arrays). They are converted to floating point before subtracting, so
    unsigned-integer inputs (e.g. raw ``uint8`` pixel values) cannot wrap
    around and silently yield a wrong distance.

    Args:
        point1: First point, array-like of numbers.
        point2: Second point, array-like broadcastable against ``point1``.

    Returns:
        The distance as a NumPy floating-point scalar.
    """
    diff = np.asarray(point1, dtype=float) - np.asarray(point2, dtype=float)
    return np.sqrt(np.sum(diff ** 2))

# KNN prediction for a single test point
def knn_predict(training_data, training_labels, test_point, k):
    """Predict the label of one point by majority vote of its k nearest neighbours.

    Distances are Euclidean and computed for all training samples in a single
    vectorised NumPy expression instead of a Python-level loop.

    Args:
        training_data: Array-like of training samples, one sample per entry
            along the first axis.
        training_labels: Sequence of labels, indexable by position, with at
            least one label per training sample.
        test_point: The sample to classify, array-like with the same number
            of features as a training sample.
        k: Number of nearest neighbours that vote. If ``k`` exceeds the
            number of training samples, all samples vote.

    Returns:
        The most common label among the k nearest training samples. When
        several labels share the highest count, the one whose first
        occurrence is closest to ``test_point`` wins.

    Raises:
        ValueError: If ``k`` is smaller than 1, ``training_data`` is empty,
            or there are fewer labels than training samples.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")

    train = np.asarray(training_data, dtype=float)
    n_samples = len(train)
    if n_samples == 0:
        raise ValueError("training_data must contain at least one sample")
    if len(training_labels) < n_samples:
        raise ValueError(
            "training_labels must provide a label for every training sample"
        )

    # Flatten every sample to a feature vector so scalar, 1-D and
    # multi-dimensional samples are all handled the same way.
    train = train.reshape(n_samples, -1)
    query = np.asarray(test_point, dtype=float).reshape(-1)

    # Squared distances are enough for ranking: sqrt is monotonic, so the
    # neighbour order is the same and the root can be skipped.
    diff = train - query
    squared_distances = np.einsum("ij,ij->i", diff, diff)

    # A stable sort keeps equally distant samples in training order.
    nearest = np.argsort(squared_distances, kind="stable")[:k]

    votes = Counter(training_labels[i] for i in nearest.tolist())
    return votes.most_common(1)[0][0]

# Load the MNIST dataset
print("Downloading MNIST dataset...")
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
X = mnist.data
y = mnist.target.astype(int)

# Work on a subset so the run finishes in reasonable time: brute-force KNN
# compares every test point with every training point, so the cost grows
# with TRAIN_SIZE * TEST_SIZE.
TRAIN_SIZE = 5000
TEST_SIZE = 1000
K_NEIGHBORS = 12

# A fixed random_state makes the split, and so the reported accuracy,
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=TRAIN_SIZE, test_size=TEST_SIZE, random_state=42
)

# Scale pixel values to [0, 1]
X_train = X_train / 255.0
X_test = X_test / 255.0

# Predict a label for every sample in the test set
print("Running KNN...")
y_pred = [knn_predict(X_train, y_train, x, k=K_NEIGHBORS) for x in X_test]

# Accuracy
acc = accuracy_score(y_test, y_pred)
print("Accuracy on MNIST subset: %.2f%%" % (acc * 100))


Downloading MNIST dataset...
Running KNN...
