In [11]:
#Name: Hien Minh Quan Duong
# I performed cross-validation by holding out 20% of the training data as a validation set and trying values of K from 1 to 21. K=7 yielded the highest validation accuracy, 90.00%.
#The accuracy you got: 90.0%

import csv
import math

# Calculate Euclidean distance between two vectors
def euclidean_distance(vector1, vector2):
    """Return the Euclidean (L2) distance between two numeric sequences.

    Components are paired positionally with ``zip``, so when the inputs
    differ in length the trailing components of the longer one are ignored.
    Two empty sequences are at distance 0.0.
    """
    # Per-component differences first, then the root of their squared sum.
    deltas = [a - b for a, b in zip(vector1, vector2)]
    return math.sqrt(sum(delta ** 2 for delta in deltas))

# Perform weighted KNN classification
def weighted_knn(train_data, test_sample, k):
    """Classify ``test_sample`` with inverse-distance-weighted k-nearest neighbours.

    Every row (each training row and ``test_sample`` itself) is a sequence
    whose first element is the class label and whose remaining elements are
    the feature values. The label stored in ``test_sample[0]`` is ignored.

    Args:
        train_data: Iterable of labelled training rows.
        test_sample: Row to classify.
        k: Number of nearest neighbours allowed to vote. If ``k`` exceeds the
            number of training rows, all training rows vote.

    Returns:
        The label with the largest summed weight, where each of the ``k``
        nearest neighbours contributes ``1 / distance`` (or infinity for an
        exact match). On a tie, the label that appears first among the
        neighbours sorted by distance wins.

    Raises:
        ValueError: If ``k`` is less than 1 or ``train_data`` is empty, since
            no neighbour could vote.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")

    # The query features do not change per training row, so slice them once
    # instead of copying them on every loop iteration.
    query_vector = test_sample[1:]
    distances = [
        (train_sample[0], euclidean_distance(query_vector, train_sample[1:]))
        for train_sample in train_data
    ]
    if not distances:
        raise ValueError("train_data must contain at least one sample")

    # Sort distances in ascending order (stable, so equal distances keep
    # their training-set order).
    distances.sort(key=lambda pair: pair[1])

    # Weighted voting. Slicing clamps k to the number of available
    # neighbours instead of indexing past the end of the list.
    vote_counts = {}
    for class_label, distance in distances[:k]:
        # An exact match gets infinite weight so it always dominates.
        weight = float('inf') if distance == 0 else 1.0 / distance
        vote_counts[class_label] = vote_counts.get(class_label, 0) + weight

    return max(vote_counts.items(), key=lambda item: item[1])[0]


def _load_labeled_rows(path):
    """Read a CSV file of integers into a list of rows, skipping the header.

    The first line of the file is treated as a header and discarded. Every
    remaining non-blank line is converted to a list of ``int``. An empty
    file yields an empty list.

    Raises:
        ValueError: If a cell cannot be parsed as an integer.
        OSError: If the file cannot be opened.
    """
    # newline='' is what the csv module requires of file objects it reads.
    with open(path, 'r', newline='') as file:
        reader = csv.reader(file)
        next(reader, None)  # skip the header; tolerate a completely empty file
        # csv.reader yields [] for blank lines; drop them so that every row
        # has a first column.
        return [list(map(int, row)) for row in reader if row]


# Column 0 of each row is used as the class label by the code that consumes
# these lists; the remaining columns are the feature values.
train_data = _load_labeled_rows('MNIST_train.csv')
test_data = _load_labeled_rows('MNIST_test.csv')

# Value of k: number of nearest neighbours that vote for each test sample.
k = 7
print(f"\nK = {k}")

# Evaluate on test data: classify every test row and count the mismatches
# against the label stored in column 0.
num_test_samples = len(test_data)
num_misclassified = 0
for test_sample in test_data:
    desired_class = test_sample[0]
    computed_class = weighted_knn(train_data, test_sample, k)
    print(f"Desired class: {desired_class} computed class: {computed_class}")
    if desired_class != computed_class:
        num_misclassified += 1

# Calculate accuracy as the fraction of correctly classified samples.
# An empty test set is reported as 0.0 instead of raising ZeroDivisionError;
# the total printed below makes the empty case visible.
if num_test_samples:
    accuracy = (num_test_samples - num_misclassified) / num_test_samples
else:
    accuracy = 0.0

print(f"Accuracy rate: {accuracy * 100:.1f}%")
print(f"Number of misclassified test samples: {num_misclassified}")
print(f"Total number of test samples: {num_test_samples}")



K = 7
Desired class: 0 computed class: 0
Desired class: 0 computed class: 0
Desired class: 0 computed class: 0
Desired class: 0 computed class: 0
Desired class: 0 computed class: 0
Desired class: 1 computed class: 1
Desired class: 1 computed class: 1
Desired class: 1 computed class: 1
Desired class: 1 computed class: 1
Desired class: 1 computed class: 1
Desired class: 2 computed class: 8
Desired class: 2 computed class: 2
Desired class: 2 computed class: 2
Desired class: 2 computed class: 6
Desired class: 2 computed class: 2
Desired class: 3 computed class: 9
Desired class: 3 computed class: 3
Desired class: 3 computed class: 3
Desired class: 3 computed class: 3
Desired class: 3 computed class: 3
Desired class: 4 computed class: 4
Desired class: 4 computed class: 4
Desired class: 4 computed class: 4
Desired class: 4 computed class: 4
Desired class: 4 computed class: 9
Desired class: 5 computed class: 5
Desired class: 5 computed class: 6
Desired class: 5 computed class: 5
Desired class