In [2]:
from collections import Counter

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

## Prepare data

In [10]:
# Read the CSV with header=None so the first line of the file is treated as
# data and the columns receive integer labels (0, 1, 2, ...).
df = pd.read_csv('../datasets/mnist_test.csv', header=None)
# Bare trailing expression: the notebook renders the frame as the cell output.
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,775,776,777,778,779,780,781,782,783,784
0,7,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9996,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9997,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9998,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
# Feature matrix: keep every 8th column starting at column label 1 (column 0
# is extracted separately as the label vector). `.loc` slices by label, so the
# end label 785 would be included if it existed; the displayed frame's last
# column label is 784.
X = df.loc[:, 1:785:8].to_numpy()
# Bare trailing expression: show the resulting array as the cell output.
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [19]:
# Label vector: the column labelled 0, converted to a NumPy array.
y = df[0].to_numpy()
# Bare trailing expression: show the labels as the cell output.
y

array([7, 2, 1, ..., 4, 5, 6])

In [21]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle (and therefore the split) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=32)

## Define model

In [44]:
class KNearestNeighbors:
    """Brute-force k-nearest-neighbours classifier.

    ``fit`` stores the training set; ``predict`` ranks every stored sample by
    its distance to the query and returns the majority label among the ``k``
    closest ones.

    Parameters
    ----------
    k : int
        Number of neighbours that vote; must be at least 1. If ``k`` exceeds
        the number of training samples, every training sample votes.
    distance_metric : {'euclidean', 'cosine', 'manhattan'}, default 'euclidean'
        Distance used to rank the training samples.

        * ``'euclidean'``: ``sqrt(sum((a - b) ** 2))``
        * ``'manhattan'``: ``sum(|a - b|)``
        * ``'cosine'``: ``1 - |cos(a, b)|``. The absolute value means that
          vectors pointing in opposite directions are treated as identical.
          A vector with zero norm is given similarity 0 (distance 1) instead
          of producing NaN.

    Raises
    ------
    ValueError
        If ``k < 1`` or ``distance_metric`` is not one of the supported names.

    Notes
    -----
    Tie-breaking: training samples at equal distance are ordered by ascending
    label; when several labels receive the same number of votes, the label
    whose first voter is nearest to the query wins.
    """

    _METRICS = ('euclidean', 'cosine', 'manhattan')

    def __init__(self, k, distance_metric='euclidean'):
        if k < 1:
            raise ValueError(f"k must be at least 1, got {k!r}")
        if distance_metric not in self._METRICS:
            raise ValueError(
                f"unknown distance_metric {distance_metric!r}; "
                f"expected one of {self._METRICS}"
            )
        self.k = k
        self.distance_metric = distance_metric
        # Populated by fit(); None marks the model as not fitted yet.
        self.X = None
        self.y = None

    def fit(self, X, y):
        """Store the training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training feature vectors. Stored as floats so that differences of
            unsigned integers (e.g. uint8 pixels) cannot wrap around.
        y : array-like of shape (n_samples,)
            Training labels.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, ``y`` is not 1-D, the two have different
            lengths, or the training set is empty.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError(f"X must be 2-dimensional, got shape {X.shape}")
        if y.ndim != 1:
            raise ValueError(f"y must be 1-dimensional, got shape {y.shape}")
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                f"X and y have different lengths: {X.shape[0]} != {y.shape[0]}"
            )
        if X.shape[0] == 0:
            raise ValueError("cannot fit on an empty training set")
        self.X = X
        self.y = y

    def _distances(self, x):
        """Return the distance from query vector ``x`` to every training sample."""
        metric = self.distance_metric
        if metric == 'euclidean':
            return np.sqrt(np.sum(np.square(self.X - x), axis=1))
        if metric == 'manhattan':
            return np.sum(np.abs(self.X - x), axis=1)
        if metric == 'cosine':
            dots = self.X @ x
            norms = np.linalg.norm(self.X, axis=1) * np.linalg.norm(x)
            # Entries with a zero norm keep similarity 0 rather than 0/0 = NaN,
            # which would make the neighbour ranking ill-defined.
            similarity = np.zeros_like(dots)
            np.divide(dots, norms, out=similarity, where=norms > 0)
            return 1 - np.abs(similarity)
        # Only reachable if the attribute was changed after construction.
        raise ValueError(f"unknown distance_metric {metric!r}")

    def _predict_one(self, x):
        """Return the majority label among the k nearest neighbours of ``x``."""
        distances = self._distances(x)
        # lexsort's last key is the primary one: rank by distance and break
        # distance ties by ascending label.
        nearest = np.lexsort((self.y, distances))[:self.k]
        votes = Counter(self.y[nearest])
        # Counter keeps first-seen order and max() returns the first maximum,
        # so a vote tie goes to the label seen closest to the query.
        return max(votes.items(), key=lambda item: item[1])[0]

    def predict(self, X):
        """Predict the label of one sample or of a batch of samples.

        Parameters
        ----------
        X : array-like of shape (n_features,) or (n_queries, n_features)
            A single feature vector, or a 2-D batch of feature vectors.

        Returns
        -------
        label or numpy.ndarray
            The predicted label for a 1-D input; an array of predicted labels
            (one per row) for a 2-D input.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        ValueError
            If ``X`` is neither 1-D nor 2-D, or its feature count differs from
            the training data.
        """
        if self.X is None or self.y is None:
            raise RuntimeError("fit() must be called before predict()")
        queries = np.asarray(X, dtype=float)
        if queries.ndim not in (1, 2):
            raise ValueError(
                f"X must be 1- or 2-dimensional, got shape {queries.shape}"
            )
        if queries.shape[-1] != self.X.shape[1]:
            raise ValueError(
                f"X has {queries.shape[-1]} features, "
                f"but the model was fitted with {self.X.shape[1]}"
            )
        if queries.ndim == 1:
            return self._predict_one(queries)
        return np.array(
            [self._predict_one(query) for query in queries], dtype=self.y.dtype
        )

## Evaluate

In [50]:
# Neighbourhood size: integer part of the square root of the number of
# training rows.
clf = KNearestNeighbors(k=int(np.sqrt(len(X_train))))
clf.fit(X_train, y_train)

In [54]:
# Classify the held-out rows one at a time, collecting the predicted labels.
y_pred = list(map(clf.predict, X_test))

In [58]:
# Fraction of held-out samples whose predicted label equals the true label;
# left as the bare last expression so the notebook displays the value.
accuracy_score(y_test, y_pred)

0.8145