# kNN implementation

Inference complexity (Big-O): O(nm + k log n) per test point, where n is the number of training observations, m is the number of features, and k is the number of neighbors.

In [22]:
import numpy as np
from scipy.spatial.distance import cdist
from heapq import heapify, heappop
from collections import Counter

In [33]:
class kNearestNeighbor():
    """Brute-force k-nearest-neighbour predictor using Euclidean distance.

    The model is lazy: nothing is fitted ahead of time, and the training
    data is supplied on every call to ``predict``.

    Attributes:
        k: Number of nearest training points consulted per test point.
        type: Either ``'classification'`` (majority vote of the neighbours'
            targets) or ``'regression'`` (mean of the neighbours' targets).
    """

    def __init__(self, k, tp):
        self.k = k
        self.type = tp

    def predict(self, X_test, X_train, y_train):
        """Predict a target for every row of ``X_test``.

        Args:
            X_test: 2-D array-like of test points, one row per point.
            X_train: 2-D array-like of training points, one row per point.
            y_train: Targets indexable by training-row position.

        Returns:
            A list with one prediction per test row, in input order.

        Raises:
            ValueError: If ``self.type`` is not ``'classification'`` or
                ``'regression'``, or if ``self.k`` is not between 1 and the
                number of training points.
        """
        if self.type not in ('classification', 'regression'):
            raise ValueError(
                f"type must be 'classification' or 'regression', got {self.type!r}"
            )

        # Shape: (number of test points) x (number of training points).
        dist_matrix = cdist(X_test, X_train, 'euclidean')
        num_train = dist_matrix.shape[1]
        if not 1 <= self.k <= num_train:
            raise ValueError(
                f"k must be between 1 and the number of training points "
                f"({num_train}), got {self.k!r}"
            )

        y_test = []
        for row in dist_matrix:
            # heapify is O(n); each of the k pops is O(log n).
            # Equal distances are ordered by the smaller training index.
            heap = [(dist, idx) for idx, dist in enumerate(row)]
            heapify(heap)
            preds = [y_train[heappop(heap)[1]] for _ in range(self.k)]

            if self.type == 'classification':
                # On a tied vote the label seen first (i.e. belonging to the
                # nearer neighbour) wins, because Counter keeps insertion order.
                pred = Counter(preds).most_common(1)[0][0]
            else:
                # preds is a plain list, so it has no .mean() method.
                pred = np.mean(preds)

            y_test.append(pred)

        return y_test

# Test on the scikit-learn Iris dataset

In [6]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Load the Iris data as a (features, targets) pair instead of a Bunch object.
X, y = load_iris(return_X_y=True)
# Hold out 25% of the samples for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [34]:
# Classifier that takes a majority vote among the 5 nearest training points.
kNN = kNearestNeighbor(5, 'classification')
# The model is lazy: training data is passed in at prediction time.
predictions = kNN.predict(X_test, X_train, y_train)

In [37]:
# Accuracy is the share of held-out labels that the classifier reproduced.
num_correct = sum(y_test == predictions)
acc = num_correct / len(y_test)
print(f'accuracy on the test set is {acc}')

accuracy on the test set is 1.0
