In [2]:
import pdb
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

from sklearn import metrics
from sklearn.datasets import load_breast_cancer

#### A simple but inefficient implementation of KNN

In [15]:
class SimpleKNN:
    def __init__(self, k=3):
        self.k = k

    def fit(self, X_train, y_train):
        """The fitting process just naively storing the entire training set."""
        self.X_train = X_train
        self.y_train = y_train

    def predict(self, X_test):
        """
        The prediction is to find the closest k points 
        for each test data; then find the majority label 
        of the k points
        """
        ## X_train: (n, m)
        ## X_test:  (h, m)
        ## Broadcasting by adding new axese to X_train and X_test
        ## X_train: (1, n, m)
        ## X_test:  (h, 1, m)
        ## as a result, X_test - X_train becomes: (h, 1, m) - (1, n, m) -> (h, n, m)
        ## Compute the distance across the last axis, i.e., np.linalg.norm((h, n, m), axis=2), 
        ## which gives us an array of dimension (h, n), where the (i,j)-th entry is the distance 
        ## between the i-th test data and the j-th training data
        dist = np.linalg.norm((X_test[:, np.newaxis, :] - X_train[np.newaxis, :, :])**2, axis=2)

        ## find the indices of the k nearest neighbors of each test data
        idx = np.argsort(dist, axis=1)[:, :self.k]

        ## find the labels of the k nearest neighbors 
        ## idx: (h, k)
        ## y_train: (n, )
        ## y_train[idx]: (h, k)
        knn_labels = self.y_train[idx]

        ## find the majority of the labels of the k nearest neighbors
        preds = np.apply_along_axis(lambda x: np.bincount(x).argmax(), axis=1, arr=knn_labels)
        return preds

In [16]:
## Run the simple KNN on a cancer dataset
cancer_data = load_breast_cancer()
X, y = cancer_data.data, cancer_data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = SimpleKNN(k=5)
clf.fit(X_train, y_train)
y_preds = clf.predict(X_test)

## calculate the accuracy on test data
print(np.count_nonzero(y_preds == y_test) / len(y_test))

0.9473684210526315
