In [None]:
import numpy as np
import scipy.stats

from abc import ABC, abstractmethod
from typing import List, Tuple

In [None]:
# Single seed shared by the synthetic-data generators and the train/test
# splits below, so the datasets and splits are the same on every run.
RANDOM_STATE = 42

## Define KNNRegressor and KNNClassifier models

In [None]:
class KNNModel(ABC):
  """
  Brute-force k-nearest-neighbors base class.

  `fit` memorizes the training set. `predict` finds, for every query
  sample, the `n_neighbors` closest training samples (Euclidean distance)
  and delegates the aggregation of their targets to the subclass hook
  `_get_prediction_for_sample`.
  """

  def __init__(self, n_neighbors: int):
    """
    Args:
      n_neighbors: number of nearest training samples used per prediction.

    Raises:
      ValueError: if `n_neighbors` is not a positive integer.
    """
    # A zero or negative k would make the slice in `get_neighbors` return
    # the wrong set of points instead of failing, so reject it up front.
    if not isinstance(n_neighbors, (int, np.integer)) or n_neighbors < 1:
      raise ValueError(
        f"n_neighbors must be a positive integer, got {n_neighbors!r}"
      )
    self.n_neighbors = n_neighbors

  def __repr__(self):
    # `fit` returns `self`; a stable repr keeps notebook output free of
    # memory addresses.
    return f"{type(self).__name__}(n_neighbors={self.n_neighbors})"

  def fit(self, X: np.ndarray, y: np.ndarray):
    """
    Memorizes the training samples.

    Args:
      X: training samples, one per row.
      y: target of each training sample, in the same order as `X`.

    Returns:
      The fitted model itself, so calls can be chained.

    Raises:
      ValueError: if `X` and `y` do not contain the same number of samples.
    """
    # Convert to arrays so that lists and other array-likes support the
    # integer-list indexing the subclasses perform on `self.y`.
    X = np.asarray(X)
    y = np.asarray(y)
    if len(X) != len(y):
      raise ValueError(
        f"X and y must have the same number of samples, "
        f"got {len(X)} and {len(y)}"
      )

    self.X = X
    self.y = y
    return self

  @abstractmethod
  def _get_prediction_for_sample(self, k_nearest_neighbors: list):
    """
    Turns the training-set indices of the nearest neighbors of one sample
    into a single prediction. Implemented by the subclasses.
    """
    pass

  def predict(self, X: np.ndarray):
    """
    Predicts label for new samples.

    Args:
      X: samples to predict, one per row.

    Returns:
      Array with one prediction per sample of `X`.
    """
    predictions = [
      self._get_prediction_for_sample(self.get_neighbors(curr_sample))
      for curr_sample in np.asarray(X)
    ]

    return np.array(predictions)


  def get_distance(self, point_a: np.ndarray, point_b: np.ndarray):
    """
    Helper function to calculate the Euclidean distance between two samples:

    dist(a, b) = sqrt(sum_i((a_i - b_i)^2))
    """
    return np.sum((point_a - point_b) ** 2) ** .5

  def get_neighbors(self, new_sample: np.ndarray) -> List[int]:
    """
    Helper function to find and rank neighbors by distance.

    1. Calculate distances between the new sample and every training point.
    2. Rank training points by distance.
    3. Return indices of the `self.n_neighbors` closest
      points to the new sample (fewer if the training set is smaller).
    """

    # 1. Distance from the new sample to every memorized training point.
    distances = [self.get_distance(new_sample, seen) for seen in self.X]

    # 2. Rank training indices by distance. `sorted` is stable, so points
    #    at equal distance keep their training-set order.
    ranked = sorted(range(len(distances)), key=distances.__getitem__)

    # 3. Keep the indices of the `self.n_neighbors` closest neighbors.
    return ranked[:self.n_neighbors]


In [None]:
class KNNRegressor(KNNModel):
  """
  k-nearest-neighbors regressor: predicts the mean target of the nearest
  training samples.
  """

  def _get_prediction_for_sample(self, k_nearest_neighbors: list):
    """
    Averages the targets of the given training-set indices.

    Args:
      k_nearest_neighbors: indices into `self.y` of the nearest neighbors.

    Returns:
      Mean of the selected targets.

    Raises:
      ValueError: if no neighbor indices are given.
    """
    k_y_values = self.y[k_nearest_neighbors]
    if len(k_y_values) == 0:
      raise ValueError("cannot predict: no neighbors available")

    # Divide by the number of neighbors actually found: it is smaller than
    # `self.n_neighbors` when the training set has fewer samples than that,
    # and dividing by `self.n_neighbors` would then shrink the mean.
    return sum(k_y_values) / len(k_y_values)


In [None]:
class KNNClassifier(KNNModel):
  """
  k-nearest-neighbors classifier: predicts the most common label among the
  nearest training samples.
  """

  def _get_prediction_for_sample(self, k_nearest_neighbors: list):
    """
    Majority vote over the labels of the given training-set indices.

    The vote is counted with `np.unique` rather than `scipy.stats.mode`:
    the shape of `scipy.stats.mode(...)[0]` (length-1 array vs. scalar)
    depends on the installed SciPy version, and newer SciPy versions reject
    non-numeric labels. Assumes one label per sample (1-D `self.y`).

    Args:
      k_nearest_neighbors: indices into `self.y` of the nearest neighbors.

    Returns:
      The most frequent label; on a tie, the smallest of the tied labels.

    Raises:
      ValueError: if no neighbor indices are given.
    """
    k_y_values = self.y[k_nearest_neighbors]
    if len(k_y_values) == 0:
      raise ValueError("cannot predict: no neighbors available")

    # `np.unique` returns the labels sorted, and `argmax` picks the first
    # maximum, so ties resolve to the smallest label.
    labels, counts = np.unique(k_y_values, return_counts=True)
    return labels[np.argmax(counts)]

## Regression

### Prepare data

In [None]:
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

# Synthetic single-target regression problem:
# 500 samples, 20 features of which 15 are informative.
reg_X, reg_y = make_regression(
  n_samples=500,
  n_features=20,
  n_informative=15,
  n_targets=1,
  random_state=RANDOM_STATE,
)

# Hold out 30% of the samples for evaluation.
reg_X_train, reg_X_test, reg_y_train, reg_y_test = train_test_split(
  reg_X,
  reg_y,
  test_size=0.3,
  random_state=RANDOM_STATE,
)

In [None]:
# Sanity check of the split: test_size=.3 on 500 samples should give
# 350 training rows and 150 test rows, each with 20 features.
reg_X_train.shape, reg_X_test.shape

((350, 20), (150, 20))

### Define model

In [None]:
# k = 10: each prediction aggregates the targets of the 10 closest training samples.
regressor = KNNRegressor(n_neighbors=10)

In [None]:
# Fitting only memorizes the training data; the distance computations happen in predict().
regressor.fit(reg_X_train, reg_y_train)

### Evaluate performance

In [None]:
from sklearn.metrics import mean_squared_error

reg_y_pred = regressor.predict(reg_X_test)

# Root mean squared error on the held-out split. The square root is taken
# explicitly because the `squared=False` argument of `mean_squared_error`
# was deprecated in scikit-learn 1.4 and removed in 1.6.
print(np.sqrt(mean_squared_error(reg_y_test, reg_y_pred)))

139.12043248353768


## Classification

### Prepare data

In [None]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Synthetic classification problem:
# 500 samples, 20 features of which 15 are informative.
clf_X, clf_y = make_classification(
  n_samples=500,
  n_features=20,
  n_informative=15,
  random_state=RANDOM_STATE,
)

# Hold out 30% of the samples for evaluation.
clf_X_train, clf_X_test, clf_y_train, clf_y_test = train_test_split(
  clf_X,
  clf_y,
  test_size=0.3,
  random_state=RANDOM_STATE,
)

In [None]:
# Sanity check of the split: test_size=.3 on 500 samples should give
# 350 training rows and 150 test rows, each with 20 features.
clf_X_train.shape, clf_X_test.shape

((350, 20), (150, 20))

### Define model

In [None]:
# k = 10: each prediction is the most common label among the 10 closest training samples.
classifier = KNNClassifier(n_neighbors=10)

In [None]:
# Fitting only memorizes the training data; the distance computations happen in predict().
classifier.fit(clf_X_train, clf_y_train)

### Evaluate performance

In [None]:
from sklearn.metrics import f1_score

# Predict the held-out samples, then score them against the true labels.
clf_y_pred = classifier.predict(clf_X_test)

print(f1_score(y_true=clf_y_test, y_pred=clf_y_pred))

0.835820895522388
