<h2>This Notebook contains an implementation of K-nearest neighbors in python</h2>
<p> Feel free to download and play around with it, or rebuild it.</p>


<h2> 1. Implementation by Hand. </h2>

In [None]:
# Usual suspects
import numpy as np
import pandas as pd

import kagglehub
kagglehub.login()

In [9]:
# Download dataset from kaggle on google colab
kagglehub.dataset_download("bulentsiyah/knearest-neighbour-knn-classification")

Using Colab cache for faster access to the 'knearest-neighbour-knn-classification' dataset.


'/kaggle/input/knearest-neighbour-knn-classification'

In [144]:
# Load the dataset we downloaded from kaggle
df = pd.read_csv('/kaggle/input/knearest-neighbour-knn-classification/data.csv')
df_train = df[:int(0.9 * len(df))]
df_test = df[int(0.9 * len(df)):]
df.head(5)

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [145]:
from dataclasses import dataclass, field
# All of the input features are numeric and target is category (M or B)
print(df.select_dtypes(include="number").shape[1], df.shape[1])
# Binary categorization let M = 1 B = 0
print(set(df.select_dtypes(include="object").squeeze().values))

@dataclass
class Dataset:
    X : np.ndarray = field(default_factory = lambda: np.empty(0))
    y : np.ndarray = field(default_factory = lambda: np.empty(0))

    def __getitem__(self, index):
        return

dataset = Dataset()

dataset.X = np.delete(df_train.drop('diagnosis', axis = 1).to_numpy(), -1, axis = 1)
dataset.y = (df_train['diagnosis'] == 'M').to_numpy()

32 33
{'B', 'M'}


In [127]:
# Implement a distance metric
def euclidean_distance(p1 : np.ndarray, p2 : np.ndarray) -> np.float64:
    if (p1.ndim == 1):
        p1 = p1.reshape(1,-1)
    if (p2.ndim == 1):
        p2 = p2.reshape(1,-1)
    #Input = (case, 1, features) - (1, total samples,features)
    #Otput = (case, total_samples)
    return np.sqrt(((p1[:, None, :] - p2[None, :, :]) ** 2).sum(axis = 2))

In [177]:
class KNNClassifier:
    def __init__(self) -> None:
        return

    def fit(self, dataset : Dataset, k) -> None:
        self.dataset = dataset
        self.k = k

    def predict(self, point : np.ndarray) -> str:
        # Calculate distances
        distances = euclidean_distance(point, dataset.X)
        # Sort distances but get the indices of the pints and pick only the closes k
        # For each sample
        indexes_of_closest_points = np.argsort(distances, axis = 1)[:, :self.k]
        # Get the labels of the closest points
        labels_of_closest_points = self.dataset.y[indexes_of_closest_points].astype(float)
        # Check the majority category
        sum_of_labels = labels_of_closest_points.sum(axis = 1)
        target_labels = (sum_of_labels > self.k / 2)
        # Return the result in proper format if you want
        #target_labels = np.where(target_labels, 'M', 'B')
        return target_labels

knn = KNNClassifier()
knn.fit(dataset, 5)


def evaluate(y_true : np.ndarray, y_preds : np.ndarray) -> np.float64:
    return np.sum(y_true == y_preds).astype(float) / (len(y_true))


In [176]:
dataset_test = Dataset()
dataset_test.X = np.delete(df_test.drop('diagnosis', axis = 1).to_numpy(), -1, axis = 1)
dataset_test.y = (df_test['diagnosis'] == 'M').to_numpy()

y_preds = knn.predict(dataset_test.X)
print(f'Accuracy: {evaluate(dataset_test.y, y_preds)}')

Accuracy: 0.7543859649122807
