# Laboratory 4

In [1]:
import numpy as np

In [2]:
import pandas as pd

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

## Load data and train/test split

In [4]:
# The CSV has no header row, so pandas labels the columns 0-4.
# Column 4 (the species name) is parsed as a categorical so it can be
# turned into integer class codes later on.
df = pd.read_csv(
    '../datasets/iris.csv',
    header=None,
    dtype={4: 'category'},
)
df

Unnamed: 0,0,1,2,3,4
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [5]:
# Feature matrix: the four numeric measurement columns (labels 0 through 3).
X = df[[0, 1, 2, 3]].to_numpy()
X[:5]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2]])

In [6]:
# Target vector: integer codes of the categorical species column (label 4).
y = df[4].cat.codes.to_numpy()
y[:5]

array([0, 0, 0, 0, 0], dtype=int8)

In [7]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=32,
)

## Implement k-nearest neighbors (KNN)

In [8]:
class KNearestNeighbors:
    """Minimal k-nearest-neighbours classifier that labels one sample at a time.

    Parameters
    ----------
    k : int
        Number of nearest training samples that vote on the label.
    distance_metric : str, default 'euclidean'
        Name of the distance function: 'euclidean', 'cosine' or 'manhattan'.
    """

    def __init__(self, k, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        """Store the training samples ``X`` and their labels ``y`` for later lookups."""
        self.X = X
        self.y = y

    def predict(self, X):
        """Return the most common label among the ``k`` stored samples closest to ``X``.

        ``X`` is a single sample (one feature vector), not a batch; call this
        once per sample to classify a whole test set.

        Raises
        ------
        KeyError
            If ``distance_metric`` is not one of the supported names.
        """
        from collections import Counter

        distance_metrics = {
            'euclidean': lambda x1, x2: np.sqrt(np.sum(np.square(x1 - x2))),
            # NOTE: the abs() makes vectors pointing in opposite directions count
            # as identical, unlike the textbook 1 - cos(theta). Kept as written;
            # it makes no difference for non-negative features.
            'cosine': lambda x1, x2: 1 - np.abs(np.dot(x1, x2)/(np.linalg.norm(x1)*np.linalg.norm(x2))),
            'manhattan': lambda x1, x2: np.sum(np.abs(x1 - x2))
        }
        distance = distance_metrics[self.distance_metric]

        # Compare against the data handed to fit(), not notebook-level globals,
        # so the classifier works with whatever training set it was given.
        neighbours = sorted(
            (distance(X, sample), label) for sample, label in zip(self.X, self.y)
        )[:self.k]

        votes = Counter(label for _, label in neighbours)
        # max() keeps the first maximal entry, so a tied vote goes to the label
        # that appears earliest among the sorted neighbours.
        return max(votes.items(), key=lambda item: item[1])[0]

## Evaluate distance metrics on Iris

In [9]:
# Euclidean distance (the default metric), with k set to the square root of
# the full dataset size.
clf = KNearestNeighbors(
    k=int(np.sqrt(len(X))),
    distance_metric='euclidean',
)
clf.fit(X_train, y_train)

# predict() labels one sample at a time, so map it over the test set.
accuracy_score(y_test, list(map(clf.predict, X_test)))

1.0

In [10]:
# Same setup, but neighbours are ranked by cosine distance.
clf = KNearestNeighbors(
    k=int(np.sqrt(len(X))),
    distance_metric='cosine',
)
clf.fit(X_train, y_train)
accuracy_score(y_test, list(map(clf.predict, X_test)))

0.9666666666666667

In [11]:
# Same setup, but neighbours are ranked by Manhattan (L1) distance.
clf = KNearestNeighbors(
    k=int(np.sqrt(len(X))),
    distance_metric='manhattan',
)
clf.fit(X_train, y_train)
accuracy_score(y_test, list(map(clf.predict, X_test)))

1.0