# KNN - K Nearest Neighbours

Importing libraries

In [None]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from PIL import Image
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold
from ipynb.fs.defs.additional_metrics import *

Defining helper functions that run .fit() and .predict() and record their execution time

In [None]:
def knn_fit_with_time(knn, X_trainCV, y_trainCV, time_fit_tmp):
    """Fit ``knn`` on one training fold and record how long the fit took.

    Parameters
    ----------
    knn : object exposing ``fit(X, y)``
        The estimator to train; it is fitted in place.
    X_trainCV, y_trainCV
        Training features and labels, forwarded to ``fit`` unchanged.
    time_fit_tmp : list
        The elapsed time in seconds is appended to this list (mutated in place).

    Returns
    -------
    None
    """
    # perf_counter() is monotonic and high-resolution, so the measured
    # duration cannot be distorted by system clock adjustments.
    start = time.perf_counter()
    knn.fit(X_trainCV, y_trainCV)
    time_fit_tmp.append(time.perf_counter() - start)

In [None]:
def knn_predict_with_time(knn, X_testCV, time_pred_tmp):
    """Predict labels for one validation fold and record how long it took.

    Parameters
    ----------
    knn : object exposing ``predict(X)``
        An already fitted estimator.
    X_testCV
        Features to predict, forwarded to ``predict`` unchanged.
    time_pred_tmp : list
        The elapsed time in seconds is appended to this list (mutated in place).

    Returns
    -------
    Whatever ``knn.predict(X_testCV)`` returns.
    """
    # perf_counter() is monotonic and high-resolution, so the measured
    # duration cannot be distorted by system clock adjustments.
    start = time.perf_counter()
    y_pred = knn.predict(X_testCV)
    time_pred_tmp.append(time.perf_counter() - start)
    return y_pred

## Training on 30% of the samples from the original training dataset

In [None]:
# Load the train/test CSVs: the first column is used as the class label,
# every remaining column as a feature.
train_data = pd.read_csv("../datasets/fashion-mnist_train.csv")
X_train = train_data.iloc[:, 1:]
y_train = train_data.iloc[:, 0]

test_data = pd.read_csv("../datasets/fashion-mnist_test.csv")
X_test = test_data.iloc[:, 1:]
y_test = test_data.iloc[:, 0]

# Standardise BOTH sets with statistics estimated on the training set only.
# Scaling the test set with its own mean/std would put train and test
# features on different scales and leak test-set information.
# NOTE(review): relies on both CSVs using the same feature column names,
# because pandas aligns the statistics on column labels - confirm.
train_mean = X_train.mean()
# A constant column has std == 0; divide by 1 instead so it becomes 0, not NaN.
train_std = X_train.std().replace(0, 1)

X_train_std = (X_train - train_mean) / train_std
X_test_std = (X_test - train_mean) / train_std

In [None]:
# Stratified hold-out split of the standardised training data:
# 30% goes to the *_train_30 variables, the remaining 70% to *_test_30.
X_train_30, X_test_30, y_train_30, y_test_30 = train_test_split(
    X_train_std,
    y_train,
    test_size=0.7,
    stratify=y_train,
    random_state=42,
)

In [51]:
# Grid search over k, distance metric and neighbour weighting, scored by
# 5-fold stratified cross-validation on the 30% training subset.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
acc = []
i, p_val, best_acc, best_k = 0, 0, 0, 0
# Initialised up front so the summary line cannot hit an undefined name.
best_m, best_w = None, None

for k in range(1, 11):
    for m in ['euclidean', 'manhattan', 'chebyshev', 'minkowski']:
        for w in ['uniform', 'distance']:

            acc_tmp, time_fit_tmp, time_pred_tmp = [], [], []

            # 'minkowski' is evaluated with p = 3; every other metric gets p = 2.
            p_val = 3 if m == 'minkowski' else 2

            for train_index, test_index in kf.split(X_train_30, y_train_30):

                X_trainCV1 = X_train_30.iloc[train_index, :]
                y_trainCV1 = y_train_30.iloc[train_index]
                X_testCV1 = X_train_30.iloc[test_index, :]
                y_testCV1 = y_train_30.iloc[test_index]

                knn1 = KNeighborsClassifier(n_neighbors=k, metric=m, p=p_val, weights=w, n_jobs=20)

                knn_fit_with_time(knn1, X_trainCV1, y_trainCV1, time_fit_tmp)
                y_pred1 = knn_predict_with_time(knn1, X_testCV1, time_pred_tmp)

                # Accuracy = correctly classified (diagonal) / all samples.
                c1 = confusion_matrix(y_testCV1, y_pred1)
                acc_tmp.append(np.trace(c1) / c1.sum())

            mean_acc = np.mean(acc_tmp)
            print(f"{i+1}. K = {k}, m = {m}, w = {w}, Average accuracy: {mean_acc:.6f} | Average fit time: {np.mean(time_fit_tmp):.6f}s | Average predict time: {np.mean(time_pred_tmp):.6f}s")
            acc.append(mean_acc)
            i += 1

            # Strict '>' keeps the first configuration that reaches the maximum,
            # which is also the one np.argmax reports below.
            if mean_acc > best_acc:
                best_acc = mean_acc
                best_k, best_m, best_w = k, m, w

    print('')

print('-------------------')
# The iterations above are printed 1-based, so shift the 0-based argmax by one.
print('Best accuracy is in iteration number:', np.argmax(acc) + 1, ' | for k =', best_k, ',m =', best_m, ',w =', best_w)

#### Checking if different algorithms speed up the process

In [None]:
# Compare the tree-based neighbour search algorithms for the selected
# hyper-parameters and keep the one with the lowest average fit time.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# The grid search evaluated 'minkowski' with p = 3; forward the same p here,
# otherwise the classifier would fall back to its default p.
best_p = 3 if best_m == 'minkowski' else 2

# Start from +inf so the first measured algorithm always becomes the baseline.
best_time_fit = float('inf')
best_a = None

for a in ['ball_tree', 'kd_tree']:

    acc_tmp, time_fit_tmp, time_pred_tmp = [], [], []

    for train_index, test_index in kf.split(X_train_30, y_train_30):

        X_trainCV1 = X_train_30.iloc[train_index, :]
        y_trainCV1 = y_train_30.iloc[train_index]
        X_testCV1 = X_train_30.iloc[test_index, :]
        y_testCV1 = y_train_30.iloc[test_index]

        knn1 = KNeighborsClassifier(n_neighbors=best_k, algorithm=a, metric=best_m, p=best_p, weights=best_w, n_jobs=20)

        knn_fit_with_time(knn1, X_trainCV1, y_trainCV1, time_fit_tmp)
        y_pred1 = knn_predict_with_time(knn1, X_testCV1, time_pred_tmp)

        # Accuracy = correctly classified (diagonal) / all samples.
        c1 = confusion_matrix(y_testCV1, y_pred1)
        acc_tmp.append(np.trace(c1) / c1.sum())

    mean_fit_time = np.mean(time_fit_tmp)
    print(f"a = {a}, Average accuracy: {np.mean(acc_tmp):.6f} | Average fit time: {mean_fit_time:.6f}s | Average predict time: {np.mean(time_pred_tmp):.6f}s")

    # Lower is better: keep the algorithm with the smallest average fit time.
    # NOTE(review): only fit time is compared; predict time is not considered.
    if mean_fit_time < best_time_fit:
        best_time_fit = mean_fit_time
        best_a = a

print('')
print('-------------------')
print('Fastest average fit time is for a =', best_a)

#### Training on 30% training samples, test on test dataset

In [None]:
# Train on the whole 30% subset with the selected hyper-parameters and
# evaluate once on the held-out test set.

# The grid search evaluated 'minkowski' with p = 3; forward the same p here,
# otherwise the classifier would fall back to its default p.
best_p = 3 if best_m == 'minkowski' else 2

knn1 = KNeighborsClassifier(n_neighbors=best_k, algorithm=best_a, metric=best_m, p=best_p, weights=best_w, n_jobs=20)

# perf_counter() is monotonic, so the durations are safe from clock adjustments.
start1 = time.perf_counter()
knn1.fit(X_train_30, y_train_30)
fit_time = time.perf_counter() - start1

start2 = time.perf_counter()
y_pred2 = knn1.predict(X_test_std)
pred_time = time.perf_counter() - start2

# Fixed label order so the matrix is always 10x10 with rows/columns 0..9.
c2 = confusion_matrix(y_test, y_pred2, labels=[0,1,2,3,4,5,6,7,8,9])

print(f"Accuracy: {np.trace(c2) / c2.sum():.6f} | Average fit time: {fit_time:.6f}s | Average predict time: {pred_time:.6f}s")
print('-------------------')
print('KNN parameters: a =', best_a, ',k =', best_k, ',m =', best_m, ',w =', best_w)

.

## Training on 100% of the samples from the original training dataset

In [None]:
# Load the train/test CSVs: the first column is used as the class label,
# every remaining column as a feature.
train_data = pd.read_csv("../datasets/fashion-mnist_train.csv")
X_train = train_data.iloc[:, 1:]
y_train = train_data.iloc[:, 0]

test_data = pd.read_csv("../datasets/fashion-mnist_test.csv")
X_test = test_data.iloc[:, 1:]
y_test = test_data.iloc[:, 0]

# Standardise BOTH sets with statistics estimated on the training set only.
# Scaling the test set with its own mean/std would put train and test
# features on different scales and leak test-set information.
# NOTE(review): relies on both CSVs using the same feature column names,
# because pandas aligns the statistics on column labels - confirm.
train_mean = X_train.mean()
# A constant column has std == 0; divide by 1 instead so it becomes 0, not NaN.
train_std = X_train.std().replace(0, 1)

X_train_std = (X_train - train_mean) / train_std
X_test_std = (X_test - train_mean) / train_std

In [None]:
# 5-fold stratified cross-validation of the selected configuration on the
# full standardised training set, reporting per-fold and average results.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

acc, acc_tmp, time_fit_tmp, time_pred_tmp = [], [], [], []

# The grid search evaluated 'minkowski' with p = 3; forward the same p here,
# otherwise the classifier would fall back to its default p.
best_p = 3 if best_m == 'minkowski' else 2

# The split only yields row positions, so splitting on the raw frame and
# indexing the standardised frame is consistent.
for train_index, test_index in kf.split(X_train, y_train):

    X_trainCV2 = X_train_std.iloc[train_index, :]
    y_trainCV2 = y_train.iloc[train_index]
    X_testCV2 = X_train_std.iloc[test_index, :]
    y_testCV2 = y_train.iloc[test_index]

    knn2 = KNeighborsClassifier(n_neighbors=best_k, algorithm=best_a, metric=best_m, p=best_p, weights=best_w, n_jobs=20)

    knn_fit_with_time(knn2, X_trainCV2, y_trainCV2, time_fit_tmp)
    y_pred3 = knn_predict_with_time(knn2, X_testCV2, time_pred_tmp)

    # Accuracy = correctly classified (diagonal) / all samples.
    c3 = confusion_matrix(y_testCV2, y_pred3)
    fold_acc = np.trace(c3) / c3.sum()
    acc_tmp.append(fold_acc)

    # The timing lists hold one entry per fold; report the entry just appended
    # (formatting the whole list with ':.6f' raises a TypeError).
    print(f"        Accuracy: {fold_acc:.6f} | Fit time: {time_fit_tmp[-1]:.6f}s | Predict time: {time_pred_tmp[-1]:.6f}s")

print("")
print('-------------------')
print(f"Average accuracy: {np.mean(acc_tmp):.6f} | Average fit time: {np.mean(time_fit_tmp):.6f}s | Average predict time: {np.mean(time_pred_tmp):.6f}s")

##### Training on 100% training samples, test on test dataset

In [None]:
# Train on the full standardised training set with the selected
# hyper-parameters and evaluate once on the held-out test set.

# The grid search evaluated 'minkowski' with p = 3; forward the same p here,
# otherwise the classifier would fall back to its default p.
best_p = 3 if best_m == 'minkowski' else 2

knn2 = KNeighborsClassifier(n_neighbors=best_k, algorithm=best_a, metric=best_m, p=best_p, weights=best_w, n_jobs=20)

# Measure against the timestamp taken right before this fit; subtracting a
# start time left over from an earlier cell would report a bogus duration.
start1 = time.perf_counter()
knn2.fit(X_train_std, y_train)
fit_time = time.perf_counter() - start1

start2 = time.perf_counter()
y_pred4 = knn2.predict(X_test_std)
pred_time = time.perf_counter() - start2

# Fixed label order so the matrix is always 10x10 with rows/columns 0..9.
c4 = confusion_matrix(y_test, y_pred4, labels=[0,1,2,3,4,5,6,7,8,9])

print(f"Accuracy: {np.trace(c4) / c4.sum():.6f} | Average fit time: {fit_time:.6f}s | Average predict time: {pred_time:.6f}s")
print('-------------------')
print('KNN parameters: a =', best_a, ',k =', best_k, ',m =', best_m, ',w =', best_w)

.

## Training on 100% of the samples with PCA reduction

In [None]:
# Load the train/test CSVs: the first column is used as the class label,
# every remaining column as a feature.
train_data = pd.read_csv("../datasets/fashion-mnist_train.csv")
X_train = train_data.iloc[:, 1:]
y_train = train_data.iloc[:, 0]

test_data = pd.read_csv("../datasets/fashion-mnist_test.csv")
X_test = test_data.iloc[:, 1:]
y_test = test_data.iloc[:, 0]

# Standardise BOTH sets with statistics estimated on the training set only.
# Scaling the test set with its own mean/std would put train and test
# features on different scales and leak test-set information.
# NOTE(review): relies on both CSVs using the same feature column names,
# because pandas aligns the statistics on column labels - confirm.
train_mean = X_train.mean()
# A constant column has std == 0; divide by 1 instead so it becomes 0, not NaN.
train_std = X_train.std().replace(0, 1)

X_train_std = (X_train - train_mean) / train_std
X_test_std = (X_test - train_mean) / train_std

In [None]:
# Grid search over k, distance metric and PCA n_components (weights fixed to
# 'distance'), scored by 5-fold stratified cross-validation on the full
# standardised training set.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
acc = []
i, p_val, best_acc, best_k, best_pca = 0, 0, 0, 0, 0
# Initialised up front so the summary line cannot hit an undefined name.
best_m = None

for k in range(1, 11):
    for m in ['euclidean', 'manhattan', 'chebyshev']:
        for pc in [0.8, 0.85, 0.9, 0.95]:

            acc_tmp, time_fit_tmp, time_pred_tmp = [], [], []

            for train_index, test_index in kf.split(X_train, y_train):

                X_trainCV3 = X_train_std.iloc[train_index, :]
                y_trainCV3 = y_train.iloc[train_index]
                X_testCV3 = X_train_std.iloc[test_index, :]
                y_testCV3 = y_train.iloc[test_index]

                # PCA is fitted on the training fold only and then applied to
                # both folds, so the validation fold does not influence it.
                # NOTE(review): the projection depends only on (pc, fold) but is
                # refitted for every k and m; caching it would save time at the
                # cost of memory.
                pca = PCA(n_components=pc)
                pca.fit(X_trainCV3)
                X_train_r = pca.transform(X_trainCV3)
                X_test_r = pca.transform(X_testCV3)

                knn3 = KNeighborsClassifier(n_neighbors=k, metric=m, weights='distance', n_jobs=20)

                knn_fit_with_time(knn3, X_train_r, y_trainCV3, time_fit_tmp)
                y_pred5 = knn_predict_with_time(knn3, X_test_r, time_pred_tmp)

                # Accuracy = correctly classified (diagonal) / all samples.
                c5 = confusion_matrix(y_testCV3, y_pred5, labels=[0,1,2,3,4,5,6,7,8,9])
                acc_tmp.append(np.trace(c5) / c5.sum())

            mean_acc = np.mean(acc_tmp)
            print(f"{i+1}. pca = {pc}, K = {k}, m = {m}, Average accuracy: {mean_acc:.6f} | Average fit time: {np.mean(time_fit_tmp):.6f}s | Average predict time: {np.mean(time_pred_tmp):.6f}s")
            acc.append(mean_acc)
            i += 1

            # Strict '>' keeps the first configuration that reaches the maximum,
            # which is also the one np.argmax reports below.
            if mean_acc > best_acc:
                best_acc = mean_acc
                best_k, best_m, best_pca = k, m, pc

        print('')

print('-------------------')
# The iterations above are printed 1-based, so shift the 0-based argmax by one.
print('Best accuracy is in iteration number:', np.argmax(acc) + 1, ' | for k =', best_k, ',m =', best_m, ',pca =', best_pca, ',w = distance')

#### Checking if different algorithms speed up the process

In [None]:
# Compare the tree-based neighbour search algorithms on the PCA-reduced data
# and keep the one with the lowest average fit time.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Start from +inf so the first measured algorithm always becomes the baseline.
best_time_fit = float('inf')
best_a = None

for a in ['ball_tree', 'kd_tree']:

    acc_tmp, time_fit_tmp, time_pred_tmp = [], [], []

    for train_index, test_index in kf.split(X_train, y_train):

        X_trainCV3 = X_train_std.iloc[train_index, :]
        y_trainCV3 = y_train.iloc[train_index]
        X_testCV3 = X_train_std.iloc[test_index, :]
        y_testCV3 = y_train.iloc[test_index]

        # PCA is fitted on the training fold only and then applied to both folds.
        pca = PCA(n_components=best_pca)
        pca.fit(X_trainCV3)
        X_train_r = pca.transform(X_trainCV3)
        X_test_r = pca.transform(X_testCV3)

        knn3 = KNeighborsClassifier(n_neighbors=best_k, algorithm=a, metric=best_m, weights='distance', n_jobs=20)

        knn_fit_with_time(knn3, X_train_r, y_trainCV3, time_fit_tmp)
        y_pred5 = knn_predict_with_time(knn3, X_test_r, time_pred_tmp)

        # Accuracy = correctly classified (diagonal) / all samples.
        c5 = confusion_matrix(y_testCV3, y_pred5, labels=[0,1,2,3,4,5,6,7,8,9])
        acc_tmp.append(np.trace(c5) / c5.sum())

    mean_fit_time = np.mean(time_fit_tmp)
    print(f"a = {a}, Average accuracy: {np.mean(acc_tmp):.6f} | Average fit time: {mean_fit_time:.6f}s | Average predict time: {np.mean(time_pred_tmp):.6f}s")

    # Lower is better: keep the algorithm with the smallest average fit time.
    # NOTE(review): only fit time is compared; predict time is not considered.
    if mean_fit_time < best_time_fit:
        best_time_fit = mean_fit_time
        best_a = a

print('')
print('-------------------')
print('Fastest average fit time is for a =', best_a)

##### Training on 100% training samples with PCA reduction, test on test dataset

In [None]:
# Fit PCA on the standardised training set with the selected n_components and
# project both the training and the test set with it.
pca = PCA(n_components=best_pca)
pca.fit(X_train_std)
X_train_r, X_test_r = pca.transform(X_train_std), pca.transform(X_test_std)

knn3 = KNeighborsClassifier(
    n_neighbors=best_k,
    algorithm=best_a,
    metric=best_m,
    weights="distance",
    n_jobs=20,
)

# Time a single fit on the reduced training set ...
start1 = time.time()
knn3.fit(X_train_r, y_train)
fit_time = time.time() - start1

# ... and a single prediction pass over the reduced test set.
start2 = time.time()
y_pred6 = knn3.predict(X_test_r)
pred_time = time.time() - start2

# Fixed label order so the matrix is always 10x10 with rows/columns 0..9.
c6 = confusion_matrix(y_test, y_pred6, labels=list(range(10)))
test_accuracy = np.trace(c6) / c6.sum()

print(f"Accuracy: {test_accuracy:.6f} | Average fit time: {fit_time:.6f}s | Average predict time: {pred_time:.6f}s")
print('-------------------')
print('KNN parameters: a =', best_a, ',k =', best_k, ',m =', best_m, ',pca =', best_pca, ',w = distance')

In [None]:
# Confusion matrix of the PCA + KNN model on the test set
# (c6 is built with labels 0-9 in ascending order).
print(c6)

In [None]:
# c6 was built with labels 0-9 in ascending order, whereas Series.unique()
# returns the labels in order of first appearance. Sort them so the class list
# lines up with the matrix rows.
# NOTE(review): assumes accuracy_per_class pairs classes with matrix rows
# positionally - confirm against the helper's definition.
print(accuracy_per_class(c6, np.sort(y_test.unique())))

In [None]:
# c6 was built with labels 0-9 in ascending order, whereas Series.unique()
# returns the labels in order of first appearance. Sort them so the class list
# lines up with the matrix rows.
# NOTE(review): assumes sensitivity_per_class pairs classes with matrix rows
# positionally - confirm against the helper's definition.
print(sensitivity_per_class(c6, np.sort(y_test.unique())))

## Examples of wrong classification

In [None]:
# Keep the predictions and the true labels under matching names so they can be
# compared position by position; the true labels are converted from a pandas
# Series to a NumPy array.
y_pred_np = y_pred6
y_test_np = y_test.to_numpy()

# Show the container type of each of the two objects.
print(type(y_pred_np))
print(type(y_test_np))

In [None]:
def differences(a, b):
    """Return the positions at which ``a`` and ``b`` hold different values.

    Parameters
    ----------
    a, b : sequences or 1-D arrays
        Compared element by element, in order.

    Returns
    -------
    numpy.ndarray
        Integer array of the mismatching positions in ascending order.
        It is empty, and still of integer dtype, when nothing differs,
        so it can always be used for indexing.
    """
    # enumerate/zip walks both inputs once; the list is built in linear time
    # and no builtin name is shadowed.
    mismatched = [j for j, (x, y) in enumerate(zip(a, b)) if x != y]
    return np.asarray(mismatched, dtype=int)

In [None]:
# Positions at which the prediction and the true label disagree;
# print the first nine of them.
diff = differences(y_pred_np, y_test_np)
print(diff[:9])

In [None]:
# Plot a 3x3 grid of misclassified test images with predicted and true labels.
plt.figure(figsize=(8, 8))

# Skip the first 100 mismatches and show the next nine. Slicing (rather than
# indexing diff[j + 100]) simply yields fewer images when there are fewer than
# 109 misclassified samples, instead of raising an IndexError.
first_shown = 100
shown = diff[first_shown:first_shown + 9]

for j, idx in enumerate(shown):
    plt.subplot(3, 3, j + 1)

    # One row of X_test is reshaped into a 28x28 8-bit greyscale image.
    arr = X_test.iloc[idx, :].to_numpy()
    arr = arr.reshape(28, 28)
    arr = arr.astype(np.uint8)
    img = Image.fromarray(arr, "L")

    plt.imshow(img, cmap='gray')
    plt.title(f"Predicted {y_pred_np[idx]}, Real {y_test_np[idx]}")
    plt.axis("off")

plt.show()