# kNNAlgorithm class testing

In [15]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [21]:
import sys
sys.path.append('../')
from src.datapipeline import MLDataset
from src.algorithms.kNNAlgorithm import kNNAlgorithm
from pathlib import Path
import itertools
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

In [22]:
data_path = Path('../data/folded_datasets/raw/vowel/')
save_path = Path('../Results/Tables/vowel/')
ds = MLDataset(data_path)

In [23]:
params = {'k':[1,3,5,7],
          'distance': ['euclidean', 'manhattan', 'cosine'],
          'votes': ['majority', 'sheppard', 'idw'],
          'weights': ['equal','correlation','information_gain']}
keys, values = zip(*params.items())

In [24]:
params_combs = [dict(zip(keys, v)) for v in itertools.product(*values)]
results, fold_means = {},{}
for n, comb in enumerate(params_combs[:2]):
    fold_res = []
    for fold, (TrainMatrix, TestMatrix) in enumerate(ds):
        knnalg = kNNAlgorithm()
        knnalg.fit(TrainMatrix, comb['weights'])
        preds, time = knnalg.predict_test(TestMatrix.iloc[:,:-1], comb['k'], comb['distance'], comb['votes'], True)
        acc, corr, incorr = knnalg.evaluate(TestMatrix['y_true'])
        results[n*10+(fold)], fold_res  = [fold, *comb.values(), acc,corr,incorr,time], fold_res + [[acc,corr,incorr,time]]
    fold_means[n] = np.array(fold_res).mean(0)
results, fold_means = pd.DataFrame(results).T, pd.DataFrame(fold_means).T
results.columns, fold_means.columns = ['fold', 'k', 'distance', 'votes', 'weighting', 'acc', 'corr', 'incorr', 'time'], ['acc', 'corr', 'incorr', 'time']
results.to_csv(save_path.as_posix() + '/fold_res.csv'), fold_means.to_csv(save_path.as_posix() + '/fold_mean_res.csv')

## Testing one at a time case with iris

In [7]:
def to_dataframe(y):
    if not isinstance(y, pd.DataFrame):
        return pd.DataFrame(y)
        
def iris_dataset():
    x, y = load_iris(return_X_y=True, as_frame=True)
    y = to_dataframe(y)
    return x, y

In [11]:
X,y = iris_dataset()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42, stratify = y)
TrainData = pd.concat([X_train, y_train], axis = 1)
TrainData = TrainData.rename(columns={"target": "y_true"})

knnalg = kNNAlgorithm()
knnalg.fit(TrainData, 'equal')
for i in range(len(X_test)):
    print('Prediction for case {} is correct: {}'.format(i,knnalg.predict(X_test.iloc[i], 3, 'euclidean', 'majority')==y_test.iloc[i]))

Prediction for case 0 is correct: target    True
Name: 58, dtype: bool
Prediction for case 1 is correct: target    True
Name: 134, dtype: bool
Prediction for case 2 is correct: target    True
Name: 147, dtype: bool
Prediction for case 3 is correct: target    True
Name: 69, dtype: bool
Prediction for case 4 is correct: target    True
Name: 107, dtype: bool
Prediction for case 5 is correct: target    True
Name: 42, dtype: bool
Prediction for case 6 is correct: target    True
Name: 38, dtype: bool
Prediction for case 7 is correct: target    True
Name: 7, dtype: bool
Prediction for case 8 is correct: target    True
Name: 132, dtype: bool
Prediction for case 9 is correct: target    True
Name: 75, dtype: bool
Prediction for case 10 is correct: target    True
Name: 14, dtype: bool
Prediction for case 11 is correct: target    True
Name: 116, dtype: bool
Prediction for case 12 is correct: target    True
Name: 51, dtype: bool
Prediction for case 13 is correct: target    True
Name: 77, dtype: boo

In [14]:
# Using the method to predict all the test set
print(knnalg.predict_test(X_test, 3, 'euclidean', 'majority', True))
knnalg.evaluate(y_test['target'])

([1, 2, 2, 1, 2, 0, 0, 0, 2, 1, 0, 2, 1, 1, 0], 0.09569478034973145)


(1.0, 15, 0)