In [61]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

%run util.ipynb

# knn model 

In [62]:
# TODO: add a confusion matrix and a learning curve

### default hyperparams

In [63]:
# Load features and labels (get_data is presumably provided by `%run util.ipynb` — confirm).
X, Y = get_data()

# Hold out 30% of the rows for testing; the fixed seed makes the split reproducible.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=42,
)

In [64]:
# Standardize the features: the scaler is fit on the training split only and the
# same fitted transform is then applied to both splits.
# TODO: note that the paper did not apply scaling beforehand.

scaler = StandardScaler().fit(Xtrain)

Xtrain = scaler.transform(Xtrain)
Xtest = scaler.transform(Xtest)

In [65]:
# Baseline classifier: 5 nearest neighbours under Euclidean distance.
knn = KNeighborsClassifier(
    n_neighbors=5,
    metric='euclidean',
)

# Fit on the (scaled) training split; the fitted estimator is the cell's output.
knn.fit(X=Xtrain, y=Ytrain)

In [66]:
# Predict the held-out test split with the baseline model and report its accuracy.
Ypred = knn.predict(X=Xtest)

accuracy_score(y_true=Ytest, y_pred=Ypred)

0.6979166666666666

In [67]:
# Per-class precision / recall / F1 for the current test-set predictions.
print(classification_report(y_true=Ytest, y_pred=Ypred))

              precision    recall  f1-score   support

           0       0.66      0.86      0.75        50
           1       0.77      0.52      0.62        46

    accuracy                           0.70        96
   macro avg       0.72      0.69      0.69        96
weighted avg       0.72      0.70      0.69        96



### tuning hyperparams using gridsearch

In [68]:
# https://www.datasklr.com/select-classification-methods/k-nearest-neighbors
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
# https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html

In [69]:
# Inspect the hyperparameters of the baseline model before tuning them.

knn.get_params(deep=True)

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'euclidean',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [70]:
# Search space: every k from 1 to 30 combined with three distance metrics.
param_grid_knn = {
    'n_neighbors': range(1, 31),
    'metric': ['euclidean', 'manhattan', 'chebyshev'],
}

# Exhaustive grid search scored by accuracy under 10-fold cross-validation,
# parallelised with n_jobs=-1.
# NOTE(review): Xtrain is standardized on the whole training split before this
# search is fit, so every validation fold has contributed to the scaling
# statistics. Consider wrapping StandardScaler + KNeighborsClassifier in a
# Pipeline so the scaler is refit inside each fold.
grid_search_knn = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid_knn,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
)


In [71]:
# Run the cross-validated search on the training split only; the test split stays untouched.
grid_search_knn.fit(X=Xtrain, y=Ytrain)

In [72]:
# Winning hyperparameters and their mean cross-validated accuracy
# (measured on held-out folds of the training split, not on the test split).
(
    grid_search_knn.best_params_,
    grid_search_knn.best_score_,
)

({'metric': 'chebyshev', 'n_neighbors': 13}, np.float64(0.6942687747035573))

In [73]:
# Evaluate the tuned model on the held-out test split. best_estimator_ is the
# model refit with the winning hyperparameters; GridSearchCV.predict delegates
# to it, so predicting through best_knn gives the same labels.
best_knn = grid_search_knn.best_estimator_
Ypred = best_knn.predict(Xtest)

accuracy_score(y_true=Ytest, y_pred=Ypred)

0.6666666666666666

In [74]:
# Per-class precision / recall / F1 for the current test-set predictions.
print(classification_report(y_true=Ytest, y_pred=Ypred))

              precision    recall  f1-score   support

           0       0.64      0.84      0.72        50
           1       0.73      0.48      0.58        46

    accuracy                           0.67        96
   macro avg       0.68      0.66      0.65        96
weighted avg       0.68      0.67      0.65        96



### check for overfitting 

In [75]:
# Compare performance on the training split with the test split:
# a training accuracy far above the test accuracy would suggest overfitting.

YtrainPred = grid_search_knn.predict(X=Xtrain)

accuracy_score(y_true=Ytrain, y_pred=YtrainPred)


0.7354260089686099

In [76]:
# Cross-validation score, i.e. the mean accuracy on the held-out subsets of the training data during k-fold CV (an estimate of how the model is expected to perform on unseen data).

grid_search_knn.best_score_

np.float64(0.6942687747035573)