In [13]:
import timeit
import pandas as pd
import numpy as np

from sklearn import datasets
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.neighbors import KNeighborsClassifier

## Using the Iris dataset. Because it has more than two classes, the code would need to change a bit in order to use precision, recall, or F1 score as the metric.

In [14]:
# Scoring metric name shared by the cross-validated search and the final test score.
score_metric = 'accuracy'

# Load Iris; keep the targets and only the first two feature columns.
iris = datasets.load_iris()
y = iris.target
X = iris.data[:, :2]

# Hold out 33% of the samples as a test split (fixed seed so the split is repeatable).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

# 10-fold splitter used as the cross-validation strategy.
kf = KFold(n_splits=10)

### Common grid-search hyperparameter settings for KNN:

In [15]:
# Two-step pipeline: scale the features, then classify with k-nearest neighbours.
knn_pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('classifier', KNeighborsClassifier()),
])

# Single-element lists pin a setting; only `classifier__weights` has more than
# one candidate, so the search compares exactly two configurations.
grid_params = [
    {
        'scaler__with_mean': [True],
        'scaler__with_std': [True],
        # other options: 'auto', 'kd_tree', 'brute'
        'classifier__algorithm': ['ball_tree'],
        'classifier__leaf_size': [186],
        'classifier__n_neighbors': [3],
        'classifier__weights': ['uniform', 'distance'],
    }
]

grid = GridSearchCV(
    estimator=knn_pipe,
    param_grid=grid_params,
    cv=kf,
    scoring=score_metric,
)

# Time exactly one fit of the whole search (wall-clock seconds).
fit_timer = timeit.Timer(lambda: grid.fit(X_train, y_train))
timing = fit_timer.timeit(number=1)

# Score the fitted search on the held-out test split.
score = grid.score(X_test, y_test)

print(f"KNN Testing Score: {score}")
print("seconds:", timing)
# Best CV score, the winning parameter set, and the fitted best pipeline, one per line.
print(grid.best_score_, grid.best_params_, grid.best_estimator_, sep="\n")

KNN Testing Score: 0.78
seconds: 0.07386538700006895
0.72
{'classifier__algorithm': 'ball_tree', 'classifier__leaf_size': 186, 'classifier__n_neighbors': 3, 'classifier__weights': 'uniform', 'scaler__with_mean': True, 'scaler__with_std': True}
Pipeline(steps=[('scaler', StandardScaler()),
                ('classifier',
                 KNeighborsClassifier(algorithm='ball_tree', leaf_size=186,
                                      n_neighbors=3))])
