In [0]:
import pandas as pd

In [0]:
dataset = pd.read_csv('https://raw.githubusercontent.com/PacktWorkshops/The-Data-Science-Workshop/master/Chapter08/Dataset/Breast-cancer-wisconsin.data')

In [36]:
dataset.head

<bound method NDFrame.head of      1000025  5   1  1.1  1.2  2 1.3   3  1.4  1.5  2.1
0    1002945  5   4    4    5  7  10   3    2    1    2
1    1015425  3   1    1    1  2   2   3    1    1    2
2    1016277  6   8    8    1  3   4   3    7    1    2
3    1017023  4   1    1    3  2   1   3    1    1    2
4    1017122  8  10   10    8  7  10   9    7    1    4
..       ... ..  ..  ...  ... ..  ..  ..  ...  ...  ...
693   776715  3   1    1    1  3   2   1    1    1    2
694   841769  2   1    1    1  2   1   1    1    1    2
695   888820  5  10   10    3  7   3   8   10    2    4
696   897471  4   8    6    4  3   4  10    6    1    4
697   897471  4   8    8    5  4   5  10    4    1    4

[698 rows x 11 columns]>

In [0]:
from sklearn import neighbors, datasets, model_selection

In [0]:
# dataset
cancer = datasets.load_breast_cancer()

In [0]:
y = cancer.target

In [0]:
X = cancer.data

In [0]:
# no arguments specified
knn = neighbors.KNeighborsClassifier()

In [0]:
# 10 folds, scored on precision
cv = model_selection.cross_val_score(knn, X, y, cv=10, scoring='precision')

In [43]:
# precision scores
print(cv)

[0.91666667 0.85       0.91666667 0.94736842 0.94594595 0.94444444
 0.97222222 0.92105263 0.96969697 0.97142857]


In [44]:
# average over all folds
print(round(cv.mean(), 2))

0.94


In [45]:
# k = 15
knn = neighbors.KNeighborsClassifier(n_neighbors=15)

cv = model_selection.cross_val_score(knn, X, y, cv=10, scoring='precision')

print(round(cv.mean(), 2))

0.93


In [46]:
def evaluate_knn(k):
  knn = neighbors.KNeighborsClassifier(n_neighbors=k)
  cv = model_selection.cross_val_score(knn, X, y, cv=10, scoring='precision')
  print(round(cv.mean(), 2))
evaluate_knn(k=7)
evaluate_knn(k=3)
evaluate_knn(k=1)

0.93
0.93
0.92


In [47]:
# k =5, weights evaluated using distance
knn = neighbors.KNeighborsClassifier(n_neighbors=5, weights='distance')

cv = model_selection.cross_val_score(knn, X, y, cv=10, scoring='precision')
print(round(cv.mean(), 2))

0.93


We therefore conclude that the default hyperparameterization is the optimal one in this case.

In [0]:
# hyperparameter grid
grid = {
    'k': [1, 3, 5, 7]
}

n the code snippet, we have used a dictionary {} and set the k values in a Python dictionary.

In the next part of the code snippet, to conduct the search, we iterate through the grid, fitting a model for each value of k, each time evaluating the model through 10-fold cross-validation.

At the end of each iteration, we extract, format, and report back the mean precision score after cross-validation via the print method:

In [49]:
# for every value of k in the grid
for k in grid['k']:

    # initialize the knn estimator
    knn = neighbors.KNeighborsClassifier(n_neighbors=k)
	
    # conduct a 10-fold cross-validation
    cv = model_selection.cross_val_score(knn, X, y, cv=10, scoring='precision')

    # calculate the average precision value over all folds
    cv_mean = round(cv.mean(), 3)

    # report the result
    print('With k = {}, mean precision = {}'.format(k, cv_mean))

With k = 1, mean precision = 0.919
With k = 3, mean precision = 0.928
With k = 5, mean precision = 0.936
With k = 7, mean precision = 0.931


We can see from the output that k = 5 is the best hyperparameterization found, with a mean precision score of roughly 94%. Increasing k to 7 didn't significantly improve performance. It is important to note that the only parameter we are changing here is k and that each time the k-NN estimator is initialized, it is done with the remaining hyperparameters set to their default values.

To make this point clear, we can run the same loop, this time just printing the hyperparameterization that will be tried:

In [50]:
# for every value of k in the grid
for k in grid['k']:

    # initialize the knn estimator
    knn = neighbors.KNeighborsClassifier(n_neighbors=k)

    # print the hyperparameterization
    print(knn.get_params())

{'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 1, 'p': 2, 'weights': 'uniform'}
{'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 3, 'p': 2, 'weights': 'uniform'}
{'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 5, 'p': 2, 'weights': 'uniform'}
{'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 7, 'p': 2, 'weights': 'uniform'}


You can see from the output that the only parameter we are changing is k; everything else remains the same in each iteration.

In [53]:
# hyperparameter grid
grid = {
    'k': [1, 3, 5, 7],
    'weight_function': ['uniform', 'distance']
}

# for every value of k in the grid
for k in grid['k']:

    # and every possible weight_function in the grid
    for weight_function in grid['weight_function']:

      # initialize the knn estimator
      knn = neighbors.KNeighborsClassifier(n_neighbors=k, weights=weight_function)

      # conduct a 10-fold cross-validation
      cv = model_selection.cross_val_score(knn, X, y, cv=10, scoring='precision')

      # calculate the average precision value over all folds
      cv_mean = round(cv.mean(), 3)

      # report the result
      print('With k = {} and weight function = {}, mean precision = {}'.format(k, weight_function, cv_mean))

With k = 1 and weight function = uniform, mean precision = 0.919
With k = 1 and weight function = distance, mean precision = 0.919
With k = 3 and weight function = uniform, mean precision = 0.928
With k = 3 and weight function = distance, mean precision = 0.929
With k = 5 and weight function = uniform, mean precision = 0.936
With k = 5 and weight function = distance, mean precision = 0.93
With k = 7 and weight function = uniform, mean precision = 0.931
With k = 7 and weight function = distance, mean precision = 0.926


In [54]:
# for every value of k in the grid
for k in grid['k']:

    # and every possible weight_function in the grid
    for weight_function in grid['weight_function']:
  	
      # initialize the knn estimator
      knn = neighbors.KNeighborsClassifier(n_neighbors=k, weights=weight_function)
      
      # print the hyperparameterizations
      print(knn.get_params())

{'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 1, 'p': 2, 'weights': 'uniform'}
{'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 1, 'p': 2, 'weights': 'distance'}
{'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 3, 'p': 2, 'weights': 'uniform'}
{'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 3, 'p': 2, 'weights': 'distance'}
{'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 5, 'p': 2, 'weights': 'uniform'}
{'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 5, 'p': 2, 'weights': 'distance'}
{'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n

This implementation, while great for demonstrating how the grid search process works, may not practical when trying to evaluate estimators that have 3, 4, or even 10 different types of hyperparameters, each with a multitude of possible settings.