Grid Searches
=================

Grid search with built-in cross-validation

In [4]:
# GridSearchCV lives in `sklearn.model_selection` since scikit-learn 0.18; the
# old `sklearn.grid_search` module was removed in 0.20. Try the current
# location first and fall back to the legacy one on old installations.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:  # scikit-learn < 0.18
    from sklearn.grid_search import GridSearchCV
from sklearn.svm import SVC

In [5]:
from sklearn.datasets import load_digits
# `train_test_split` moved to `sklearn.model_selection` in scikit-learn 0.18
# and `sklearn.cross_validation` was removed in 0.20.
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18
    from sklearn.cross_validation import train_test_split

digits = load_digits()
# Fix the seed so that a fresh "Restart & Run All" always yields the same
# train/test partition (and therefore the same scores further down).
X_train, X_test, y_train, y_test = train_test_split(
    digits.data, digits.target, random_state=0)

Define parameter grid:

In [6]:
import numpy as np

# Both hyper-parameters are searched on a base-10 logarithmic scale:
# C over 1e-3 .. 1e2 (6 values) and gamma over 1e-5 .. 1e-1 (5 values).
c_exponents = np.arange(-3, 3)
gamma_exponents = np.arange(-5, 0)
param_grid = {
    'C': 10. ** c_exponents,
    'gamma': 10. ** gamma_exponents,
}

# Print small floats in fixed-point rather than scientific notation.
np.set_printoptions(suppress=True)
print(param_grid)

{'gamma': array([ 0.00001,  0.0001 ,  0.001  ,  0.01   ,  0.1    ]), 'C': array([   0.001,    0.01 ,    0.1  ,    1.   ,   10.   ,  100.   ])}


In [8]:
# Exhaustive search over `param_grid` for an SVC, scored with 5-fold
# cross-validation; verbose=3 logs every individual fit.
grid_search = GridSearchCV(
    estimator=SVC(),
    param_grid=param_grid,
    cv=5,
    verbose=3,
)

A GridSearchCV object behaves just like a normal classifier.

In [9]:
# Run the search on the training split only: 5 folds for each of the
# 6 x 5 = 30 (C, gamma) candidates, i.e. 150 fits in total.
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV] gamma=1e-05, C=0.001 ............................................
[CV] ................... gamma=1e-05, C=0.001, score=0.105839 -   0.3s
[CV] gamma=1e-05, C=0.001 ............................................
[CV] ................... gamma=1e-05, C=0.001, score=0.209559 -   0.3s
[CV] gamma=1e-05, C=0.001 ............................................
[CV] ................... gamma=1e-05, C=0.001, score=0.104089 -   0.3s
[CV] gamma=1e-05, C=0.001 ............................................
[CV] ................... gamma=1e-05, C=0.001, score=0.104869 -   0.3s
[CV] gamma=1e-05, C=0.001 ............................................
[CV] ................... gamma=1e-05, C=0.001, score=0.105660 -   0.3s
[CV] gamma=0.0001, C=0.001 ...........................................
[CV] .................. gamma=0.0001, C=0.001, score=0.105839 -   0.3s
[CV] gamma=0.0001, C=0.001 ...........................................
[CV] ..........

[Parallel(n_jobs=1)]: Done  31 tasks       | elapsed:    9.1s


[CV] ................... gamma=0.0001, C=0.01, score=0.209559 -   0.3s
[CV] gamma=0.0001, C=0.01 ............................................
[CV] ................... gamma=0.0001, C=0.01, score=0.104089 -   0.3s
[CV] gamma=0.0001, C=0.01 ............................................
[CV] ................... gamma=0.0001, C=0.01, score=0.104869 -   0.3s
[CV] gamma=0.0001, C=0.01 ............................................
[CV] ................... gamma=0.0001, C=0.01, score=0.105660 -   0.3s
[CV] gamma=0.001, C=0.01 .............................................
[CV] .................... gamma=0.001, C=0.01, score=0.105839 -   0.3s
[CV] gamma=0.001, C=0.01 .............................................
[CV] .................... gamma=0.001, C=0.01, score=0.209559 -   0.3s
[CV] gamma=0.001, C=0.01 .............................................
[CV] .................... gamma=0.001, C=0.01, score=0.104089 -   0.3s
[CV] gamma=0.001, C=0.01 .............................................
[CV] .

[Parallel(n_jobs=1)]: Done 127 tasks       | elapsed:   31.3s


[CV] ................... gamma=1e-05, C=100.0, score=0.970037 -   0.1s
[CV] gamma=1e-05, C=100.0 ............................................
[CV] ................... gamma=1e-05, C=100.0, score=0.984906 -   0.1s
[CV] gamma=0.0001, C=100.0 ...........................................
[CV] .................. gamma=0.0001, C=100.0, score=0.992701 -   0.1s
[CV] gamma=0.0001, C=100.0 ...........................................
[CV] .................. gamma=0.0001, C=100.0, score=0.974265 -   0.1s
[CV] gamma=0.0001, C=100.0 ...........................................
[CV] .................. gamma=0.0001, C=100.0, score=0.992565 -   0.1s
[CV] gamma=0.0001, C=100.0 ...........................................
[CV] .................. gamma=0.0001, C=100.0, score=0.962547 -   0.1s
[CV] gamma=0.0001, C=100.0 ...........................................
[CV] .................. gamma=0.0001, C=100.0, score=0.984906 -   0.1s
[CV] gamma=0.001, C=100.0 ............................................
[CV] .

[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:   35.5s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'gamma': array([ 0.00001,  0.0001 ,  0.001  ,  0.01   ,  0.1    ]), 'C': array([   0.001,    0.01 ,    0.1  ,    1.   ,   10.   ,  100.   ])},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=3)

In [10]:
# Predict a digit label for every sample in the held-out test split.
grid_search.predict(X_test)

array([3, 3, 0, 3, 5, 7, 5, 8, 2, 9, 0, 6, 8, 6, 9, 4, 7, 3, 8, 5, 2, 3, 7,
       8, 7, 5, 6, 3, 8, 7, 2, 3, 7, 9, 7, 8, 1, 3, 8, 8, 5, 5, 2, 9, 1, 4,
       8, 4, 5, 6, 9, 5, 8, 9, 9, 5, 0, 5, 7, 8, 7, 5, 4, 7, 4, 7, 1, 3, 3,
       7, 3, 5, 9, 0, 5, 4, 4, 9, 4, 9, 1, 3, 2, 9, 2, 9, 1, 0, 6, 4, 7, 9,
       8, 2, 8, 2, 1, 2, 4, 9, 9, 5, 4, 9, 7, 8, 1, 5, 4, 0, 6, 9, 7, 5, 2,
       6, 5, 9, 3, 2, 3, 6, 0, 0, 9, 4, 6, 9, 7, 5, 3, 1, 5, 8, 8, 0, 0, 9,
       1, 8, 4, 1, 5, 6, 6, 4, 9, 5, 9, 0, 0, 1, 0, 6, 3, 1, 4, 6, 0, 7, 2,
       2, 5, 3, 3, 7, 8, 8, 5, 1, 9, 3, 4, 1, 9, 3, 4, 9, 7, 7, 9, 9, 5, 6,
       1, 0, 3, 6, 5, 0, 6, 0, 4, 3, 8, 1, 4, 7, 1, 9, 8, 1, 8, 0, 2, 1, 6,
       9, 1, 4, 5, 5, 4, 0, 6, 1, 4, 3, 3, 8, 2, 2, 7, 0, 6, 9, 2, 0, 5, 7,
       7, 9, 5, 0, 3, 7, 2, 2, 2, 0, 1, 8, 4, 7, 6, 7, 2, 9, 4, 0, 9, 7, 0,
       0, 7, 6, 8, 1, 7, 5, 5, 9, 5, 5, 4, 0, 5, 4, 7, 4, 6, 4, 9, 5, 3, 7,
       2, 8, 9, 4, 6, 2, 9, 6, 0, 5, 4, 6, 4, 6, 7, 3, 4, 6, 1, 2, 9, 1, 2,
       8, 8,

In [13]:
# Accuracy on the held-out test split. With the default scoring this is the
# same as: np.mean(grid_search.predict(X_test) == y_test)
grid_search.score(X_test, y_test)

0.99333333333333329

In [14]:
# The (C, gamma) combination selected by the cross-validated search.
grid_search.best_params_

{'C': 10.0, 'gamma': 0.001}

In [15]:
# We extract just the scores
%matplotlib notebook
import matplotlib.pyplot as plt

scores = [x[1] for x in grid_search.grid_scores_]
scores = np.array(scores).reshape(6, 5)

plt.matshow(scores)
plt.xlabel('gamma')
plt.ylabel('C')
plt.colorbar()
plt.xticks(np.arange(5), param_grid['gamma'])
plt.yticks(np.arange(6), param_grid['C']);

<IPython.core.display.Javascript object>

# Exercises
Use `GridSearchCV` to tune the `n_neighbors` parameter of `KNeighborsClassifier`.

In [32]:
# %load solutions/grid_search_k_neighbors.py
from sklearn.neighbors import KNeighborsClassifier

# Candidate neighbourhood sizes, listed from largest (100) to smallest (1).
param_grid = dict(n_neighbors=[100, 50, 20, 10, 9, 7, 5, 3, 1])
print(param_grid)

# 5-fold cross-validated search around a k-nearest-neighbours classifier;
# verbose=3 logs every individual fit.
grid_search = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid,
    cv=5,
    verbose=3,
)

{'n_neighbors': [100, 50, 20, 10, 9, 7, 5, 3, 1]}


In [33]:
# Run the k-NN search on the training split: 5 folds for each of the
# 9 n_neighbors candidates, i.e. 45 fits in total.
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV] n_neighbors=100 .................................................
[CV] ........................ n_neighbors=100, score=0.883212 -   0.0s
[CV] n_neighbors=100 .................................................
[CV] ........................ n_neighbors=100, score=0.915441 -   0.0s
[CV] n_neighbors=100 .................................................
[CV] ........................ n_neighbors=100, score=0.918216 -   0.0s
[CV] n_neighbors=100 .................................................
[CV] ........................ n_neighbors=100, score=0.913858 -   0.0s
[CV] n_neighbors=100 .................................................
[CV] ........................ n_neighbors=100, score=0.890566 -   0.0s
[CV] n_neighbors=50 ..................................................
[CV] ......................... n_neighbors=50, score=0.919708 -   0.0s
[CV] n_neighbors=50 ..................................................
[CV] ............

[Parallel(n_jobs=1)]: Done  31 tasks       | elapsed:    1.2s


[CV] .......................... n_neighbors=5, score=0.988679 -   0.0s
[CV] n_neighbors=3 ...................................................
[CV] .......................... n_neighbors=3, score=0.978102 -   0.0s
[CV] n_neighbors=3 ...................................................
[CV] .......................... n_neighbors=3, score=0.966912 -   0.0s
[CV] n_neighbors=3 ...................................................
[CV] .......................... n_neighbors=3, score=0.985130 -   0.0s
[CV] n_neighbors=3 ...................................................
[CV] .......................... n_neighbors=3, score=0.973783 -   0.0s
[CV] n_neighbors=3 ...................................................
[CV] .......................... n_neighbors=3, score=0.984906 -   0.0s
[CV] n_neighbors=1 ...................................................
[CV] .......................... n_neighbors=1, score=0.985401 -   0.0s
[CV] n_neighbors=1 ...................................................
[CV] .

[Parallel(n_jobs=1)]: Done  45 out of  45 | elapsed:    1.7s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_neighbors': [100, 50, 20, 10, 9, 7, 5, 3, 1]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=3)

In [34]:
# Accuracy of the k-NN search result on the held-out test split.
grid_search.score(X_test, y_test)

0.98888888888888893

In [35]:
# The n_neighbors value selected by the cross-validated search.
grid_search.best_params_

{'n_neighbors': 1}

In [37]:
# %load solutions/grid_search_k_neighbors.py
from sklearn.neighbors import KNeighborsClassifier

# Neighbourhood sizes to try.
param_grid = dict(n_neighbors=[1, 3, 5, 7, 10])

# No `cv` is passed here, so the library's default cross-validation is used.
# `fit` returns the fitted search object, so construction and fitting chain.
grid = GridSearchCV(KNeighborsClassifier(), param_grid=param_grid).fit(X_train, y_train)

# Evaluate the selected model on both splits, then report.
train_accuracy = grid.score(X_train, y_train)
test_accuracy = grid.score(X_test, y_test)

print("best parameters: %s" % grid.best_params_)
print("Training set accuracy: %s" % train_accuracy)
print("Test set accuracy: %s" % test_accuracy)


best parameters: {'n_neighbors': 1}
Training set accuracy: 1.0
Test set accuracy: 0.988888888889


In [39]:
# Distinct class labels present in the training split.
np.unique(y_train)

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [40]:
# Distinct class labels present in the test split.
np.unique(y_test)

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])