This is trying out KNN classifiers for the MNIST data set

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import datasets
from sklearn.metrics import classification_report, confusion_matrix

In [16]:
digits = datasets.load_digits()

n_samples = len(digits.images)
data = digits.images.reshape((n_samples, -1))

X_train, X_test, y_train, y_test = train_test_split(digits.images, digits.target, \
                                                    test_size=.4, random_state=0)

In [17]:
# defaults to 5 neighbors for now
clf = KNeighborsClassifier()
X_train = X_train.reshape(X_train.shape[0],64)
X_test = X_test.reshape(X_test.shape[0],64)

In [18]:
clf.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [19]:
# we get about 98% with just vanilla 5-NN classification
expected = y_test
predicted = clf.predict(X_test)
print("Classification report for classifier %s:\n%s\n"
      % (clf, classification_report(expected, predicted, digits=5)))
print("Confusion matrix:\n%s" % confusion_matrix(expected, predicted))

Classification report for classifier KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'):
             precision    recall  f1-score   support

          0    1.00000   1.00000   1.00000        60
          1    0.93590   1.00000   0.96689        73
          2    0.98592   0.98592   0.98592        71
          3    0.95833   0.98571   0.97183        70
          4    1.00000   0.98413   0.99200        63
          5    0.98864   0.97753   0.98305        89
          6    0.98701   1.00000   0.99346        76
          7    0.97015   1.00000   0.98485        65
          8    1.00000   0.89744   0.94595        78
          9    0.97297   0.97297   0.97297        74

avg / total    0.97978   0.97914   0.97900       719


Confusion matrix:
[[60  0  0  0  0  0  0  0  0  0]
 [ 0 73  0  0  0  0  0  0  0  0]
 [ 0  0 70  0  0  0  0  1  0  0]
 [ 0  0  1 69  0  0  0  0  0  0]
 [ 0  0 

In [20]:
# look at all the parameters associated to KNN
print clf.get_params()


{'n_neighbors': 5, 'n_jobs': 1, 'algorithm': 'auto', 'metric': 'minkowski', 'metric_params': None, 'p': 2, 'weights': 'uniform', 'leaf_size': 30}


In [21]:
# now lets try some hyperparameter tuning
param_grid = [
  {'n_neighbors': [2, 4, 8, 16], 'weights': ['uniform', 'distance']},
  {'n_neighbors': [2, 4, 8, 16], 'weights': ['uniform', 'distance'], 'p': [1, 2]},
 ]

In [22]:
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

scores = ['precision', 'recall']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print '\n'

    clf = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5,
                       scoring='%s_macro' % score)
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print '\n'
    print(clf.best_params_)
    print '\n'
    print("Grid scores on development set:")
    print '\n'
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.5f (+/-%0.05f) for %r"
              % (mean, std * 2, params))
    print '\n'

    print("Detailed classification report:")
    print '\n'
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print '\n'
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred, digits=5))
    print '\n'

# Tuning hyper-parameters for precision


Best parameters set found on development set:


{'n_neighbors': 2, 'weights': 'distance'}


Grid scores on development set:


0.97796 (+/-0.02160) for {'n_neighbors': 2, 'weights': 'uniform'}
0.98817 (+/-0.02115) for {'n_neighbors': 2, 'weights': 'distance'}
0.98271 (+/-0.02461) for {'n_neighbors': 4, 'weights': 'uniform'}
0.98520 (+/-0.02394) for {'n_neighbors': 4, 'weights': 'distance'}
0.97733 (+/-0.01968) for {'n_neighbors': 8, 'weights': 'uniform'}
0.97719 (+/-0.01954) for {'n_neighbors': 8, 'weights': 'distance'}
0.96284 (+/-0.03019) for {'n_neighbors': 16, 'weights': 'uniform'}
0.96975 (+/-0.02753) for {'n_neighbors': 16, 'weights': 'distance'}
0.96927 (+/-0.01807) for {'n_neighbors': 2, 'weights': 'uniform', 'p': 1}
0.97708 (+/-0.01954) for {'n_neighbors': 2, 'weights': 'distance', 'p': 1}
0.97796 (+/-0.02160) for {'n_neighbors': 2, 'weights': 'uniform', 'p': 2}
0.98817 (+/-0.02115) for {'n_neighbors': 2, 'weights': 'distance', 'p': 2}
