In [1]:
from sklearn.datasets import fetch_openml
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler

In [2]:
mnist = fetch_openml('mnist_784', version=1)
mnist.keys()

dict_keys(['data', 'target', 'feature_names', 'DESCR', 'details', 'categories', 'url'])

In [3]:
X, y = mnist["data"], mnist["target"]
print(X.shape)
print(y.shape)

(70000, 784)
(70000,)


In [4]:
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(60000, 784)
(10000, 784)
(60000,)
(10000,)


In [5]:
X_train = StandardScaler().fit_transform(X_train)

In [6]:
kn = KNeighborsClassifier(n_jobs=-1)
kn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=5, p=2,
                     weights='uniform')

In [19]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'weights': ['uniform', 'distance'],
    'n_neighbors': [3, 5, 7]
}

grid = GridSearchCV(kn, param_grid)

In [20]:
grid.fit(X_train, y_train)



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=-1,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=None,
             param_grid={'n_neighbors': [3, 5, 7],
                         'weights': ['uniform', 'distance']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [21]:
best_kn = grid.best_estimator_

In [22]:
grid.best_params_

{'n_neighbors': 5, 'weights': 'distance'}

In [23]:
grid.best_score_

0.9421

In [24]:
predict = best_kn.predict(X_test)

In [25]:
correct = sum(predict == y_test)
total = len(y_test)
print(correct, total)

7983 10000


In [13]:
predict[:10]

array(['7', '2', '1', '0', '4', '1', '8', '9', '6', '9'], dtype=object)

In [14]:
y_test[:10]

array(['7', '2', '1', '0', '4', '1', '4', '9', '5', '9'], dtype=object)