# Importing Libraries

In [1]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.datasets import fetch_openml
import pandas as pd

# Download MNIST as NumPy Arrays

In [2]:
# as_frame=False requests plain arrays rather than a pandas DataFrame.
mnist = fetch_openml(
    name='mnist_784',
    version=1,
    as_frame=False,
)
# List the fields available on the returned bunch.
mnist.keys()

dict_keys(['data', 'target', 'frame', 'categories', 'feature_names', 'target_names', 'DESCR', 'details', 'url'])

# Evaluate the Data

In [3]:
# Pull the feature matrix and the label vector out of the fetched bunch.
X = mnist["data"]
y = mnist["target"]
# Rows are samples, columns are features.
X.shape

(70000, 784)

In [4]:
# Labels are a 1-D vector; its length should match the number of rows in X.
y.shape

(70000,)

In [5]:
# Confirm the container type of the features (fetched with as_frame=False).
type(X)

numpy.ndarray

In [6]:
# Confirm the container type of the labels (fetched with as_frame=False).
type(y)

numpy.ndarray

In [7]:
# Peek at the first five rows of the feature matrix (all columns).
X[:5]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

# Training a KNeighborsClassifier with over 97% accuracy 
The MNIST dataset is already split into a training set (the first 60,000 images) and a test set (the last 10,000 images). I will use GridSearchCV to perform hyperparameter tuning in order to determine the optimal hyperparameter values (n_neighbors and weights) for the KNeighborsClassifier, a supervised learning classifier that uses proximity to make classifications or predictions about the grouping of an individual data point.

In [8]:
# The first 60,000 rows form the training set; the remaining rows are the test set.
train_size = 60000
X_train, y_train = X[:train_size], y[:train_size]
X_test, y_test = X[train_size:], y[train_size:]

In [9]:
# First, coarse search grid: 3 neighbour counts x 2 weighting schemes = 6 candidates.
grid_params = dict(
    n_neighbors=[3, 5, 7],
    weights=['uniform', 'distance'],
)

In [10]:
# Exhaustive search over grid_params with 3-fold cross-validation.
gs = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=grid_params,
    cv=3,
    verbose=3,   # print per-fit progress while searching
    n_jobs=-1,   # run the fits in parallel
)
# Keep the object returned by fit() for inspecting the results below.
gs_results = gs.fit(X_train, y_train)
gs_results

Fitting 3 folds for each of 6 candidates, totalling 18 fits


GridSearchCV(cv=3, estimator=KNeighborsClassifier(), n_jobs=-1,
             param_grid={'n_neighbors': [3, 5, 7],
                         'weights': ['uniform', 'distance']},
             verbose=3)

# Evaluating scores, estimators, and parameters.

In [11]:
# Best cross-validated score found by the first search.
gs_results.best_score_

0.9693333333333333

In [12]:
# The classifier configured with the winning hyperparameters from the first search.
gs_results.best_estimator_

KNeighborsClassifier(n_neighbors=3, weights='distance')

In [13]:
# The winning hyperparameter combination from the first search, as a dict.
gs_results.best_params_

{'n_neighbors': 3, 'weights': 'distance'}

My first run gave me 96.9% accuracy! The best parameters were n_neighbors=3 and weights='distance'. Therefore, for the next training session, I will keep the weights parameter fixed at 'distance' and change the n_neighbors candidates from 3, 5, 7 to 2, 3, 4. My understanding is that I should stay away from even values of n_neighbors, because a tied vote has to be broken arbitrarily. However, I believe this is mainly a concern when weights is set to 'uniform' rather than 'distance'. Therefore, it should not be a concern here, and searching only weights='distance' will make the next training session more efficient.

In [14]:
# Second, narrower grid: neighbour counts around 3, with weights fixed to 'distance'.
grid_params = dict(
    n_neighbors=[2, 3, 4],
    weights=['distance'],
)

In [15]:
# Re-run the search on the narrowed grid; this rebinds gs and gs_results.
gs = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=grid_params,
    cv=3,
    verbose=3,   # print per-fit progress while searching
    n_jobs=-1,   # run the fits in parallel
)
gs_results = gs.fit(X_train, y_train)
gs_results

Fitting 3 folds for each of 3 candidates, totalling 9 fits


GridSearchCV(cv=3, estimator=KNeighborsClassifier(), n_jobs=-1,
             param_grid={'n_neighbors': [2, 3, 4], 'weights': ['distance']},
             verbose=3)

In [16]:
# Best cross-validated score found by the second (narrowed) search.
gs_results.best_score_

0.9703500000000002

In [17]:
# The classifier configured with the winning hyperparameters from the second search.
gs_results.best_estimator_

KNeighborsClassifier(n_neighbors=4, weights='distance')

In [18]:
# The winning hyperparameter combination from the second search, as a dict.
gs_results.best_params_

{'n_neighbors': 4, 'weights': 'distance'}

On the second run, I was able to achieve a score of 97.04%! I do not think I can improve upon this score. Therefore, I will now use this algorithm on the test set.

In [19]:
# Predict the held-out test labels with the search object fitted in the second run.
y_prediction = gs.predict(X_test)
# Fraction of test samples whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_prediction)

0.9714

My accuracy on the testing data is 97.14%!