### Goal: Reach over 97% accuracy on the MNIST dataset.

### Importing modules.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml

In [None]:
from scipy.ndimage import shift
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

The MNIST dataset contains images, and DataFrames aren't ideal for that, so it's preferable to set as_frame=False to get the data as NumPy arrays instead.

In [None]:
# Fetch the "mnist_784" dataset from OpenML; as_frame=False asks for NumPy
# arrays instead of pandas DataFrames.
mnist = fetch_openml("mnist_784", as_frame=False)

In [None]:
# Pull the feature matrix and the label vector out of the fetched dataset.
X = mnist.data
y = mnist.target

In [None]:
# Show the dimensions of the features and of the labels, one per line.
print(X.shape, y.shape, sep="\n")

Let's take a peek at one digit from the dataset.

In [None]:
def plot_digit(image_data):
    """Draw one flattened digit on the current matplotlib axes.

    The input is reshaped to 28x28, rendered with the "binary" colormap,
    and the axes are hidden. The caller decides when to call plt.show().
    """
    plt.imshow(image_data.reshape(28, 28), cmap="binary")
    plt.axis("off")

# Render one sample and then display the label stored at the same index.
digit_index = 12
some_digit = X[digit_index]
plot_digit(some_digit)
plt.show()
y[digit_index]

### Splitting the dataset.

The MNIST dataset returned by fetch_openml() is already split into a training set (the first 60,000 images) and a test set (the last 10,000 images).
The training set is already shuffled for us, which is good because it guarantees that all cross-validation folds will be similar, so no fold will perform poorly because it contains mostly similar instances (i.e., a skewed fold).

In [None]:
# The first 60,000 samples form the training set; the rest is the test set.
train_size = 60000
X_train, y_train = X[:train_size], y[:train_size]
X_test, y_test = X[train_size:], y[train_size:]

Let's plot the first 100 digits of the dataset to get a better sense of the data we are dealing with.

In [None]:
# Lay the first 100 digits out on a 10x10 grid with no gaps between cells.
plt.figure(figsize=(9, 9))
for position, digit in enumerate(X[:100], start=1):
    plt.subplot(10, 10, position)  # subplot positions are 1-based
    plot_digit(digit)
plt.subplots_adjust(wspace=0, hspace=0)

In [None]:
# Baseline: a k-nearest-neighbours classifier with default hyperparameters,
# fitted on the full training set and scored on the test set.
knn_clf = KNeighborsClassifier().fit(X_train, y_train)
baseline_accuracy = knn_clf.score(X_test, y_test)
baseline_accuracy

I want to tune the hyperparameters. To speed up the search, let's train only on the first 10,000 images:

In [None]:
# Candidate hyperparameters: both weighting schemes crossed with 3-6 neighbours.
param_grid = [
    {
        "weights": ["uniform", "distance"],
        "n_neighbors": [3, 4, 5, 6],
    }
]

# Only the first 10,000 training samples are used for the 5-fold search.
search_subset = slice(None, 10_000)
knn_clf = KNeighborsClassifier()
grid_search = GridSearchCV(knn_clf, param_grid, cv=5)
grid_search.fit(X_train[search_subset], y_train[search_subset])

In [None]:
# Hyperparameter combination selected by the grid search.
grid_search.best_params_

In [None]:
# Cross-validated score of the selected combination (computed on the
# 10,000-image subset the search was fitted on).
grid_search.best_score_

The score dropped since I only trained the model on 10,000 images. So I will take the best model and train it again on the full training set:

In [None]:
# Retrain the winning estimator on the whole training set, then predict the
# test set with it (GridSearchCV.predict delegates to this same estimator).
best_knn = grid_search.best_estimator_
best_knn.fit(X_train, y_train)
y_predict = best_knn.predict(X_test)

In [None]:
# Confusion matrix of the tuned model's test-set predictions:
# rows are the true labels, columns are the predicted labels.
cm = confusion_matrix(y_test, y_predict)
cm

Each row in a confusion matrix represents an actual class, while each column represents a predicted class.

In [None]:
# Precision of the tuned model; average="weighted" averages the per-class
# scores weighted by each class's number of true instances.
precision_score(y_test, y_predict, average="weighted")

In [None]:
# Recall of the tuned model, with the same support-weighted averaging.
recall_score(y_test, y_predict, average="weighted")

In [None]:
# F1 score of the tuned model, with the same support-weighted averaging.
f1_score(y_test, y_predict, average="weighted")

# Data augmentation
Let's see what happens if we augment the data by shifting it one pixel.

In [None]:
def shift_image(image, dx, dy):
    """Translate a flattened 28x28 image and return it flattened again.

    Args:
        image: Array that can be reshaped to 28x28.
        dx: Horizontal offset in pixels (positive moves content right).
        dy: Vertical offset in pixels (positive moves content down).

    Returns:
        1-D array with the shifted pixels; areas uncovered by the shift
        are filled with 0.
    """
    grid = image.reshape(28, 28)
    # scipy expects offsets in axis order, i.e. (rows, columns) = (dy, dx).
    moved = shift(grid, (dy, dx), mode="constant", cval=0)
    return moved.reshape(-1)

In [None]:
# Visual sanity check: one training digit next to copies shifted by 5 pixels.
image = X_train[1000]
shifted_image_down = shift_image(image, 0, 5)
shifted_image_left = shift_image(image, -5, 0)

panels = (
    ("Original", image),
    ("Shifted down", shifted_image_down),
    ("Shifted left", shifted_image_left),
)

plt.figure(figsize=(12, 3))
for column, (title, pixels) in enumerate(panels, start=1):
    plt.subplot(1, 3, column)
    plt.title(title)
    plt.imshow(pixels.reshape(28, 28), interpolation="nearest", cmap="Greys")
plt.show()

It is working. So I will create an augmented training set by shifting every image left, right, up and down one pixel:

In [None]:
# Start from the original samples, then append one shifted copy of the whole
# training set per offset (left, right, down, up by one pixel).
shift_offsets = ((-1, 0), (1, 0), (0, 1), (0, -1))
augmented_images = list(X_train)
augmented_labels = list(y_train)

for dx, dy in shift_offsets:
    augmented_images.extend(shift_image(digit, dx, dy) for digit in X_train)
    # Shifting does not change the digit, so the labels repeat unchanged.
    augmented_labels.extend(y_train)

X_train_augmented = np.asarray(augmented_images)
y_train_augmented = np.asarray(augmented_labels)

If the augmented training set is not shuffled, all shifted images will be grouped together:

In [None]:
# Apply one random permutation to images and labels so they stay aligned.
shuffle_idx = np.random.permutation(len(X_train_augmented))
X_train_augmented, y_train_augmented = (
    X_train_augmented[shuffle_idx],
    y_train_augmented[shuffle_idx],
)

In [None]:
# Fresh, unfitted classifier configured with the hyperparameters chosen by
# the grid search.
knn_clf = KNeighborsClassifier(**grid_search.best_params_)

In [None]:
# Train on the shuffled, augmented training set.
knn_clf.fit(X_train_augmented, y_train_augmented)

In [None]:
# Evaluate the classifier trained on the augmented set. `y_predict` still
# holds the predictions of the earlier, non-augmented model, so new
# predictions are computed here under a separate name.
y_predict_augmented = knn_clf.predict(X_test)

# Print every metric: a notebook cell only displays its last expression, so
# bare expressions for precision and recall would be silently discarded.
print("precision:", precision_score(y_test, y_predict_augmented, average="weighted"))
print("recall:   ", recall_score(y_test, y_predict_augmented, average="weighted"))
print("f1:       ", f1_score(y_test, y_predict_augmented, average="weighted"))

In [None]:
# Test-set score of the classifier trained on the augmented data.
augmented_accuracy = knn_clf.score(X_test, y_test)
augmented_accuracy

In [None]:
# Accuracy of the tuned (non-augmented) model on the test set. `y_predict`
# holds that model's test predictions, so no second prediction pass over the
# test set is needed. Without this line `tuned_accuracy` is undefined.
tuned_accuracy = np.mean(y_predict == y_test)

# Relative change in error rate from tuned to augmented; negative is better.
error_rate_change = (1 - augmented_accuracy) / (1 - tuned_accuracy) - 1
print(f"error_rate_change = {error_rate_change:.0%}")

Nice. Error rate drops quite a bit.