# Digit Recognition Using a K-Nearest Neighbors (KNN) Classifier

In [3]:
# importing all the necessary libraries
import warnings
warnings.filterwarnings('ignore')

import math
import operator

import joblib
import numpy as np
import pandas as pd
from skimage import exposure
from sklearn import datasets
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Reading Original MNIST dataset

In [4]:
# Read the MNIST training pixels and their labels.
# header=None makes pandas treat the first CSV row as data, not column names.
train_data = pd.read_csv('./Dataset/train.csv', header=None)
train_labels = pd.read_csv('./Dataset/train_labels.csv', header=None)

# Sanity check: one shape per line, pixels first, labels second.
print(train_data.shape, train_labels.shape, sep='\n')

(60000, 784)
(60000, 1)


# Splitting the dataset for training, testing and validation

In [3]:
# Hold out 25% of the labelled data for testing; the remaining 75% is for training.
# Copies are taken so train_data / train_labels themselves are left untouched.
X, y = train_data.copy(), train_labels.copy()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Scale the pixel values of both splits by 1/255.
X_train, X_test = X_train / 255.0, X_test / 255.0

# Carve 10% of the training portion off as a validation set.
X_train, X_validate, y_train, y_validate = train_test_split(
    X_train, y_train, test_size=0.10, random_state=84
)

In [4]:
# Report the dimensions of every split, in train / validate / test order.
split_report = (
    ('X_train shape:', X_train),
    ('y_train shape:', y_train),
    ('X_validate shape:', X_validate),
    ('y_validate shape:', y_validate),
    ('X_test shape:', X_test),
    ('y_test shape:', y_test),
)
for caption, split in split_report:
    print(caption, split.shape)

X_train shape: (40500, 784)
y_train shape: (40500, 1)
X_validate shape: (4500, 784)
y_validate shape: (4500, 1)
X_test shape: (15000, 784)
y_test shape: (15000, 1)


# Choosing the hyper-parameter 'k' using the validation set

In [5]:
# Candidate neighbour counts, and the validation accuracy obtained for each one
kVals = range(1, 8, 1)
accuracies = []

# Flatten the single-column label frames to 1-D arrays once, up front, so that
# fit() and score() receive labels in the same form as the metric calls in the
# rest of the notebook (and the conversion is not repeated per iteration).
y_train_flat = y_train.values.ravel()
y_validate_flat = y_validate.values.ravel()

# loop over kVals
for k in kVals:
    # train the classifier with the current value of `k`
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(X_train, y_train_flat)

    # evaluate the model on the validation split and record its accuracy
    score = model.score(X_validate, y_validate_flat)
    print("k=%d, accuracy=%.2f%%" % (k, score * 100))
    accuracies.append(score)

# k with largest accuracy will be chosen for final training.
# np.argmax returns the index of the first maximum, so when several values of
# k tie on accuracy the smallest of them is selected.
i = np.argmax(accuracies)

print("\nk=%d achieved highest accuracy of %.2f%% on validation data" % (kVals[i], accuracies[i] * 100))

k=1, accuracy=96.87%
k=2, accuracy=96.51%
k=3, accuracy=97.02%
k=4, accuracy=96.76%
k=5, accuracy=97.02%
k=6, accuracy=96.62%
k=7, accuracy=96.71%

k=3 achieved highest accuracy of 97.02% on validation data


# Building and Evaluating the model using the best value of k

In [6]:
# Refit a fresh classifier using the k that scored best on the validation split
best_k = kVals[i]
model = KNeighborsClassifier(n_neighbors=best_k)
model.fit(X_train, y_train.values.ravel())

# Classify the held-out test split
predictions = model.predict(X_test)

# Fraction of held-out samples whose predicted label matches the true label
holdout_accuracy = metrics.accuracy_score(
    y_true=y_test.values.ravel(),
    y_pred=predictions,
)
print("Accuracy:", holdout_accuracy, "\n")

Accuracy: 0.9698 



In [7]:
# Per-digit precision / recall / F1 for the held-out test split
holdout_report = classification_report(y_test.values.ravel(), predictions)
print("Evaluating on test data: ", holdout_report, sep="\n")

Evaluating on test data: 
              precision    recall  f1-score   support

           0       0.98      0.99      0.98      1495
           1       0.96      0.99      0.98      1649
           2       0.97      0.96      0.97      1471
           3       0.97      0.96      0.97      1518
           4       0.97      0.97      0.97      1443
           5       0.96      0.96      0.96      1383
           6       0.98      0.99      0.98      1482
           7       0.97      0.98      0.97      1635
           8       0.99      0.93      0.96      1445
           9       0.96      0.96      0.96      1479

    accuracy                           0.97     15000
   macro avg       0.97      0.97      0.97     15000
weighted avg       0.97      0.97      0.97     15000



In [8]:
# Persist the fitted classifier to disk so later cells can reload it.
# Left as a bare expression so the cell still displays the written file list.
joblib.dump(value=model, filename='digit-classifier_knn.joblib')

['digit-classifier_knn.joblib']

# Testing the final model on an unseen dataset

In [9]:
# Read the original MNIST test pixels (header=None: first row is data) and
# scale them by 1/255, the same factor applied to the training pixels.
test_data = pd.read_csv('./Dataset/test.csv', header=None) / 255.0

# Matching labels, one per row of test_data
test_labels = pd.read_csv('./Dataset/test_labels.csv', header=None)

# Sanity check: one shape per line, pixels first, labels second.
print(test_data.shape, test_labels.shape, sep='\n')

(10000, 784)
(10000, 1)


In [10]:
# Restore the classifier from the joblib file written earlier in the notebook
model = joblib.load("digit-classifier_knn.joblib")

# Classify every row of the MNIST test set
predictions = model.predict(test_data)

# Fraction of test rows whose predicted label matches the true label
mnist_test_accuracy = metrics.accuracy_score(
    y_true=test_labels.values.ravel(),
    y_pred=predictions,
)
print("accuracy:", mnist_test_accuracy, "\n")

accuracy: 0.9679 



In [11]:
# Per-digit precision / recall / F1 for the MNIST test set
true_digits = test_labels.values.ravel()
print("Evaluating on test data: ")
print(classification_report(true_digits, predictions))

Evaluating on test data: 
              precision    recall  f1-score   support

           0       0.97      0.99      0.98       980
           1       0.95      1.00      0.98      1135
           2       0.98      0.96      0.97      1032
           3       0.96      0.97      0.96      1010
           4       0.97      0.97      0.97       982
           5       0.96      0.97      0.96       892
           6       0.98      0.98      0.98       958
           7       0.96      0.96      0.96      1028
           8       0.99      0.93      0.96       974
           9       0.96      0.95      0.95      1009

    accuracy                           0.97     10000
   macro avg       0.97      0.97      0.97     10000
weighted avg       0.97      0.97      0.97     10000



# Applying the final model to the Kaggle competition test set

In [12]:
# Read the Kaggle test pixels (default header handling: the first CSV row
# supplies the column names) and scale them by 1/255.
test_data = pd.read_csv('test.csv') / 255.0

print(test_data.shape)

(28000, 784)


In [None]:
# Reload the persisted classifier and label every Kaggle test image
model = joblib.load('digit-classifier_knn.joblib')

# make predictions
y_predict = model.predict(test_data)

# Prepare the two submission columns without hand-written index loops:
# ImageId is a 1-based running index over the predictions, Label is the
# predicted digit for that row.
test_ids = list(range(1, len(y_predict) + 1))
pred_labels = list(y_predict)

df = pd.DataFrame({'ImageId': test_ids, 'Label': pred_labels})

# create submission file (index=False: only the two named columns are written)
df.to_csv('submission_knn.csv', index=False)