# MNIST Baseline

In this notebook, we create a baseline model to predict labels on the MNIST data set.

## Import packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, classification_report
from random import shuffle
import random
# Seed Python's built-in `random` module so any use of it is repeatable.
# NOTE(review): this does not seed NumPy; the train/test split further down
# pins its own `random_state` instead.
random.seed(123)

## Define constants

In [2]:
# (height, width) of one image in pixels; 28 * 28 = 784 matches the 784
# non-label columns of the training frame.
IMAGE_SIZE = (28, 28)

## Load data

We load both the train set and test set as Pandas data frames.

In [3]:
# Read the labelled training set and the unlabelled test set.
train_data = pd.read_csv("../data/train.csv")
test_data = pd.read_csv("../data/test.csv")

# Separate the target column from the remaining (pixel) columns.
train_label = train_data['label']
train_features = train_data.drop(columns=['label'])

## Data normalization

In [4]:
# Rescale pixel intensities to the 0-1 range by dividing by 255, the largest
# value an 8-bit pixel can take.
# NOTE: re-running this cell divides the already-scaled frames again; re-run
# the load cell first.
train_features = train_features.div(255.0)
test_data = test_data.div(255.0)

## Fit KNN model

In [5]:
# Baseline: 1-nearest-neighbour classifier trained on every labelled sample.
clf = KNeighborsClassifier(n_neighbors=1)
clf.fit(X=train_features, y=train_label)

KNeighborsClassifier(n_neighbors=1)

## Predict test labels with the baseline KNN model

In [6]:
# Label every row of the (scaled) unlabelled test set with the baseline model.
test_label_prediction = clf.predict(X=test_data)

In [7]:
# One predicted label per test row; shift the index so it starts at 1
# instead of 0.
submission = pd.DataFrame(data={'Label': test_label_prediction})
submission.index = submission.index + 1

In [8]:
# Preview the submission frame (pandas truncates the rendering to the first
# and last rows).
submission

Unnamed: 0,Label
1,2
2,0
3,9
4,0
5,3
...,...
27996,9
27997,7
27998,3
27999,9


In [9]:
# Persist the predictions; the 1-based index is written as the 'ImageId' column.
submission.to_csv(
    path_or_buf='submission.csv',
    index_label='ImageId',
    index=True,
)

## Split the training data into two subsets and normalize the features of the samples

In [10]:
# (rows, columns) of the labelled frame, label column included.
train_data.shape

(42000, 785)

In [11]:
# Enumerate candidate hold-out fractions: for every training-subset size from
# 30,000 up to the full sample count, in steps of `folds`, compute the fraction
# of samples that would be left over for testing.
folds = 100
possible_test_sizes = [
    round(1 - n_train / train_data.shape[0], ndigits=4)
    # Stop at the real sample count (inclusive). The previous hard-coded bound
    # of 420000 overshot it tenfold and produced thousands of negative,
    # meaningless fractions.
    for n_train in range(30000, train_data.shape[0] + 1, folds)
]

# Keep only non-negative fractions (a safeguard; with the bound above every
# candidate already qualifies).
test_size = [size for size in possible_test_sizes if size >= 0]
print(test_size)

# Chose 0.2857 as the test size (30,000 training samples, the rest held out)
print('\n\033[1mI choose 0.2857 as the test size')

[0.2857, 0.2833, 0.281, 0.2786, 0.2762, 0.2738, 0.2714, 0.269, 0.2667, 0.2643, 0.2619, 0.2595, 0.2571, 0.2548, 0.2524, 0.25, 0.2476, 0.2452, 0.2429, 0.2405, 0.2381, 0.2357, 0.2333, 0.231, 0.2286, 0.2262, 0.2238, 0.2214, 0.219, 0.2167, 0.2143, 0.2119, 0.2095, 0.2071, 0.2048, 0.2024, 0.2, 0.1976, 0.1952, 0.1929, 0.1905, 0.1881, 0.1857, 0.1833, 0.181, 0.1786, 0.1762, 0.1738, 0.1714, 0.169, 0.1667, 0.1643, 0.1619, 0.1595, 0.1571, 0.1548, 0.1524, 0.15, 0.1476, 0.1452, 0.1429, 0.1405, 0.1381, 0.1357, 0.1333, 0.131, 0.1286, 0.1262, 0.1238, 0.1214, 0.119, 0.1167, 0.1143, 0.1119, 0.1095, 0.1071, 0.1048, 0.1024, 0.1, 0.0976, 0.0952, 0.0929, 0.0905, 0.0881, 0.0857, 0.0833, 0.081, 0.0786, 0.0762, 0.0738, 0.0714, 0.069, 0.0667, 0.0643, 0.0619, 0.0595, 0.0571, 0.0548, 0.0524, 0.05, 0.0476, 0.0452, 0.0429, 0.0405, 0.0381, 0.0357, 0.0333, 0.031, 0.0286, 0.0262, 0.0238, 0.0214, 0.019, 0.0167, 0.0143, 0.0119, 0.0095, 0.0071, 0.0048, 0.0024, 0.0]

[1mI choose 0.2857 as the test size


In [12]:
# Hold out a fixed 28.57% of the labelled samples for the final evaluation;
# the remainder is used for training and cross-validation.
X_train, X_test, y_train, y_test = train_test_split(
    train_features,
    train_label,
    test_size=0.2857,
    random_state=0,
)
print(f'samples: {train_data.shape[0]}, train_val: {X_train.shape[0]}, test: {X_test.shape[0]}')

samples: 42000, train_val: 30000, test: 12000


In [13]:
# Standardize features: learn the per-column statistics from the training
# split only, then apply the same transformation to both splits.
normalizer = StandardScaler()
normalizer.fit(X_train)
X_train = normalizer.transform(X_train)
X_test = normalizer.transform(X_test)

## Train the KNN model and select the hyperparameter with cross-validation

In [14]:
# Candidate neighbour counts (1 through 9) for the grid search.
k_range = range(1, 10)
param_grid = {'n_neighbors': k_range}
print(param_grid)

{'n_neighbors': range(1, 10)}


In [15]:
# Template estimator handed to the grid search below, which tries every
# `n_neighbors` value in `param_grid` rather than only this initial 1.
clf_knn = KNeighborsClassifier(n_neighbors=1)

In [16]:
# 2-fold cross-validated search over `param_grid`, scored by accuracy and
# run on all available cores.
grid = GridSearchCV(
    estimator=clf_knn,
    param_grid=param_grid,
    scoring='accuracy',
    cv=2,
    n_jobs=-1,
).fit(X_train, y_train)

# Best mean cross-validated accuracy, then the setting that achieved it.
for summary in (grid.best_score_, grid.best_params_):
    print(summary)

0.9235333333333333
{'n_neighbors': 3}


## Evaluate model

In [17]:
# Rebuild the classifier with the neighbour count the grid search selected
# and train it on the whole training split.
best_n_neighbors = grid.best_params_['n_neighbors']
clf_knn = KNeighborsClassifier(n_neighbors=best_n_neighbors).fit(X_train, y_train)

# Predict labels for the held-out split.
knn_predict = clf_knn.predict(X_test)

In [18]:
# Report prediction: per-class precision / recall / F1 on the held-out split.
print(classification_report(y_test, knn_predict))

# scikit-learn metrics take (y_true, y_pred). Accuracy happens to be symmetric,
# but keep the documented order so the call stays consistent with the report
# above and remains correct if another metric is substituted.
knn_acc = accuracy_score(y_test, knn_predict)
print("\n\033[31;1;48;5;226mK-Nearest Neighbour Accuracy: \033[1m {:.2f}%".format(knn_acc*100))

              precision    recall  f1-score   support

           0       0.95      0.98      0.96      1171
           1       0.95      0.98      0.97      1371
           2       0.94      0.93      0.94      1228
           3       0.92      0.93      0.92      1229
           4       0.94      0.92      0.93      1171
           5       0.92      0.92      0.92      1071
           6       0.95      0.98      0.96      1177
           7       0.93      0.93      0.93      1284
           8       0.97      0.87      0.92      1143
           9       0.90      0.92      0.91      1155

    accuracy                           0.94     12000
   macro avg       0.94      0.94      0.94     12000
weighted avg       0.94      0.94      0.94     12000


[31;1;48;5;226mK-Nearest Neighbour Accuracy: [1m 93.73%


In [19]:
# Predict the unlabelled test set with the tuned model. `clf_knn` was trained
# on features standardized by `normalizer`, so the test features must go
# through the same fitted scaler first. (The previous code reused the untuned
# baseline `clf` on unstandardized data, reproducing the first submission.)
knn_predict_new = clf_knn.predict(normalizer.transform(test_data))

In [20]:
# One predicted label per test row; shift the index so it starts at 1
# instead of 0.
submission = pd.DataFrame(data={'Label': knn_predict_new})
submission.index = submission.index + 1

In [21]:
# Preview the submission frame (pandas truncates the rendering to the first
# and last rows).
submission

Unnamed: 0,Label
1,2
2,0
3,9
4,0
5,3
...,...
27996,9
27997,7
27998,3
27999,9


In [22]:
# Persist the predictions; the 1-based index is written as the 'ImageId' column.
# NOTE: this writes to the same path as the earlier baseline export and
# therefore overwrites that file.
submission.to_csv(
    path_or_buf='submission.csv',
    index_label='ImageId',
    index=True,
)