In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report


In [2]:
# Read the labelled training set and the unlabelled submission set from disk.
train, test = (
    pd.read_csv("./input/train.csv"),
    pd.read_csv("./input/test.csv"),
)

# Column 0 of `train` is used as the target; every remaining column is a feature.
y_train_d = train.iloc[:, 0]
X_train_d = train.iloc[:, 1:]


In [20]:
# Two-stage split with a fixed seed so the partitions are reproducible:
#   1. hold out 30% of the labelled data as the test set;
#   2. carve 10% of the remaining rows off as the validation set used to pick k.
X_tr, X_test, y_tr, y_test = train_test_split(
    X_train_d, y_train_d, test_size=0.3, random_state=84
)
X_train, X_val, y_train, y_val = train_test_split(
    X_tr, y_tr, test_size=0.1, random_state=84
)

In [4]:
# Candidate neighbour counts for the k-Nearest Neighbor sweep: the odd
# numbers 1, 3, ..., 29.
kVals = range(1, 30, 2)

# Container for the accuracy recorded for each value of k.
accuracies = list()

In [5]:
# Sweep the candidate values of `k` for the k-Nearest Neighbor classifier.
#
# The list of scores is rebuilt here on every run so that re-executing this
# cell (e.g. after re-running the split) never stacks stale scores on top of
# new ones, and the loop iterates `kVals` itself so that accuracies[j] always
# corresponds to kVals[j] when the best k is looked up afterwards.
accuracies = []

for k in kVals:
    # train the k-Nearest Neighbor classifier with the current value of `k`
    knn_clf = KNeighborsClassifier(n_neighbors=k)
    knn_clf.fit(X_train, y_train)

    # score on the validation split and record the result for this `k`
    score = knn_clf.score(X_val, y_val)
    print("k=%d, accuracy=%.2f%%" % (k, score * 100))
    accuracies.append(score)

k=1, accuracy=97.10%
k=3, accuracy=97.19%
k=5, accuracy=97.10%
k=7, accuracy=96.81%
k=9, accuracy=96.74%
k=11, accuracy=96.67%
k=13, accuracy=96.38%
k=15, accuracy=96.24%
k=17, accuracy=96.14%
k=19, accuracy=96.10%
k=21, accuracy=95.93%
k=23, accuracy=95.74%
k=25, accuracy=95.64%
k=27, accuracy=95.57%
k=29, accuracy=95.38%


In [6]:
# Locate the position of the best validation score; the same position in
# kVals gives the corresponding number of neighbours.

i = np.argmax(accuracies)
print(
    "k=%d achieved highest accuracy of %.2f%% on validation data"
    % (kVals[i], accuracies[i] * 100)
)

k=3 achieved highest accuracy of 97.19% on validation data


In [21]:
# Fit a fresh classifier on the training split with the selected k, then
# predict labels for the held-out test split.
# (`fit` returns the estimator itself, so construction and fitting are chained.)

knn_clf_bestK = KNeighborsClassifier(n_neighbors=kVals[i]).fit(X_train, y_train)
y_pred = knn_clf_bestK.predict(X_test)

# Per-digit precision / recall / f1 on the held-out test split.

print("EVALUATION ON TESTING DATA")
print(classification_report(y_test, y_pred))

# Counts of true label (rows) versus predicted label (columns).
print("Confusion matrix")
print(confusion_matrix(y_test, y_pred))

EVALUATION ON TESTING DATA
              precision    recall  f1-score   support

           0       0.97      1.00      0.98      1224
           1       0.95      1.00      0.97      1411
           2       0.98      0.95      0.97      1262
           3       0.96      0.96      0.96      1314
           4       0.96      0.96      0.96      1184
           5       0.96      0.96      0.96      1147
           6       0.97      0.99      0.98      1202
           7       0.95      0.97      0.96      1351
           8       0.99      0.91      0.95      1245
           9       0.95      0.95      0.95      1260

   micro avg       0.96      0.96      0.96     12600
   macro avg       0.97      0.96      0.96     12600
weighted avg       0.97      0.96      0.96     12600

Confusion matrix
[[1219    0    0    1    0    0    4    0    0    0]
 [   0 1404    0    1    1    0    2    1    0    2]
 [   9   16 1200    3    0    1    1   25    3    4]
 [   1    4    6 1266    2   13    1  

In [34]:
# Predict a label for every row of the submission set loaded from ./input/test.csv.
# NOTE(review): assumes `test` has exactly the feature columns the model was fit on — confirm.
subm_pred = knn_clf_bestK.predict(test)


In [33]:
# Pair each prediction with a 1-based ImageId, in prediction order.
submissions = pd.DataFrame(
    {
        "ImageId": list(range(1, len(subm_pred) + 1)),
        "Label": subm_pred,
    }
)

# Write the submission file: header row included, no index column.
submissions.to_csv("submission.csv", index=False, header=True)


In [32]:
# Display the predicted labels as a quick sanity check (NumPy abbreviates the repr of long arrays).
subm_pred


array([2, 0, 9, ..., 3, 9, 2], dtype=int64)