In [1]:
from __future__ import print_function
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn import datasets
from skimage import exposure
import numpy as np
import imutils
import cv2
import scipy
import os
import matplotlib.pyplot as plt

In [2]:
# Load scikit-learn's bundled digits dataset. The variable is called `mnist`,
# but this is `load_digits`, not the full MNIST set.
mnist = datasets.load_digits()

In [3]:
# Display the flattened feature vector of the first sample.
mnist.data[0]

array([ 0.,  0.,  5., 13.,  9.,  1.,  0.,  0.,  0.,  0., 13., 15., 10.,
       15.,  5.,  0.,  0.,  3., 15.,  2.,  0., 11.,  8.,  0.,  0.,  4.,
       12.,  0.,  0.,  8.,  8.,  0.,  0.,  5.,  8.,  0.,  0.,  9.,  8.,
        0.,  0.,  4., 11.,  0.,  1., 12.,  7.,  0.,  0.,  2., 14.,  5.,
       10., 12.,  0.,  0.,  0.,  0.,  6., 13., 10.,  0.,  0.,  0.])

In [4]:
def output_png(image, filename):
    """Save an 8x8 digit as a 256x256 PNG under ``sample-images/``.

    The previous implementation relied on ``scipy.misc.imsave``, which no
    longer exists in current SciPy releases, and wrote the file twice with a
    disk round-trip in between. This version does the same scaling and
    resizing in memory and writes the file once with OpenCV.

    Parameters
    ----------
    image : array-like
        64 pixel intensities (any shape that reshapes to 8x8).
    filename : str
        Output name without the ``.png`` extension.

    Raises
    ------
    ValueError
        If ``image`` cannot be reshaped to 8x8 (raised by NumPy).
    IOError
        If OpenCV reports that the file could not be written.
    """
    out_dir = "sample-images"
    # cv2.imwrite does not create missing directories, so make sure it exists.
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    out = os.path.join(out_dir, filename + ".png")

    pixels = np.asarray(image, dtype=np.float64).reshape(8, 8)

    # Stretch intensities to the full 0-255 range (imsave used to do this
    # implicitly); a constant image maps to all zeros.
    lowest = pixels.min()
    span = pixels.max() - lowest
    if span > 0:
        scaled = (pixels - lowest) * (255.0 / span)
    else:
        scaled = np.zeros_like(pixels)
    gray = (np.clip(scaled, 0, 255) + 0.5).astype(np.uint8)

    # Upscale for viewing; cv2.resize defaults to bilinear interpolation.
    res = cv2.resize(gray, dsize=(256, 256))

    # NOTE: the PNG is written as single-channel grayscale.
    if not cv2.imwrite(out, res):
        raise IOError("could not write image to {}".format(out))

In [5]:
def plot_error(kVal, accuracies):
    """Plot validation accuracy against the number of neighbours ``k``.

    Despite the function name, the values are plotted exactly as passed
    (accuracies), so the y-axis is labelled as accuracy.

    Parameters
    ----------
    kVal : iterable of int
        The ``k`` values that were evaluated (x-axis).
    accuracies : sequence of float
        One validation score per entry of ``kVal`` (y-axis).
    """
    fig, ax = plt.subplots(figsize=(13, 7))
    # Use the argument, not the notebook-level `kVals` global.
    ax.plot(list(kVal), accuracies, 'bo--')
    ax.set_xlabel('K', fontsize=14)
    ax.set_ylabel('Validation Accuracy', fontsize=14)
    # Render the figure; the old plt.clf() erased it before it was ever shown.
    plt.show()
    # Release the figure so repeated calls in a loop do not accumulate.
    plt.close(fig)

In [6]:
# Export the first digit image as sample-images/test.png.
output_png(mnist['images'][0], 'test')

In [39]:
# Load scikit-learn's bundled digits dataset (kept under the name `mnist`
# because the rest of the notebook refers to it that way).
mnist = datasets.load_digits()

# Fixed seed so the splits, and therefore every score below, are reproducible
# across kernel restarts.
SEED = 42

# Hold out 25% of the samples for the final test evaluation.
(trainData, testData, trainLabels, testLabels) = train_test_split(
    np.array(mnist.data), mnist.target, test_size=0.25, random_state=SEED)

# Carve 30% of the remaining training samples off as a validation set
# used to choose k.
(trainData, valData, trainLabels, valLabels) = train_test_split(
    trainData, trainLabels, test_size=0.3, random_state=SEED)

# Report the size of each split.
print("training data points: {}".format(len(trainLabels)))
print("validation data points: {}".format(len(valLabels)))
print("testing data points: {}".format(len(testLabels)))


# Hyper-parameter grids used by the sweeps in this notebook:
# odd k from 1 to 29, the neighbour-search algorithms, and the distance metrics.
kVals = range(1, 30, 2)
algorithms = ['auto', 'ball_tree', 'kd_tree', 'brute']
metrics = ['euclidean','manhattan','chebyshev','minkowski']
    
# Score a default k-NN classifier on the validation split for every candidate
# k. This is computed here so the cell does not depend on an `accuracies` list
# left behind by some other cell (which fails on a fresh kernel).
accuracies = []
for k in kVals:
    candidate = KNeighborsClassifier(n_neighbors=k)
    candidate.fit(trainData, trainLabels)
    accuracies.append(candidate.score(valData, valLabels))

# np.argmax returns the index of the (first) largest validation accuracy.
i = int(np.argmax(accuracies))
print("k=%d achieved highest accuracy of %.2f%% on validation data" % (kVals[i],
    accuracies[i] * 100))

# Now that the best value of k is known, re-train the classifier with it.
model = KNeighborsClassifier(n_neighbors=kVals[i])
model.fit(trainData, trainLabels)

# Predict labels for the held-out test set.
predictions = model.predict(testData)

# Per-digit precision / recall / F1 on the test set.
print("EVALUATION ON TESTING DATA")
print(classification_report(testLabels, predictions))

# In the recorded run several digit classes reach precision 1.00 and the
# overall test accuracy is about 98%.

training data points: 942
validation data points: 405
testing data points: 450
k=5 achieved highest accuracy of 98.27% on validation data
EVALUATION ON TESTING DATA
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        37
           1       0.96      1.00      0.98        48
           2       1.00      0.98      0.99        43
           3       0.97      1.00      0.99        39
           4       1.00      0.93      0.96        42
           5       0.98      0.98      0.98        57
           6       0.97      1.00      0.99        37
           7       0.92      1.00      0.96        44
           8       1.00      0.94      0.97        47
           9       0.96      0.95      0.95        56

    accuracy                           0.98       450
   macro avg       0.98      0.98      0.98       450
weighted avg       0.98      0.98      0.98       450



In [38]:
# For each neighbour-search algorithm, sweep k, print the validation accuracy
# and plot accuracy against k.
for a in algorithms:
    accuracies = []
    for k in kVals:
        # Fit under a sweep-local name so the notebook-level `model`
        # (the classifier chosen for final evaluation) is not overwritten.
        candidate = KNeighborsClassifier(n_neighbors=k, algorithm=a)
        candidate.fit(trainData, trainLabels)

        # Score on the validation split and record the result.
        score = candidate.score(valData, valLabels)
        print("k=%d, alg=%s, accuracy=%.2f%%" % (k, a, score * 100))
        accuracies.append(score)
    plot_error(kVals, accuracies)

k=1, alg=auto, accuracy=98.02%
k=3, alg=auto, accuracy=98.02%
k=5, alg=auto, accuracy=98.27%
k=7, alg=auto, accuracy=97.53%
k=9, alg=auto, accuracy=97.28%
k=11, alg=auto, accuracy=97.04%
k=13, alg=auto, accuracy=96.54%
k=15, alg=auto, accuracy=96.54%
k=17, alg=auto, accuracy=96.54%
k=19, alg=auto, accuracy=96.54%
k=21, alg=auto, accuracy=96.54%
k=23, alg=auto, accuracy=96.05%
k=25, alg=auto, accuracy=96.05%
k=27, alg=auto, accuracy=95.80%
k=29, alg=auto, accuracy=95.80%
k=1, alg=ball_tree, accuracy=98.02%
k=3, alg=ball_tree, accuracy=98.02%
k=5, alg=ball_tree, accuracy=98.27%
k=7, alg=ball_tree, accuracy=97.28%
k=9, alg=ball_tree, accuracy=97.04%
k=11, alg=ball_tree, accuracy=97.04%
k=13, alg=ball_tree, accuracy=96.54%
k=15, alg=ball_tree, accuracy=96.79%
k=17, alg=ball_tree, accuracy=96.54%
k=19, alg=ball_tree, accuracy=96.54%
k=21, alg=ball_tree, accuracy=96.54%
k=23, alg=ball_tree, accuracy=96.05%
k=25, alg=ball_tree, accuracy=96.05%
k=27, alg=ball_tree, accuracy=95.80%
k=29, alg=ba

<matplotlib.figure.Figure at 0x9872f90>

In [37]:
# For each distance metric, sweep k, print the validation accuracy and plot
# accuracy against k. The search algorithm is left at its default ('auto').
# NOTE(review): the recorded output of this cell ends in a TypeError part-way
# through the sweep; the failing metric is not visible in the truncated log.
# Confirm that all four metric names are accepted by the installed scikit-learn.
for m in metrics:
    accuracies = []
    for k in kVals:
        # Fit under a sweep-local name so the notebook-level `model`
        # (the classifier chosen for final evaluation) is not overwritten.
        candidate = KNeighborsClassifier(n_neighbors=k, metric=m)
        candidate.fit(trainData, trainLabels)

        # Score on the validation split and record the result.
        score = candidate.score(valData, valLabels)
        print("k=%d, alg=auto, metric=%s, accuracy=%.2f%%" % (k, m, score * 100))
        accuracies.append(score)
    plot_error(kVals, accuracies)

k=1, alg=auto, metric=euclidean, accuracy=98.02%
k=3, alg=auto, metric=euclidean, accuracy=98.02%
k=5, alg=auto, metric=euclidean, accuracy=98.27%
k=7, alg=auto, metric=euclidean, accuracy=97.53%
k=9, alg=auto, metric=euclidean, accuracy=97.28%
k=11, alg=auto, metric=euclidean, accuracy=97.04%
k=13, alg=auto, metric=euclidean, accuracy=96.54%
k=15, alg=auto, metric=euclidean, accuracy=96.54%
k=17, alg=auto, metric=euclidean, accuracy=96.54%
k=19, alg=auto, metric=euclidean, accuracy=96.54%
k=21, alg=auto, metric=euclidean, accuracy=96.54%
k=23, alg=auto, metric=euclidean, accuracy=96.05%
k=25, alg=auto, metric=euclidean, accuracy=96.05%
k=27, alg=auto, metric=euclidean, accuracy=95.80%
k=29, alg=auto, metric=euclidean, accuracy=95.80%
k=1, alg=auto, metric=manhattan, accuracy=98.02%
k=3, alg=auto, metric=manhattan, accuracy=98.27%
k=5, alg=auto, metric=manhattan, accuracy=97.04%
k=7, alg=auto, metric=manhattan, accuracy=96.54%
k=9, alg=auto, metric=manhattan, accuracy=96.54%
k=11, alg=

TypeError: __init__() takes exactly 2 positional arguments (1 given)

In [19]:
# Spot-check the classifier: predict a few randomly chosen test digits and
# show each image next to its predicted label.
indexes = np.random.randint(0, high=len(testData), size=(5,))
images = testData[indexes]
prediction = model.predict(images)

# One panel per sampled digit. The images are drawn inline with matplotlib:
# cv2.imshow opens a native GUI window and paints nothing without a
# cv2.waitKey() call, so it does not work as notebook output.
fig, axes = plt.subplots(1, len(indexes), figsize=(2 * len(indexes), 2.5))
for ax, pixels, label in zip(np.atleast_1d(axes), images, prediction):
    # Convert the 64-dim vector to an 8 x 8 uint8 image, stretch its contrast
    # to 0-255, then upscale to 32 pixels wide for better visualization.
    image = pixels.reshape((8, 8)).astype("uint8")
    image = exposure.rescale_intensity(image, out_range=(0, 255))
    image = imutils.resize(image, width=32, inter=cv2.INTER_CUBIC)

    # Show the prediction.
    print("I think that digit is: {}".format(label))
    ax.imshow(image, cmap="gray")
    ax.set_title("Predicted: {}".format(label))
    ax.axis("off")
plt.show()

I think that digit is: 8
I think that digit is: 0
I think that digit is: 0
I think that digit is: 0
I think that digit is: 1
