In [None]:
# KNN model with PCA

# Import libraries
import numpy as np
import operator 
from operator import itemgetter
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score

# The Euclidean Distance between two points
def euc_dist(x1, x2):
    """Return the Euclidean (L2) distance between two points.

    Both inputs are converted to float64 arrays before they are subtracted,
    so plain Python sequences are accepted and unsigned-integer inputs (for
    example raw uint8 pixel values) cannot wrap around in the subtraction
    or overflow when squared.

    Parameters
    ----------
    x1, x2 : array_like
        Coordinates of the two points; their shapes must be
        broadcast-compatible.

    Returns
    -------
    numpy.float64
        Square root of the sum of the squared coordinate differences.
    """
    diff = np.asarray(x1, dtype=np.float64) - np.asarray(x2, dtype=np.float64)
    return np.sqrt(np.sum(diff * diff))

# KNN class
class KNN:
    """Brute-force k-nearest-neighbours classifier using Euclidean distance.

    Parameters
    ----------
    K : int, default 3
        Number of nearest training samples that vote on each prediction.

    Attributes
    ----------
    X_train : numpy.ndarray, shape (n_samples, n_features)
        Training samples as a floating-point matrix (set by ``fit``).
    Y_train : numpy.ndarray, shape (n_samples,)
        Training labels aligned with the rows of ``X_train`` (set by ``fit``).

    Raises
    ------
    ValueError
        If ``K`` is smaller than 1.
    """

    def __init__(self, K=3):
        if K < 1:
            raise ValueError("K must be at least 1, got " + str(K))
        self.K = K

    @staticmethod
    def _as_samples(data):
        """Return ``data`` as a 2-D floating-point array, one row per sample."""
        arr = np.asarray(data)
        # Integer inputs (e.g. raw uint8 pixels) would wrap around when
        # subtracted, so promote anything that is not already floating point.
        if not np.issubdtype(arr.dtype, np.floating):
            arr = arr.astype(np.float64)
        if arr.size == 0:
            # reshape(n, -1) is ambiguous for empty arrays.
            return arr.reshape(arr.shape[0], 0)
        return arr.reshape(arr.shape[0], -1)

    def fit(self, x_train, y_train):
        """Store the training samples and their labels.

        Parameters
        ----------
        x_train : array_like, shape (n_samples, ...)
            Training samples; each sample is flattened to a feature vector.
        y_train : array_like, shape (n_samples,)
            Label of each training sample.

        Raises
        ------
        ValueError
            If the training set is empty or the number of labels does not
            match the number of samples.
        """
        samples = self._as_samples(x_train)
        labels = np.asarray(y_train)
        if samples.shape[0] == 0:
            raise ValueError("x_train must contain at least one sample")
        if labels.shape[0] != samples.shape[0]:
            raise ValueError(
                "x_train and y_train have different lengths: "
                + str(samples.shape[0]) + " vs " + str(labels.shape[0])
            )
        self.X_train = samples
        self.Y_train = labels

    def predict(self, X_test):
        """Predict a label for every sample in ``X_test`` by majority vote.

        Parameters
        ----------
        X_test : array_like, shape (n_queries, ...)
            Samples to classify; each is flattened to a feature vector.

        Returns
        -------
        list
            One predicted label per query sample. When several labels share
            the highest vote count, the one encountered first among the
            nearest neighbours wins.

        Raises
        ------
        ValueError
            If the query samples do not have the same number of features as
            the training samples.
        """
        queries = self._as_samples(X_test)
        if queries.shape[0] == 0:
            return []
        if queries.shape[1] != self.X_train.shape[1]:
            raise ValueError(
                "X_test has " + str(queries.shape[1])
                + " features per sample, expected " + str(self.X_train.shape[1])
            )

        predictions = []
        for query in queries:
            # Distances to every training sample in one vectorised step.
            # Squared distances are enough: sqrt is monotonic, so the
            # neighbour ranking is the same.
            diffs = self.X_train - query
            np.square(diffs, out=diffs)
            sq_dists = diffs.sum(axis=1)
            # Stable sort keeps equidistant neighbours in training order,
            # which makes the result deterministic.
            nearest = np.argsort(sq_dists, kind="stable")[:self.K]

            votes = {}
            for label in self.Y_train[nearest]:
                votes[label] = votes.get(label, 0) + 1
            # max() returns the first key with the highest count, i.e. the
            # tied label whose first occurrence is closest to the query.
            predictions.append(max(votes, key=votes.get))
        return predictions

In [None]:
# Create the needed format for KNN
# Import libraries
import keras
from keras.datasets import mnist
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Import datasets: flatten every 28x28 image into a 784-element vector and
# divide the pixel values by 255.
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train = x_train.reshape(-1, 28*28)/255.0
x_test = x_test.reshape(-1, 28*28)/255.0

print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

# PCA Transformation
scaler = StandardScaler()
# Fit on training set only.
scaler.fit(x_train)
# Apply transform to both the training set and the test set.
train_img = scaler.transform(x_train)
test_img = scaler.transform(x_test)

# Keep as many principal components as needed to explain 95% of the variance.
pca = PCA(0.95)
pca.fit(train_img)
# Project the *standardised* data: the components were fitted on train_img,
# so transforming the unscaled x_train/x_test would yield features that are
# inconsistent with the fitted PCA.
x_train_pca = pca.transform(train_img)
x_test_pca = pca.transform(test_img)

print("original shape:   ", x_train.shape)
print("transformed shape:", x_train_pca.shape)
print("original shape:   ", x_test.shape)
print("transformed shape:", x_test_pca.shape)

(60000, 784)
(60000,)
(10000, 784)
(10000,)
original shape:    (60000, 784)
transformed shape: (60000, 331)
original shape:    (10000, 784)
transformed shape: (10000, 331)


In [None]:
def main():
    """Evaluate the KNN classifier on the leading slice of the PCA-reduced test set.

    Prints the train/test dataset shapes, then for K = 1 and K = 3 fits a
    fresh KNN on the PCA-transformed training data and prints its accuracy
    on the first 2000 test samples.
    """
    n_eval = 2000

    print(f"X_train dataset shape: {x_train_pca.shape}\t| y_train dataset shape: {y_train.shape}")
    print(f"X_test dataset shape: {x_test_pca.shape}\t| y_test dataset shape: {y_test.shape}")

    # The evaluation subset is the same for every K, so slice it once.
    eval_features = x_test_pca[:n_eval]
    eval_labels = y_test[:n_eval]
    subset_report = (
        f"New X_test dataset shape: {eval_features.shape}"
        f"\t| New y_test dataset shape: {eval_labels.shape}"
    )

    for n_neighbors in (1, 3):
        classifier = KNN(K=n_neighbors)
        classifier.fit(x_train_pca, y_train)
        print("\nChanged test dataset shape")
        print(subset_report)
        predicted = classifier.predict(eval_features)
        score = accuracy_score(eval_labels, predicted)
        print(f"K = {n_neighbors} | Accuracy: {score!s}")

# Entry point: run the evaluation only when this code executes as the main
# module, not when it is imported from elsewhere.
if __name__ == "__main__":
    main()

X_train dataset shape: (60000, 331)	| y_train dataset shape: (60000,)
X_test dataset shape: (10000, 331)	| y_test dataset shape: (10000,)

Changed test dataset shape
New X_test dataset shape: (2000, 331)	| New y_test dataset shape: (2000,)
K = 1 | Accuracy: 0.9625

Changed test dataset shape
New X_test dataset shape: (2000, 331)	| New y_test dataset shape: (2000,)
K = 3 | Accuracy: 0.964
