In [1]:
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
import numpy as np
import tensorflow as tf

In [2]:
# Fetch MNIST through the Keras dataset helper (the archive is downloaded on
# first use). The call yields a (train, test) pair, each an (images, labels) tuple.
(x_train, y_train), (x_test, y_test) = (
    tf.keras.datasets.mnist.load_data()
)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz


In [3]:
# Sanity check: display the dimensions of images and labels for both splits.
tuple(arr.shape for arr in (x_train, y_train, x_test, y_test))

((60000, 28, 28), (60000,), (10000, 28, 28), (10000,))

In [5]:
# Flatten each 2-D image into a single feature row so it can be fed to PCA / k-NN.
# The row count is read from the array itself and -1 lets NumPy infer the feature
# width, so the cell no longer hard-codes 60000 / 10000 / 28*28 and keeps working
# if the data is subsampled.
x_train_flat = x_train.reshape(len(x_train), -1)
x_test_flat = x_test.reshape(len(x_test), -1)
x_train_flat.shape, y_train.shape, x_test_flat.shape, y_test.shape

((60000, 784), (60000,), (10000, 784), (10000,))

<br>

In [10]:
# Sweep the number of principal components and report the test accuracy of an
# 11-nearest-neighbour classifier trained on the projected data.
for n in [2, 4, 8, 16, 32]:
    # random_state pins the SVD so repeated runs print the same accuracy
    # (depending on the scikit-learn version, the default solver choice may be
    # the randomized one, which is otherwise non-deterministic).
    pca = PCA(n_components=n, random_state=0)
    # Fit the projection on the training split only, then apply it to the test split.
    x_train_pca = pca.fit_transform(x_train_flat)
    x_test_pca = pca.transform(x_test_flat)

    knn = KNeighborsClassifier(n_neighbors=11)
    knn.fit(x_train_pca, y_train)
    y_pred = knn.predict(x_test_pca)
    # accuracy_score is documented as (y_true, y_pred).
    acc = accuracy_score(y_test, y_pred)
    print("PCA n=%d, accuracy is %.9f" % (n, acc))

PCA n=2, accuracy is 0.440800000
PCA n=4, accuracy is 0.648600000
PCA n=8, accuracy is 0.901500000
PCA n=16, accuracy is 0.961800000
PCA n=32, accuracy is 0.973200000


<br>

In [11]:
# Same component sweep as the previous experiment, but each principal component is
# standardised (zero mean, unit variance) before the 11-NN classifier is trained.
for n in [2, 4, 8, 16, 32]:
    # random_state pins the SVD so repeated runs print the same accuracy
    # (depending on the scikit-learn version, the default solver choice may be
    # the randomized one, which is otherwise non-deterministic).
    pca = PCA(n_components=n, random_state=0)
    x_train_pca = pca.fit_transform(x_train_flat)
    x_test_pca = pca.transform(x_test_flat)

    # The scaler statistics come from the training projection only and are then
    # reused for the test projection.
    scaler = StandardScaler()
    x_train_scaled = scaler.fit_transform(x_train_pca)
    x_test_scaled = scaler.transform(x_test_pca)

    knn = KNeighborsClassifier(n_neighbors=11)
    knn.fit(x_train_scaled, y_train)
    y_pred = knn.predict(x_test_scaled)
    # accuracy_score is documented as (y_true, y_pred).
    acc = accuracy_score(y_test, y_pred)
    print("PCA n=%d, accuracy is %.9f" % (n, acc))

PCA n=2, accuracy is 0.444700000
PCA n=4, accuracy is 0.645500000
PCA n=8, accuracy is 0.901400000
PCA n=16, accuracy is 0.959600000
PCA n=32, accuracy is 0.969400000


<br>

In [12]:
# Fix the projection at 32 components and sweep the neighbour count k of the classifier.
# The component count is a local name here: the original print relied on `n`
# leaking out of an earlier cell's for-loop, which breaks (or silently reports
# the wrong value) if that cell is skipped or its list is edited.
n_components = 32
# random_state pins the SVD so repeated runs print the same accuracies.
pca = PCA(n_components=n_components, random_state=0)
x_train_pca = pca.fit_transform(x_train_flat)
x_test_pca = pca.transform(x_test_flat)

for k in range(2, 12):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x_train_pca, y_train)
    y_pred = knn.predict(x_test_pca)
    # accuracy_score is documented as (y_true, y_pred).
    acc = accuracy_score(y_test, y_pred)
    print("PCA n=%d, k=%d, accuracy is %.9f" % (n_components, k, acc))

PCA n=32, k=2, accuracy is 0.970600000
PCA n=32, k=3, accuracy is 0.973900000
PCA n=32, k=4, accuracy is 0.974900000
PCA n=32, k=5, accuracy is 0.975600000
PCA n=32, k=6, accuracy is 0.974500000
PCA n=32, k=7, accuracy is 0.974500000
PCA n=32, k=8, accuracy is 0.975100000
PCA n=32, k=9, accuracy is 0.974600000
PCA n=32, k=10, accuracy is 0.974300000
PCA n=32, k=11, accuracy is 0.973400000


<br>

In [13]:
# Final configuration: 32 principal components and a 5-nearest-neighbour classifier.
# x_train_pca / x_test_pca produced here are the projections the export cell
# further down combines with the labels.
n_components = 32
n_neighbors = 5

# random_state pins the SVD so the projection (and therefore the exported
# features and the reported accuracy) is the same on every run.
pca = PCA(n_components=n_components, random_state=0)
x_train_pca = pca.fit_transform(x_train_flat)
x_test_pca = pca.transform(x_test_flat)

knn = KNeighborsClassifier(n_neighbors=n_neighbors)
knn.fit(x_train_pca, y_train)
y_pred = knn.predict(x_test_pca)
# accuracy_score is documented as (y_true, y_pred).
acc = accuracy_score(y_test, y_pred)
# The reported n / k come from the variables actually used, not a hand-typed string.
print("PCA n=%d, k=%d, accuracy is %.9f" % (n_components, n_neighbors, acc))

PCA n=32, k=5, accuracy is 0.975700000


<br>

In [19]:
# Build one table per split: the PCA features followed by the class label as the
# last column. Naming the label column avoids the duplicate integer column label 0
# that concatenating two default-labelled frames produces (the feature frame and
# the label frame would both own a column called 0).
data_train = pd.DataFrame(x_train_pca).assign(label=y_train)
data_test = pd.DataFrame(x_test_pca).assign(label=y_test)
data_train.shape, data_test.shape

((60000, 33), (10000, 33))

In [20]:
# Persist both tables as bare numeric CSV: no header row and no index column.
# `header` and `index` are documented as booleans; False states the intent
# explicitly instead of relying on None being treated as falsy.
data_train.to_csv('MNIST_train_60k.csv', header=False, index=False)
data_test.to_csv('MNIST_test_10k.csv', header=False, index=False)

<br>
<br>
<br>

In [24]:
# Reload the exported CSVs (they have no header row) and split each table into
# features and labels. The label is taken positionally as the last column, so the
# split does not hard-code the number of PCA components used for the export.
data_train = pd.read_csv('MNIST_train_60k.csv', header=None)
data_test = pd.read_csv('MNIST_test_10k.csv', header=None)
X_train, Y_train = data_train.iloc[:, :-1], data_train.iloc[:, -1]
X_test, Y_test = data_test.iloc[:, :-1], data_test.iloc[:, -1]
X_train.shape, Y_train.shape, X_test.shape, Y_test.shape

((60000, 32), (60000,), (10000, 32), (10000,))

In [26]:
# Round-trip check: train and evaluate the 5-NN classifier on the features reloaded
# from CSV; the accuracy should match the one obtained on the in-memory projection.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, Y_train)
y_pred = knn.predict(X_test)
# accuracy_score is documented as (y_true, y_pred).
acc = accuracy_score(Y_test, y_pred)
print("k=5, accuracy is %.9f" % (acc))

k=5, accuracy is 0.975700000
