In [None]:
import numpy as np
from keras.datasets import mnist
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.mixture import GaussianMixture
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Load the MNIST train/test splits bundled with keras and report the shape of
# each array so the image layout is visible before any flattening.
(train_X, train_y), (test_X, test_y) = mnist.load_data()
print(f'X_train: {train_X.shape}')
print(f'Y_train: {train_y.shape}')
print(f'X_test: {test_X.shape}')
print(f'Y_test: {test_y.shape}')


X_train: (60000, 28, 28)
Y_train: (60000,)
X_test: (10000, 28, 28)
Y_test: (10000,)


# Random Forest Classifier

In [None]:
# Baseline 1: a depth-limited random forest trained on the raw pixels, with
# every image flattened into a single 784-value row.
clf = RandomForestClassifier(random_state=0, max_depth=10).fit(
    train_X.reshape(-1, 784), train_y
)
pred_y = clf.predict(test_X.reshape(-1, 784))
# Fraction of test images whose predicted label equals the true label.
acc = accuracy_score(y_true=test_y, y_pred=pred_y)
print(f'Accuracy: {acc}')

Accuracy: 0.9469


# kNN Classifier

In [None]:
# Baseline 2: k-nearest-neighbours on the same flattened 784-value rows.
k = 3  # number of neighbours consulted for each prediction
neigh = KNeighborsClassifier(n_neighbors=k).fit(
    train_X.reshape(-1, 784), train_y
)
pred_y = neigh.predict(test_X.reshape(-1, 784))
# Fraction of test images whose predicted label equals the true label.
acc = accuracy_score(y_true=test_y, y_pred=pred_y)
print(f'Accuracy: {acc}')

Accuracy: 0.9705


# EM Clustering

Clustering on all 784 pixel features takes a long time, so we first use PCA to project the images onto 10 principal components and then cluster in that reduced space.

In [None]:
# Project the flattened images onto 10 principal components so the mixture
# model below works on 10 columns instead of 784.
# random_state is pinned like the other estimators in this notebook: depending
# on the scikit-learn version, the default svd_solver='auto' can select the
# randomized solver, which would make the projection differ between runs.
pca = PCA(n_components=10, random_state=0)

# Flatten each split once and reuse it for both fitting and transforming.
train_flat = train_X.reshape((-1, 784))
test_flat = test_X.reshape((-1, 784))

# The components are learned from the training images only; the test images
# are mapped through that same projection.
pca.fit(train_flat)
train_x_transformed = pca.transform(train_flat)
test_x_transformed = pca.transform(test_flat)
train_x_transformed.shape, test_x_transformed.shape

((60000, 10), (10000, 10))

In [None]:
# Fit a 10-component Gaussian mixture (presumably one component per digit
# class) on the PCA-reduced training images, then assign every test image to
# a component index.
gm = GaussianMixture(n_components=10, random_state=0)
gm.fit(train_x_transformed)
preds = gm.predict(test_x_transformed)

In [None]:
# Score the clustering as if it were a classifier: each mixture component is
# mapped to the digit that occurs most often among the test samples assigned
# to it, and a sample counts as correct when its true digit is that majority
# digit.
accs = []          # per-component purity (majority share), kept for inspection
majority_hits = 0  # samples whose label matches their component's majority
for i in range(10):
    members = test_y[preds == i]
    if members.size == 0:
        # No test sample landed in this component; np.bincount would return
        # an empty array and .max() on it raises ValueError, so skip it.
        continue
    bincounts = np.bincount(members)
    majority_hits += bincounts.max()
    accs.append(bincounts.max() / bincounts.sum())

# Divide by the number of samples rather than averaging the per-component
# purities: an unweighted mean would let a tiny pure component count as much
# as a large mixed one, and the figure would not be comparable to the
# accuracy_score values reported for the supervised models.
total_acc = majority_hits / len(test_y)
print(f'Accuracy: {total_acc}')

Accuracy: 0.6758489241963149
