In [2]:
import sklearn
import matplotlib as mpl
import numpy as np
import matplotlib.pyplot as plt

try:
    from sklearn.datasets import fetch_openml
    mnist = fetch_openml('mnist_784', version=1, cache=True)
    mnist.target = mnist.target.astype(np.int8)
except ImportError:
    from sklearn.datasets import fetch_mldata
    mnist = fetch_mldata('MNIST original')
    
#getting the mnist data the way shown in Notebook #7
# mnist["data"], 

y = mnist["target"]

y[y<9] = 0

#making all target variables that are not 9 to be equal to 0, so we can make a binary classifier
#print(y)

# example = mnist["data"][-1].reshape(28, 28)

# plt.imshow(example, cmap=mpl.cm.binary)
# plt.show()

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

X_train, X_test, y_train, y_test = train_test_split(mnist["data"][:5000], y[:5000], test_size=0.2)
#splitting data into training set and test set

knn = KNeighborsClassifier(n_neighbors=5, radius=1.0, algorithm='auto', leaf_size=30, metric='minkowski', p=2, metric_params=None, n_jobs=None)
#using minikowski of order 2 for knn classifier from sklearn

knn.fit(X_train, y_train)
#fitting data to knn classifier

y_pred = knn.predict(X_test)
#making predictions on test data based on training data

from sklearn import metrics

print("Accuracy for KNN:",metrics.accuracy_score(y_test, y_pred))
#computing accuracy of predictions

from sklearn import tree
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Accuracy for Decision Tree:",metrics.accuracy_score(y_test, y_pred))
#same format as above, but for decision tree

from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_jobs=-1, n_estimators=10)
rfc.fit(X_train, y_train)
print ("Accuracy for Random Forest:",rfc.score(X_test, y_test))
#same as above, but for random forest


Accuracy for KNN: 0.982
Accuracy for Decision Tree: 0.93
Accuracy for Random Forest: 0.953


In [9]:
import sklearn
import matplotlib as mpl
import numpy as np
import matplotlib.pyplot as plt

try:
    from sklearn.datasets import fetch_openml
    mnist = fetch_openml('mnist_784', version=1, cache=True)
    mnist.target = mnist.target.astype(np.int8)
except ImportError:
    from sklearn.datasets import fetch_mldata
    mnist = fetch_mldata('MNIST original')

X_train, X_test, y_train, y_test = train_test_split(mnist["data"][:5000], mnist["target"][:5000], test_size=0.2)
#this time using original dataset will all values

from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=10)
kmeans.fit(X_train)
y_kmeans = kmeans.predict(X_test)
#fitting kmeans with 10 clusters to data

print("Accuracy for Kmeans Clustering:",metrics.accuracy_score(y_test, y_kmeans))

knn = KNeighborsClassifier(n_neighbors=5, radius=1.0, algorithm='auto', leaf_size=30, metric='minkowski', p=2, metric_params=None, n_jobs=None)
#metric p=2
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
from sklearn import metrics
print("Accuracy for KNN:",metrics.accuracy_score(y_test, y_pred))
#computing knn as before, but using all 10 values

from sklearn import tree
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Accuracy for Decision Tree:",metrics.accuracy_score(y_test, y_pred))
#computing decision tree as before, but using all 10 values

from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_jobs=-1, n_estimators=10)
rfc.fit(X_train, y_train)
print ("Accuracy for Random Forest:",rfc.score(X_test, y_test))
#computing random forest as before, but using all 10 values

#Conclusion
#It seems that for all the classification algorithms it is a lot easier to to a binary classifier of just classifing the digits if they are 9 or not a 9 then adding the 
#complexity of trying to identify each individual digit.
#It also seems that for both cases knn seems to perform the best out of the 3 classifiers and random forest to follow. It would make sense for random forest to be more
#accurate than decision tree since a random forest is just a aggregate of many decision trees.
#Finally the kmeans unsupervised clustering did not seem to classify the data very well at all. On some random seeds the kmeans was able to get up to 30% accuracy on the data,
#but for the most part the accuracy hovered around 10%. I am not sure the implementation is the issue or if kmeans is just not a very good classifier for this dataset.

Accuracy for Kmeans Clustering: 0.095
Accuracy for KNN: 0.932
Accuracy for Decision Tree: 0.784
Accuracy for Random Forest: 0.887
