## MNIST Data Preprocessing

In [10]:
import pickle
import gzip
import numpy as np
import matplotlib.pyplot as plt

path = 'mnist.pkl.gz'
f = gzip.open(path, 'rb')
training_data, validation_data, test_data = pickle.load(f, encoding='latin1')
f.close()

x_train, y_train = training_data[0], training_data[1]
print (x_train.shape, y_train.shape)

x_test, y_test = test_data[0], test_data[1]
print (x_test.shape, y_test.shape)

(50000, 784) (50000,)
(10000, 784) (10000,)


## USPS Data Preprocessing

In [11]:
from PIL import Image
import os
import numpy as np

#USPS data preprocessing
USPSMat  = []
USPSTar  = []
curPath  = 'USPSdata/Numerals'
savedImg = []


for j in range(0,10):
    curFolderPath = curPath + '/' + str(j)
    imgs =  os.listdir(curFolderPath)
    for img in imgs:
        curImg = curFolderPath + '/' + img
        if curImg[-3:] == 'png':
            img = Image.open(curImg,'r')
            img = img.resize((28, 28))
            savedImg = img
            imgdata = (255-np.array(img.getdata()))/255
            USPSMat.append(imgdata)
            USPSTar.append(j)
usps_data = np.array(USPSMat) 
usps_lables = np.array(USPSTar)


## SVM Classifier

In [12]:
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn import svm, metrics, datasets

classifier = svm.SVC(kernel='rbf', C=2,gamma=0.001)
classifier.fit(x_train, y_train )

#testing using mnist
expected = y_test
predicted = classifier.predict(x_test)
np.savetxt("svm.csv", predicted, delimiter=",")
acc_svm = accuracy_score(expected, predicted)
print ("SVM accuracy: ",acc_svm)
print("Classification report for classifier %s:\n%s\n"
      % (classifier, metrics.classification_report(expected, predicted)))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted))


SVM accuracy:  0.9456
Classification report for classifier SVC(C=2, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False):
             precision    recall  f1-score   support

          0       0.96      0.99      0.97       980
          1       0.97      0.99      0.98      1135
          2       0.93      0.94      0.94      1032
          3       0.92      0.94      0.93      1010
          4       0.93      0.95      0.94       982
          5       0.93      0.91      0.92       892
          6       0.95      0.97      0.96       958
          7       0.96      0.93      0.94      1028
          8       0.95      0.92      0.93       974
          9       0.94      0.92      0.93      1009

avg / total       0.95      0.95      0.95     10000


Confusion matrix:
[[ 967    0    1    0    0    5    4    1    2    0]
 [   0 

### Testing using USPS

In [13]:
print("testing using USPS dataset")
expected = usps_lables
predicted = classifier.predict(usps_data)
np.savetxt("svm_usps.csv", predicted, delimiter=",")
acc_svm_usps = accuracy_score(expected, predicted)
print ("SVM USPS accuracy: ",acc_svm_usps)
print("Classification report for classifier %s:\n%s\n"
      % (classifier, metrics.classification_report(expected, predicted)))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted))


testing using USPS dataset
SVM USPS accuracy:  0.38516925846292316
Classification report for classifier SVC(C=2, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False):
             precision    recall  f1-score   support

          0       0.44      0.29      0.35      2000
          1       0.46      0.21      0.29      2000
          2       0.34      0.71      0.46      1999
          3       0.48      0.56      0.52      2000
          4       0.53      0.57      0.55      2000
          5       0.29      0.67      0.40      2000
          6       0.66      0.37      0.48      2000
          7       0.24      0.23      0.23      2000
          8       0.35      0.12      0.18      2000
          9       0.29      0.11      0.16      2000

avg / total       0.41      0.39      0.36     19999


Confusion matrix:
[[ 580    2  42

## Random Forests Classifier

In [14]:
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

clf_rf = RandomForestClassifier()
clf_rf.fit(x_train, y_train)
y_pred_rf = clf_rf.predict(x_test)
np.savetxt("rf.csv", y_pred_rf, delimiter=",")
acc_rf = accuracy_score(y_test, y_pred_rf)
print ("Random forest accuracy: ",acc_rf)
print("Classification report for classifier %s:\n%s\n"
      % (classifier, metrics.classification_report(y_test, y_pred_rf)))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(y_test, y_pred_rf))

Random forest accuracy:  0.9445
Classification report for classifier SVC(C=2, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False):
             precision    recall  f1-score   support

          0       0.94      0.98      0.96       980
          1       0.98      0.99      0.98      1135
          2       0.93      0.95      0.94      1032
          3       0.90      0.93      0.92      1010
          4       0.95      0.95      0.95       982
          5       0.92      0.91      0.91       892
          6       0.96      0.95      0.96       958
          7       0.95      0.94      0.95      1028
          8       0.95      0.90      0.93       974
          9       0.95      0.92      0.94      1009

avg / total       0.94      0.94      0.94     10000


Confusion matrix:
[[ 964    1    0    3    0    4    4    1    2    

### Testing using USPS

In [15]:
print("testing using USPS dataset")
expected = usps_lables
predicted = clf_rf.predict(usps_data)
np.savetxt("rf_usps.csv", predicted, delimiter=",")
acc_rf_usps = accuracy_score(expected, predicted)
print ("Random forest USPS accuracy: ",acc_rf_usps)
print("Classification report for classifier %s:\n%s\n"
      % (classifier, metrics.classification_report(expected, predicted)))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted))


testing using USPS dataset
Random forest USPS accuracy:  0.31636581829091454
Classification report for classifier SVC(C=2, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False):
             precision    recall  f1-score   support

          0       0.35      0.30      0.33      2000
          1       0.34      0.39      0.36      2000
          2       0.31      0.51      0.38      1999
          3       0.35      0.52      0.42      2000
          4       0.40      0.42      0.41      2000
          5       0.29      0.42      0.35      2000
          6       0.50      0.24      0.32      2000
          7       0.18      0.26      0.21      2000
          8       0.33      0.06      0.11      2000
          9       0.18      0.04      0.07      2000

avg / total       0.32      0.32      0.30     19999


Confusion matrix:
[[ 60