In [None]:
# Imports
import matplotlib.pyplot as plt
import random, pdb
import numpy as np
import tensorflow as tf
import time
import pandas as pd

#svm imports
from sklearn import svm
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline

#Neural net import
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, GridSearchCV

#Linear classifier
from sklearn.linear_model import LogisticRegression

# Random forest classifier
from sklearn.ensemble import RandomForestClassifier

In [3]:
# Load MNIST through the Keras dataset helper (code from Assignment 1).
# load_data() hands back a (train, test) pair, each an (images, labels) tuple.
mnist = tf.keras.datasets.mnist
train_split, test_split = mnist.load_data()
train_data, train_labels = train_split
test_data, test_labels = test_split

# Report split sizes and array shapes as a sanity check before preprocessing.
shape_report = [
    ("Number of training examples\t", len(train_data)),
    ("Number of test instances\t", len(test_data)),
    ("Shape of train_data\t\t", train_data.shape),
    ("Shape of train_labels\t\t", train_labels.shape),
    ("Shape of test_data\t\t", test_data.shape),
    ("Shape of test_labels\t\t", test_labels.shape),
]
for caption, value in shape_report:
    print(caption, value)

Number of training examples	 60000
Number of test instances	 10000
Shape of train_data		 (60000, 28, 28)
Shape of train_labels		 (60000,)
Shape of test_data		 (10000, 28, 28)
Shape of test_labels		 (10000,)


In [4]:
# Vectorize test and training data
# Each image is flattened into a single float32 row (28 x 28 -> 784 features).
# The second dimension is inferred with -1 instead of being computed from
# shape[1] * shape[2], so re-running this cell on the already-flattened arrays
# is a harmless no-op rather than an IndexError on the missing third axis.
test_data = test_data.reshape(len(test_data), -1).astype('float32')
train_data = train_data.reshape(len(train_data), -1).astype('float32')

print('New shape of test_data', test_data.shape)
print('New shape of train_data', train_data.shape)

New shape of test_data (10000, 784)
New shape of train_data (60000, 784)


In [5]:
# Scale data to 0 and 1
# The scaler learns each pixel's min/max from the TRAINING set only. The test
# set must be mapped with those same statistics: refitting on the test data
# (fit_transform) would scale the two splits differently and leak test-set
# information into preprocessing.
scaler = MinMaxScaler(feature_range=(0, 1))
train_data = scaler.fit_transform(train_data)
# A test pixel outside the range seen during training would land outside
# [0, 1] after transform(); clip so the test features keep the advertised range.
test_data = np.clip(scaler.transform(test_data), 0, 1)

### SVC - Support Vector Classifier

In [24]:
# Grid search for SVC on MNIST
# Candidate grid: 3 kernels x 4 values of C x 4 values of gamma = 48 settings.
parameters = {
    'kernel': ['poly', 'rbf', 'sigmoid'],
    'C':[0.001, 0.1, 100, 10e5],  # NOTE(review): 10e5 is 1e6 - confirm that is the intended upper bound
    'gamma':[10,1,0.1,0.01]
}
# cv must be at least 2: scikit-learn rejects a single-fold split when fit() is
# called. 5 folds matches the neural-network grid search in this notebook.
# n_jobs=-1 spreads the candidate fits over all available cores.
grid_svc = GridSearchCV(svm.SVC(), param_grid=parameters, cv=5, n_jobs=-1)

In [None]:
# Keep only the samples labelled 1, 2 or 3. Each boolean mask is built once
# and reused for both the feature rows and their labels.
kept_digits = [1, 2, 3]
train_mask = np.isin(train_labels, kept_digits)
test_mask = np.isin(test_labels, kept_digits)

three_class_train_data, three_class_train_labels = train_data[train_mask], train_labels[train_mask]
three_class_test_data, three_class_test_labels = test_data[test_mask], test_labels[test_mask]

# Run the SVC grid search on the three-class training subset.
grid_svc.fit(X=three_class_train_data, y=three_class_train_labels)

In [None]:
# Score of the search's selected SVC on the held-out three-class test subset,
# followed by the hyper-parameter combination that won the search.
svc_test_score = grid_svc.score(X=three_class_test_data, y=three_class_test_labels)
print(svc_test_score)
svc_best_params = grid_svc.best_params_
print(svc_best_params)

# Persist the full cross-validation table so it can be analysed without refitting.
df = pd.DataFrame(data=grid_svc.cv_results_)
df.to_pickle('mnist_svc.pkl')

### Random Forest Classifier

In [5]:
# Random Forests, grid search to find best parameter and run cross validation
# Only max_depth is searched (10, 20, ..., 110); the forest size is fixed at 100 trees.
parameters = {
    'n_estimators': [100],
    # Integer range instead of float linspace + int() truncation: same values,
    # no dependence on floating-point rounding.
    'max_depth': list(range(10, 111, 10))
}
# cv must be at least 2: scikit-learn rejects a single-fold split when fit() is
# called. 5 folds matches the neural-network grid search in this notebook.
# random_state pins the forest's bootstrap/feature sampling so reruns give the
# same scores.
grid_rfc = GridSearchCV(RandomForestClassifier(random_state=42), param_grid=parameters, cv=5, n_jobs=-1)



0.9477

In [None]:
# Run the random-forest grid search on the full ten-digit training set.
grid_rfc.fit(X=train_data, y=train_labels)

In [None]:
# Score of the search's selected forest on the held-out test set, followed by
# the hyper-parameter combination that won the search.
rfc_test_score = grid_rfc.score(X=test_data, y=test_labels)
print(rfc_test_score)
rfc_best_params = grid_rfc.best_params_
print(rfc_best_params)

# Persist the full cross-validation table so it can be analysed without refitting.
df = pd.DataFrame(data=grid_rfc.cv_results_)
df.to_pickle('mnist_rfc.pkl')

### Linear Classifier

In [None]:
# Linear Classifier
# Compare two solvers for one-vs-rest logistic regression.
# NOTE(review): recent scikit-learn releases deprecate `multi_class` - confirm
# against the installed version.
parameters = {
    'solver': ['sag', 'lbfgs'],
    'multi_class': ['ovr']
}
# cv must be at least 2: scikit-learn rejects a single-fold split when fit() is
# called. 5 folds matches the neural-network grid search in this notebook.
# random_state fixes the data shuffling used by the stochastic 'sag' solver so
# reruns give the same scores.
grid_lr = GridSearchCV(LogisticRegression(random_state=42), param_grid=parameters, cv=5, n_jobs=-1)

# Stand-alone classifier with a fixed configuration; it is not referenced by
# any other cell visible in this notebook.
lc = LogisticRegression(solver='sag', multi_class='ovr')


In [None]:
# Run the logistic-regression grid search on the full ten-digit training set.
grid_lr.fit(X=train_data, y=train_labels)

In [None]:
# Score of the search's selected logistic regression on the held-out test set,
# followed by the hyper-parameter combination that won the search.
lr_test_score = grid_lr.score(X=test_data, y=test_labels)
print(lr_test_score)
lr_best_params = grid_lr.best_params_
print(lr_best_params)

# Persist the full cross-validation table so it can be analysed without refitting.
df = pd.DataFrame(data=grid_lr.cv_results_)
df.to_pickle('mnist_lr.pkl')

### Neural Network

In [None]:
# Neural Nets
# Search over training length, initial learning rate and the width of a single
# hidden layer (2 to 5 units), with logistic activations trained by SGD.
parameters = {
    'activation': ['logistic'],
    'solver': ['sgd'],
    'max_iter': [100, 500],
    'learning_rate_init': [0.001, 0.01, 0.1, 1],
    'hidden_layer_sizes': np.arange(2,6)
}
# random_state pins weight initialisation and mini-batch shuffling so the
# search is reproducible; without it every run can rank the candidates
# differently. return_train_score=True also records training scores in
# cv_results_. n_jobs=-1 uses all processors (multicores).
grid_nn = GridSearchCV(MLPClassifier(random_state=42), parameters, n_jobs=-1, cv=5, return_train_score=True)

In [None]:
# Run the neural-network grid search on the full ten-digit training set.
grid_nn.fit(X=train_data, y=train_labels)

In [None]:
# Score of the search's selected network on the held-out test set, followed by
# the hyper-parameter combination that won the search.
nn_test_score = grid_nn.score(X=test_data, y=test_labels)
print(nn_test_score)
nn_best_params = grid_nn.best_params_
print(nn_best_params)

# Persist the full cross-validation table so it can be analysed without refitting.
df = pd.DataFrame(data=grid_nn.cv_results_)
df.to_pickle('mnist_nn.pkl')