## A3 Machine Learning
Done By:
- Nikhil Tyagi (B00809791)
- Aniruddha Chitley (B00808320)
- Nitish Bhardwaj (B00811535)

In [2]:
# Imports
import matplotlib.pyplot as plt
import random, pdb
import numpy as np
import tensorflow as tf
import time
import pandas as pd

#svm imports
from sklearn import svm
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline

#Neural net import
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, GridSearchCV

#Linear classifier
from sklearn.linear_model import LogisticRegression

# Random forest classifier
from sklearn.ensemble import RandomForestClassifier

In [3]:
# Fetching MNIST Data (Code from Assignment 1)
mnist = tf.keras.datasets.mnist
(train_data, train_labels),(test_data, test_labels) = mnist.load_data()

print("Number of training examples\t",len(train_data))
print("Number of test instances\t",len(test_data))
print("Shape of train_data\t\t",train_data.shape)
print("Shape of train_labels\t\t",train_labels.shape)
print("Shape of test_data\t\t",test_data.shape)
print("Shape of test_labels\t\t",test_labels.shape)

Number of training examples	 60000
Number of test instances	 10000
Shape of train_data		 (60000, 28, 28)
Shape of train_labels		 (60000,)
Shape of test_data		 (10000, 28, 28)
Shape of test_labels		 (10000,)


In [4]:
# Vectorize test and training data
test_data = test_data.reshape(test_data.shape[0], test_data.shape[1] * test_data.shape[2]).astype('float32')
train_data = train_data.reshape(train_data.shape[0], train_data.shape[1] * train_data.shape[2]).astype('float32')

print('New shape of test_data', test_data.shape)
print('New shape of train_data', train_data.shape)

New shape of test_data (10000, 784)
New shape of train_data (60000, 784)


In [5]:
# Scale data to 0 and 1
scaler = MinMaxScaler(feature_range=(0, 1))
train_data = scaler.fit_transform(train_data)
test_data = scaler.fit_transform(test_data)

### SVC - State Vector Classifier

In [24]:
# Grid search for SVC on MNIST
parameters = {
    'kernel': ['poly', 'rbf', 'sigmoid'],
    'C':[0.001, 0.1, 100, 10e5],
    'gamma':[10,1,0.1,0.01]
}
grid_svc = GridSearchCV(svm.SVC(), param_grid=parameters, cv=1, n_jobs=-1, return_train_score=True)

In [None]:
three_class_train_data = train_data[np.isin(train_labels, [1,2,3])]
three_class_train_labels = train_labels[np.isin(train_labels, [1,2,3])]

three_class_test_data = test_data[np.isin(test_labels, [1,2,3])]
three_class_test_labels = test_labels[np.isin(test_labels, [1,2,3])]

grid_svc.fit(three_class_train_data, three_class_train_labels)

In [None]:
print(grid_svc.score(three_class_test_data, three_class_test_labels))
print(grid_svc.best_params_)

df = pd.DataFrame(grid_svc.cv_results_)
df.to_pickle('mnist_svc.pkl')

### Random Forest Classifier

In [8]:
# Random Forests, grid search to find best parameter and run cross validation
parameters = {
    'n_estimators': [100],
    'max_depth': [int(x) for x in np.linspace(10, 110, num = 11)]
}
grid_rfc = GridSearchCV(RandomForestClassifier(), param_grid=parameters, cv=2, n_jobs=-1, return_train_score=True)

In [9]:
grid_rfc.fit(train_data, train_labels)

GridSearchCV(cv=2, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'n_estimators': [100], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [10]:
print(grid_rfc.score(test_data, test_labels))
print(grid_rfc.best_params_)

df = pd.DataFrame(grid_rfc.cv_results_)
df.to_pickle('mnist_rfc.pkl')

0.9696
{'max_depth': 110, 'n_estimators': 100}




### Linear Classifier

In [13]:
# Linear Classifier
parameters = {
    'solver': ['sag', 'lbfgs'],
    'multi_class': ['ovr']
}
grid_lr = GridSearchCV(LogisticRegression(), param_grid=parameters, cv=2, n_jobs=-1, return_train_score=True)

In [14]:
grid_lr.fit(train_data, train_labels)



GridSearchCV(cv=2, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'solver': ['sag', 'lbfgs'], 'multi_class': ['ovr']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [15]:
print(grid_lr.score(test_data, test_labels))
print(grid_lr.best_params_)

df = pd.DataFrame(grid_lr.cv_results_)
df.to_pickle('mnist_lr.pkl')

0.9202
{'multi_class': 'ovr', 'solver': 'sag'}


### Neural Network

In [None]:
# Neural Nets
parameters = {
    'activation': ['logistic'],
    'solver': ['sgd'],
    'max_iter': [100, 300, 500, 1000],
    'learning_rate_init': [0.001, 0.01, 0.1, 1],
    'hidden_layer_sizes': [(300,),(379,),(500,),(10, 15), (100, 150), (550, 200)]
}
grid_nn = GridSearchCV(MLPClassifier(), parameters, n_jobs=-1, cv=5, return_train_score=True)

In [None]:
grid_nn.fit(train_data, train_labels)

In [None]:
print(grid_nn.score(test_data, test_labels))
print(grid_nn.best_params_)

df = pd.DataFrame(grid_nn.cv_results_)
df.to_pickle('mnist_nn.pkl')