# Multi-Layered Perceptron via Grid Search CV (ft. Adam vs LBFGS) # 

In [9]:
#import libraries
import numpy as np
import pandas
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import classification_report
import time

### 28 x 28 images ###

#### Default: Adam Optimizer #### 

The default 'adam' solver, according to the scikit-learn documentation for MLPClassifier, "works pretty well on relatively large datasets (with thousands of training samples or more) in terms of both training time and validation score."

In [10]:
#import data
data_small = pandas.read_csv('data28.csv')
y = data_small['label']
X = data_small[data_small.columns[1:]]

# 80-20 train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

In [11]:
#preprocess

indices_to_keep=~X_train.isin([np.nan,np.inf,-np.inf]).any(1)

X_train=X_train[indices_to_keep]
y_train=y_train[indices_to_keep]

indices_to_keep_test=~X_test.isin([np.nan,np.inf,-np.inf]).any(1)

X_test=X_test[indices_to_keep_test]
y_test=y_test[indices_to_keep_test]

We perform a full grid search with the parameters specified below:

In [12]:
#Neural Network to generate predictions
def predNN_2(X_train, y_train, im_size = 28):
    clf = GridSearchCV(estimator=MLPClassifier(max_iter=50000),
          param_grid={'early_stopping' : [True, False],
                      'hidden_layer_sizes': [100, 200, 300, 400, 500],
                      'activation': ['relu', 'tanh', 'logistic'],
                      'learning_rate': ['constant', 'invscaling', 'adaptive'],
                      'learning_rate_init': [.0001, .001, .01, .1]},
                      n_jobs = -1, verbose = 4)
    
    # Note (put in report): we picked the default 'adam' solver since, according to the scikit-learn documentation for MLPClassifier, 
    # it "works pretty well on relatively large datasets (with thousands of training samples or more) 
    # in terms of both training time and validation score." Note that 'lgbfs' is very slow!!
    
    # Note: GridSearchCV by default performs 5-fold CV. 
    
    print("Fitting")
    t0 = time.time()
    clf.fit(X_train, y_train)
    t1 = time.time()
    print('Grid search CV time for', im_size, 'x', im_size, 'images took', t1 - t0, 'seconds')
    print("Optimized parameters:", clf.best_params_)
    print("Weighted validation score:", clf.best_score_)
    return clf

In [13]:
#Calling Neural Network for Predictions

classifier = predNN_2(X_train, y_train, 28)
preds = classifier.predict(X_test)

Fitting
Fitting 5 folds for each of 360 candidates, totalling 1800 fits
Grid search CV time for 28 x 28 images took 6134.6064739227295 seconds
Optimized parameters: {'activation': 'logistic', 'early_stopping': False, 'hidden_layer_sizes': 400, 'learning_rate': 'constant', 'learning_rate_init': 0.0001}
Weighted validation score: 0.8779999999999999


In [14]:
# write cv results (such as mean fit times for each hyperparam configuration) to file
with open('fcnn_gridcv_28_adam.txt', 'w') as file:
    file.write(str(classifier.cv_results_))

In [15]:
activation = classifier.best_params_['activation']
hidden_layer_sizes = classifier.best_params_['hidden_layer_sizes']
learning_rate = classifier.best_params_['learning_rate']
learning_rate_init = classifier.best_params_['learning_rate_init']
early_stopping = classifier.best_params_['early_stopping']

classifier = MLPClassifier(early_stopping=early_stopping, activation=activation, hidden_layer_sizes=hidden_layer_sizes, max_iter=50000, learning_rate=learning_rate, learning_rate_init=learning_rate_init)
t0 = time.time()
classifier.fit(X_train, y_train)
t1 = time.time()
print('Training time time for', 28, 'x', 28, 'images took', t1 - t0, 'seconds')

Training time time for 28 x 28 images took 37.561065912246704 seconds


In [16]:
#training accuracy
train_accNN = accuracy_score(y_train, classifier.predict(X_train)) 
train_accNN

0.92225

In [17]:
#test accuracy
test_accNN = accuracy_score(y_test, classifier.predict(X_test)) 
test_accNN

0.88

#### Detour: LBFGS Optimizer (DO NOT RUN!!!) ####

We know that the LBFGS solver will take significantly longer, as it "converges faster and performs better" for small datasets, according to the documentation. 

We want to explore the following: how much does using the LBFGS solver affect the final test accuracy? 

LBFGS: Limited-memory Broyden–Fletcher–Goldfarb–Shanno. 

In [None]:
#import data
data_small = pandas.read_csv('data28.csv')
y = data_small['label']
X = data_small[data_small.columns[1:]]

# 70-30 train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

In [None]:
#preprocess

indices_to_keep=~X_train.isin([np.nan,np.inf,-np.inf]).any(1)

X_train=X_train[indices_to_keep]
y_train=y_train[indices_to_keep]

indices_to_keep_test=~X_test.isin([np.nan,np.inf,-np.inf]).any(1)

X_test=X_test[indices_to_keep_test]
y_test=y_test[indices_to_keep_test]

In [None]:
#Neural Network to generate predictions
def predNN_3(X_train, y_train, im_size = 28):
    clf = GridSearchCV(estimator=MLPClassifier(solver='lbfgs', max_iter=50000),
          param_grid={'early_stopping' : [True, False],
                      'hidden_layer_sizes': [100, 200, 300, 400, 500],
                      'activation': ['relu', 'tanh', 'logistic'],
                      'learning_rate': ['constant', 'invscaling', 'adaptive'],
                      'learning_rate_init': [.0001, .001, .01, .1]}, n_jobs=-1,verbose=4)
    
    # Pick 20 sample configurations to perform CV on. 
    
    print("Fitting")
    t0 = time.time()
    clf.fit(X_train, y_train)
    t1 = time.time()
    print('Grid search CV time for', im_size, 'x', im_size, 'images via LBFGS took', t1 - t0, 'seconds')
    print("Optimized parameters:", clf.best_params_)
    print("Weighted validation score:", clf.best_score_)
    return clf

In [None]:
#Calling Neural Network for Predictions

classifier = predNN_3(X_train, y_train, 28)
preds = classifier.predict(X_test)

In [None]:
# write cv results (such as mean fit times for each hyperparam configuration) to file
with open('fcnn_gridcv_28_lbfgs.txt', 'w') as file:
    file.write(str(classifier.cv_results_))

In [None]:
activation = classifier.best_params_['activation']
hidden_layer_sizes = classifier.best_params_['hidden_layer_sizes']

classifier = MLPClassifier(early_stopping=early_stopping, activation=activation, hidden_layer_sizes=hidden_layer_sizes, max_iter=50000, learning_rate=learning_rate, learning_rate_init=learning_rate_init)
t0 = time.time()
classifier.fit(X_train, y_train)
t1 = time.time()
print('Training time time for', 28, 'x', 28, 'images took', t1 - t0, 'seconds')

In [None]:
#training accuracy
train_accNN = accuracy_score(y_train, classifier.predict(X_train)) 
train_accNN

In [None]:
#test accuracy
test_accNN = accuracy_score(y_test, classifier.predict(X_test)) 
test_accNN

After running the above grid search CV for 11 hours, and having observed that it's barely halfway finished, we can conclude that the LBFGS is infeasible for our 5000-example large dataset. 

### 64 x 64 images ###

Used Adam optimizer for computational efficiency.

In [18]:
#import data
data_large = pandas.read_csv('data64.csv')
y = data_large['label']

X = data_large[data_large.columns[1:]]

# 70-30 train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

In [19]:
#preprocess

indices_to_keep=~X_train.isin([np.nan,np.inf,-np.inf]).any(1)

X_train=X_train[indices_to_keep]
y_train=y_train[indices_to_keep]

indices_to_keep_test=~X_test.isin([np.nan,np.inf,-np.inf]).any(1)

X_test=X_test[indices_to_keep_test]
y_test=y_test[indices_to_keep_test]

In [20]:
#Calling Neural Network for Predictions

classifier = predNN_2(X_train, y_train, 64)
preds = classifier.predict(X_test)

Fitting
Fitting 5 folds for each of 360 candidates, totalling 1800 fits
Grid search CV time for 64 x 64 images took 25156.27631998062 seconds
Optimized parameters: {'activation': 'relu', 'early_stopping': True, 'hidden_layer_sizes': 500, 'learning_rate': 'adaptive', 'learning_rate_init': 0.0001}
Weighted validation score: 0.79775


In [21]:
# write cv results (such as mean fit times for each hyperparam configuration) to file
with open('fcnn_gridcv_64_adam.txt', 'w') as file:
    file.write(str(classifier.cv_results_))

In [22]:
activation = classifier.best_params_['activation']
hidden_layer_sizes = classifier.best_params_['hidden_layer_sizes']
learning_rate = classifier.best_params_['learning_rate']
learning_rate_init = classifier.best_params_['learning_rate_init']
early_stopping = classifier.best_params_['early_stopping']

classifier = MLPClassifier(early_stopping=early_stopping, activation=activation, hidden_layer_sizes=hidden_layer_sizes, max_iter=50000, learning_rate=learning_rate, learning_rate_init=learning_rate_init)
t0 = time.time()
classifier.fit(X_train, y_train)
t1 = time.time()
print('Training time time for', 64, 'x', 64, 'images took', t1 - t0, 'seconds')

Training time time for 64 x 64 images took 78.14718914031982 seconds


In [23]:
#training accuracy
train_accNN = accuracy_score(y_train, classifier.predict(X_train)) 
train_accNN

0.8815

In [25]:
#test accuracy
test_accNN = accuracy_score(y_test, classifier.predict(X_test)) 
test_accNN

0.811

Note: We did not use the LBFGS solver for 64 x 64 images due to its expensive computational cost.