In [3]:
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

from itertools import permutations 

from math import ceil

from matplotlib import pyplot as plt
import seaborn as sns


In [4]:
cfier = MLPClassifier(hidden_layer_sizes=(7,), random_state = 13, solver="lbfgs", verbose=True)
data = pd.read_csv("../Data/95_percent_var_PCs.csv")
# Encode the 'AD' and 'ND' classificaitons as integers. 0 for ND, 1 for AD
data.loc[:,"target"] = data.loc[:,"target"].apply(lambda c: int(c == "AD"))


In [5]:
# Split into training and testing data
train_data, test_data = train_test_split(data, train_size=0.8, random_state=23)
# training lalbels and training data
train_lbls = train_data["target"]
train_data = train_data.loc[:, "PC1":"PC164"]
# testing labels and test data
test_lbls = test_data["target"]
test_data = test_data.loc[:, "PC1":"PC164"]
# Scale the data so that it's standardized (mean = 0, variance = 1)

scaler = StandardScaler().fit(train_data)
train_data = scaler.transform(train_data)
test_data = scaler.transform(test_data)


In [4]:
mlp_1layer_cfier = MLPClassifier(max_iter=300, solver="lbfgs")
mlp_2layer_cfier = MLPClassifier(max_iter=300, solver="lbfgs")
mlp_3layer_cfier = MLPClassifier(max_iter=300, solver="lbfgs")

alpha_range = 10.0 ** -np.arange(1,7) # As suggested by sklearn documentation 
# The parameters dictionary defines the search grid for the hyperparameters being optimized over
# Each dictionary key is the name the pipeline stage, followed by that stage's parameter being optimized.  
parameters_1layer = {
                "alpha": list(10.0 ** -np.arange(1,7)),
                "hidden_layer_sizes": list(permutations([i for i in range(1,300,1)],1)) 
                }

parameters_2layer = {
                "alpha": list(10.0 ** -np.arange(1,7)),
                "hidden_layer_sizes": list(permutations([i for i in range(1,300,5)],2)) 
                }

parameters_3layer = {
                "alpha": list(10.0 ** -np.arange(1,7)),
                "hidden_layer_sizes": list(permutations([i for i in range(1,300,10)],3)) 
                }


'\nparameters_2layer = {\n                "alpha": list(10.0 ** -np.arange(1,7)),\n                "hidden_layer_sizes": list(permutations([i for i in range(1,300,5)],2)) \n                }\n\nparameters_3layer = {\n                "alpha": list(10.0 ** -np.arange(1,7)),\n                "hidden_layer_sizes": list(permutations([i for i in range(1,300,10)],3)) \n                }\n  \nparameters_4layer = {\n                "alpha": list(10.0 ** -np.arange(1,7)),\n                "hidden_layer_sizes": list(permutations([i for i in range(10,300,25)],4)) \n                }    \n'

In [6]:
# Running the Grid Search. Each possible permunation of alpha and 1-layer architectures from 1-300 
# n_jobs = 7 means use 6 of my 12 processors. 

# Start by searching 1-layer architectures with 1-300 neurons, increments of 1 neuron
grid_search_pipeline = GridSearchCV(mlp_1layer_cfier, parameters_1layer, verbose=2, n_jobs=-1) 

grid_search_pipeline.fit(train_data, train_lbls)

print(f"Best 1-layer architecture {grid_search_pipeline.best_params_}")
print(f"Accuracy score: {grid_search_pipeline.best_score_:.4%}")


# Move on to 2-layer architectures with 1-300 neurons, increments of 5 neurons
grid_search_pipeline_2l = GridSearchCV(mlp_2layer_cfier, parameters_2layer, verbose=2, n_jobs=-1) 

grid_search_pipeline_2l.fit(train_data, train_lbls)

print(f"Best 2-layer architecture {grid_search_pipeline_2l.best_params_}")
print(f"Accuracy score: {grid_search_pipeline_2l.best_score_:.4%}")

# Move on to 3-layer architectures with 1-300 neurons, increments of 10 neurons
grid_search_pipeline_3l = GridSearchCV(mlp_3layer_cfier, parameters_3layer, verbose=2, n_jobs=-1) 

grid_search_pipeline_3l.fit(train_data, train_lbls)

print(f"Best 3-layer architecture {grid_search_pipeline_3l.best_params_}")
print(f"Accuracy score: {grid_search_pipeline_3l.best_score_:.4%}")


Fitting 5 folds for each of 1794 candidates, totalling 8970 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:    4.8s
[Parallel(n_jobs=-1)]: Done 341 tasks      | elapsed:   10.8s
[Parallel(n_jobs=-1)]: Done 624 tasks      | elapsed:   28.5s
[Parallel(n_jobs=-1)]: Done 989 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 1434 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 1961 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 2568 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 3257 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 4026 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done 4877 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done 5808 tasks      | elapsed:  5.6min
[Parallel(n_jobs=-1)]: Done 6821 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 7914 tasks      | elapsed:  7.0min


Best 1-layer architecture {'alpha': 1e-06, 'hidden_layer_sizes': (15,)}
Accuracy score: 64.6774%


[Parallel(n_jobs=-1)]: Done 8970 out of 8970 | elapsed:  7.8min finished


'\n# Move on to 2-layer architectures with 1-300 neurons, increments of 5 neurons\ngrid_search_pipeline_2l = GridSearchCV(mlp_2layer_cfier, parameters_2layer, verbose=2, n_jobs=-1) \n\ngrid_search_pipeline_2l.fit(train_data, train_lbls)\n\nprint(f"Best 2-layer architecture {grid_search_pipeline_2l.best_params_}")\nprint(f"Accuracy score: {grid_search_pipeline_2l.best_score_:.4%}")\n\n# Move on to 3-layer architectures with 1-300 neurons, increments of 10 neurons\ngrid_search_pipeline_3l = GridSearchCV(mlp_3layer_cfier, parameters_3layer, verbose=2, n_jobs=-1) \n\ngrid_search_pipeline_3l.fit(train_data, train_lbls)\n\nprint(f"Best 3-layer architecture {grid_search_pipeline_3l.best_params_}")\nprint(f"Accuracy score: {grid_search_pipeline_3l.best_score_:.4%}")\n'

In [None]:
with open("best_architectures.txt", "w") as f:
    results = f"Best 1-hidden-layer score: {grid_search_pipeline.best_score_:.4%}\n\t{grid_search_pipeline.best_params_}"
    results += f"\n\nBest 2-hidden-layer score: {grid_search_pipeline_2l.best_score_:.4%}\n\t{grid_search_pipeline_2l.best_params_}"
    results += f"\n\nBest 3-hidden-layer score: {grid_search_pipeline_3l.best_score_:.4%}\n\t{grid_search_pipeline_3l.best_params_}"
    f.write(results)

In [14]:
print(f"Best score: {grid_search_pipeline.best_score_:.4%}")
print(f"Best estimator {grid_search_pipeline.best_params_}")

Best score: 72.3992%
Best estimator {'alpha': 0.1, 'hidden_layer_sizes': (90, 130, 60, 40)}


In [18]:
grid_search_pipeline.decis

# Bayesian hyperparameter searching

In [12]:
from hyperopt import hp, Trials, fmin, tpe, space_eval, STATUS_OK, STATUS_FAIL
from timeit import default_timer as timer
from statistics import mean


## Experimenting with various search spaces
I think it's wise to keep alpha as a uniform distribution. I'll start with normal distributions for the number now neurons in each layer and see where that takes things. 

1. Search space 1: normal distributions with large variance. 
    - layer 1: ~N(mu = 150 neurons, sigma = 65)
    - layer 2: ~N(mu = 75 neurons, sigma = 32)
    - alpha: uniform {0.1, 0.01,... 10^-7}

In [21]:
# Define the search space. Dictionaries are in key:value where the value is a probabilistic space to 
# "randomly" chose from (I believe as TPE algorithm works, it changes these probabilistic distributions)
search_space =  hp.choice("layers", [
    {
        "type": "two_layer",
        "k": hp.quniform("2_layer_k",1,7,1), # k is selecting for bias alpha, where alpha = 10^-k [0.1, 0.01, ... 10^-7]
        "l1": hp.qnormal("2_layer_l1",150, 65, 2), # Discretized normal dist, mu =150, stdev=65
        "l2": hp.qnormal("2_layer_l2",75, 32, 2) #  mu =75, stdev=32
    },
    {
        "type": "three_layer",
        "k": hp.uniform("3_layer_k",1,7),
        "l1": hp.qnormal("3layer_l1",150, 65, 2), # Discretized normal dist, mu =150, stdev=65
        "l2": hp.qnormal("3_layer_l2",75, 32, 2), # mu =75, stdev=32
        "l3": hp.qnormal("3_layer_l3",40, 20, 2) # Discretized normal dist, mu =75, stdev=32
    }
])

# Define the optimization function (Costly as it implies fitting a MLP classifier via CV)
def objective(architecture):
    """ The objective function to optimize is the 5-fold cross-validation fitting of a 2 hidden layer NN classifier.
        The hyperoptimizer operates as a minimizer, so the returned loss will be the negative of the avg of 
        mean accuracy.
    """
    # Determine classifer hyperparams from architecture dictionary given
    if architecture["type"] == "two_layers":
        shape = (2, int(architecture["l1"]), int(architecture["l2"]))
    else:
        shape = (2, int(architecture["l1"]), int(architecture["l2"]), int(architecture["l3"]))
    k = architecture["k"]
    
    # Make a new MLP Classifier
    cfier = MLPClassifier(max_iter=300, hidden_layer_sizes=shape, solver= "lbfgs", alpha = pow(10, -))
    
    # Cross Validatae
    
    
    print(architecture)
    

    
# Create a trials Hyperopt Object which stores data regarding the results of our hyperparameter search
trials = Trials()

# Run the hyperparam search. We'll let the hyperopt lib suggest the tpe to use
best_architectue = fmin(fn=objective, space=search_space, algo=tpe.suggest, trials=trials, max_evals=750, 
                        rstate= np.random.RandomState(25))


    

{'k': 6.0, 'l1': 142.0, 'l2': 50.0, 'type': 'two_layer'}
  0%|          | 0/750 [00:00<?, ?trial/s, best loss=?]

job exception: Stop this damn thing!



  0%|          | 0/750 [00:00<?, ?trial/s, best loss=?]


Exception: Stop this damn thing!