In [None]:
import numpy as np
import scipy.stats
import pickle
import util
import matplotlib.pyplot as plt
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler

In [None]:
# Single seed for the notebook; `state` is the generator handed to the
# train/test split further down.
seed = 229
state = np.random.RandomState(seed=seed)

# Read the labelled training table. `impute=True` is forwarded to
# util.fetch_data as-is.
trainfile = 'data/train.csv'
fetched = util.fetch_data(trainfile, impute=True)
header, ids, X, Y = fetched

Neural networks are sensitive to feature scale, so standardize the data (zero mean and unit variance per feature) before training:

In [None]:
# Fit the standardizer on the full feature matrix. `scaler` stays in scope
# because the prediction cells pass it to util.make_prediction.
# NOTE(review): the scaler is fit before the split below, so the held-out rows
# contribute to the scaling statistics.
scaler = StandardScaler().fit(X)

# NOTE: X is overwritten with its scaled version, so executing this cell a
# second time would refit `scaler` on already-scaled data. Restart the kernel
# instead of re-running it.
X = scaler.transform(X)

# Hold out 20% of the rows for the learning-curve evaluation.
test_size = 0.20
split = train_test_split(X, Y, test_size=test_size, random_state=state)
X_train, X_test, y_train, y_test = split

A function to determine the number of neurons in a hidden layer:

In [None]:
# Dimensions read by num_hidden() at call time: number of rows, number of
# feature columns, and a single output unit.
m, n_inputs = X.shape[:2]
n_outputs = 1

# Candidate scaling factors: the integers 2 through 9.
avals = range(2, 10)


# Based on https://stats.stackexchange.com/a/136542
def num_hidden(a):
    """Return the hidden-layer neuron count for scaling factor ``a``.

    Evaluates ``m / (a * (n_inputs + n_outputs))`` and truncates it to an int.
    ``m``, ``n_inputs`` and ``n_outputs`` are the notebook-level globals and
    are looked up when the function is called, not when it is defined.
    """
    denominator = a * (n_inputs + n_outputs)
    return int(m / denominator)

# One hidden layer: Training
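Skip to the `One hidden layer: Evaluation` section to load the previously saved model and produce the plots and the prediction output without re-running the grid search.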

In [None]:
# Search grid for a single hidden layer:
#   - layer widths 100, 120, ..., 280
#   - ten `alpha` values log-spaced between 1e-7 and 1e-2
#   - with and without early stopping
params = {
    'solver': ['adam'],
    'hidden_layer_sizes': [(n,) for n in np.arange(100, 300, 20)],
    'activation': ['logistic'],
    'alpha': np.logspace(-7, -2, 10),
    'early_stopping': [True, False],
}

# Pass the notebook seed to the network so the search gives the same result on
# every run; without it each fit draws fresh random numbers.
mlp_onelayer = MLPClassifier(verbose=False, random_state=seed)
cv_onelayer = GridSearchCV(mlp_onelayer, param_grid=params, scoring=util.gini_proba_scorer,
                           n_jobs=-1, verbose=2)
# NOTE(review): the search is fit on all of X, which includes the rows held out
# as X_test for the learning curves.
cv_onelayer.fit(X, Y)

In [None]:
# Save the best estimator found by the grid search. The context manager closes
# the file even if pickling raises.
with open('models/multilayer_perceptron_onelayer.pickle', 'wb') as mlp_onelayer_out:
    pickle.dump(cv_onelayer.best_estimator_, mlp_onelayer_out)

# One hidden layer: Evaluation

In [None]:
# Pickles are binary: open with 'rb' (text mode fails on Python 3) and let the
# context manager close the handle.
# Only load pickle files written by this notebook; unpickling untrusted data
# can execute arbitrary code.
with open('models/multilayer_perceptron_onelayer.pickle', 'rb') as mlp_onelayer_in:
    mlp_onelayer = pickle.load(mlp_onelayer_in)

In [None]:
# Learning curves for the one-layer model on the held-out split, scored with
# the same Gini scorer used in the grid search.
trainsizes, traingini, testgini = util.learning_curves(
    mlp_onelayer, X_train, X_test, y_train, y_test,
    scorer=util.gini_proba_scorer)

# Print train scores, a blank separator line, then test scores.
for line in (traingini, '', testgini):
    print(line)

In [None]:
# Train vs. test Gini against training-set size for the one-layer network.
fig, ax = plt.subplots()
ax.plot(trainsizes, traingini, label='train gini')
ax.plot(trainsizes, testgini, label='test gini')
ax.set_xlabel('Training set size')
ax.set_ylabel('Normalized Gini coefficient')
ax.set_title('Multi-layer perceptron, one hidden layer')
ax.legend()
# Write the figure to disk as EPS.
fig.savefig('figures/learning_curves/mlp_onelayer.eps', format='eps', dpi=1000)

In [None]:
# Display the one-layer learning-curve figure built in the previous cell.
plt.show()

# One hidden layer: Output

In [None]:
# Refit the one-layer model on every labelled row, then report its Gini on
# that same data (an in-sample score, not a held-out estimate).
mlp_onelayer.fit(X, Y)
onelayer_train_gini = util.gini_proba_scorer(mlp_onelayer, X, Y)
print(onelayer_train_gini)

In [None]:
# Write one-layer predictions for the test file. The fitted `scaler` is passed
# along with the prediction method that util.proba_method returns for this model.
testfile = 'data/test.csv'
onelayer_predict = util.proba_method(mlp_onelayer)
util.make_prediction(
    mlp_onelayer,
    testfile,
    'predictions/mlp_onelayer.csv',
    scaler=scaler,
    predict_method=onelayer_predict,
)

# Two hidden layers: Training

In [None]:
# Search grid for two hidden layers:
#   - first-layer widths 100, 120, ..., 300; second layer is int(sqrt(width))
#   - ReLU and logistic activations
#   - ten `alpha` values log-spaced between 1e-7 and 1e-2
#   - with and without early stopping
params = {
    'solver': ['adam'],
    'hidden_layer_sizes': [(n, int(np.sqrt(n))) for n in np.arange(100, 320, 20)],
    'activation': ['relu', 'logistic'],
    'alpha': np.logspace(-7, -2, 10),
    'early_stopping': [True, False],
}

# Pass the notebook seed to the network so the search gives the same result on
# every run; without it each fit draws fresh random numbers.
mlp_twolayer = MLPClassifier(verbose=False, random_state=seed)
cv_twolayer = GridSearchCV(mlp_twolayer, param_grid=params, scoring=util.gini_proba_scorer,
                           n_jobs=-1, verbose=2)
# NOTE(review): the search is fit on all of X, which includes the rows held out
# as X_test for the learning curves.
cv_twolayer.fit(X, Y)

In [None]:
# Save the best estimator found by the grid search. The context manager closes
# the file even if pickling raises.
with open('models/multilayer_perceptron_twolayer.pickle', 'wb') as mlp_twolayer_out:
    pickle.dump(cv_twolayer.best_estimator_, mlp_twolayer_out)

# Two hidden layers: Evaluation

In [None]:
# Pickles are binary: open with 'rb' (text mode fails on Python 3) and let the
# context manager close the handle.
# Only load pickle files written by this notebook; unpickling untrusted data
# can execute arbitrary code.
with open('models/multilayer_perceptron_twolayer.pickle', 'rb') as mlp_twolayer_in:
    mlp_twolayer = pickle.load(mlp_twolayer_in)

In [None]:
# Learning curves for the two-layer model on the held-out split, scored with
# the same Gini scorer used in the grid search.
trainsizes, traingini, testgini = util.learning_curves(
    mlp_twolayer, X_train, X_test, y_train, y_test,
    scorer=util.gini_proba_scorer)

# Print train scores, a blank separator line, then test scores.
for line in (traingini, '', testgini):
    print(line)

In [None]:
# Train vs. test Gini against training-set size for the two-layer network.
fig, ax = plt.subplots()
ax.plot(trainsizes, traingini, label='train gini')
ax.plot(trainsizes, testgini, label='test gini')
ax.set_xlabel('Training set size')
ax.set_ylabel('Normalized Gini coefficient')
ax.set_title('Multi-layer perceptron, two hidden layers')
ax.legend()
# Write the figure to disk as EPS.
fig.savefig('figures/learning_curves/mlp_twolayer.eps', format='eps', dpi=1000)

# Two hidden layers: Output

In [None]:
# Refit the two-layer model on every labelled row, then report its Gini on
# that same data (an in-sample score, not a held-out estimate).
mlp_twolayer.fit(X, Y)
twolayer_train_gini = util.gini_proba_scorer(mlp_twolayer, X, Y)
print(twolayer_train_gini)

In [None]:
# Write two-layer predictions for the test file. The fitted `scaler` is passed
# along with the prediction method that util.proba_method returns for this model.
testfile = 'data/test.csv'
twolayer_predict = util.proba_method(mlp_twolayer)
util.make_prediction(
    mlp_twolayer,
    testfile,
    'predictions/mlp_twolayer.csv',
    scaler=scaler,
    predict_method=twolayer_predict,
)