In [None]:
# Uncomment to upgrade packages
# !pip install pandas --upgrade --user --quiet
# !pip install numpy --upgrade --user --quiet
# !pip install scipy --upgrade --user --quiet
# !pip install statsmodels --upgrade --user --quiet
# !pip install scikit-learn --upgrade --user --quiet
# !pip install graphviz --upgrade --user --quiet
%load_ext autoreload

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
#%matplotlib notebook
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
import pandas as pd
from IPython.core.interactiveshell import InteractiveShell
# Show three decimals in DataFrame output. The fully qualified key is used
# because the bare pattern 'precision' can match more than one pandas option
# (e.g. 'styler.format.precision'), in which case set_option raises OptionError.
pd.set_option('display.precision', 3)
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# Extra imports
from sklearn.metrics import confusion_matrix,\
                classification_report, accuracy_score
from pandas import read_csv
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from graphviz import Digraph
from sklearn.model_selection import GridSearchCV
from numpy.random import normal
from numpy.random import uniform

from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA

from time import time
from datetime import timedelta

from sklearn.metrics import precision_score, classification_report, recall_score

from sklearn.datasets import fetch_lfw_people


In [None]:
def confusion(true, pred, classes):
    """
    Return the confusion matrix of `pred` against `true` as a labelled DataFrame.

    `classes` provides the labels used for both the rows (named 'Actual')
    and the columns (named 'Predicted').
    """
    counts = confusion_matrix(true, pred)
    table = pd.DataFrame(counts, index=classes, columns=classes)
    return table.rename_axis(index='Actual', columns='Predicted')

def graphMLP(vars,layers,intercepts):
    """
    Draw the weights and intercepts of an MLP as a left-to-right graphviz graph.

    vars:       names of the input variables; they become the source nodes
                of the first weight matrix.
    layers:     sequence of weight matrices, indexed as [source, target].
    intercepts: sequence of intercept vectors, one per weight matrix.

    Neuron j fed by weight matrix i is named 'L<i>N<j>'. The intercept node
    feeding the first matrix is 'ILI'; for matrix i > 0 it is 'L<i-1>I'.
    Every edge is labelled with the corresponding weight or intercept value.
    """
    graph = Digraph('')
    graph.attr(rankdir='LR')
    for depth, weights in enumerate(layers):
        if depth == 0:
            # The first matrix is fed directly by the named input variables.
            sources = vars
            bias_node = 'ILI'
        else:
            # Deeper matrices are fed by the neurons of the previous matrix.
            previous_width = layers[depth - 1].shape[1]
            sources = ['L%dN%d'%(depth - 1, k) for k in range(previous_width)]
            bias_node = 'L%dI'%(depth - 1)

        # Weight edges: one per (source, target) pair.
        for j in range(weights.shape[1]):
            target = 'L%dN%d'%(depth, j)
            for k, source in enumerate(sources):
                graph.edge(source, target, label=str(weights[k, j]))

        # Intercept node (double circle) and its edges into every target neuron.
        graph.node(bias_node, shape='doublecircle')
        for k in range(intercepts[depth].shape[0]):
            graph.edge(bias_node,
                       'L%dN%d'%(depth, k),
                       label=str(intercepts[depth][k]))

    return graph


In [None]:
np.random.seed(4567)

# Admissions

We are going to use the admissions dataset for our examples on this lab. 

This dataset contains the following variables: 
* GRE (Graduate Record Exam scores)
* GPA (Grade Point Average) and
* rank (prestige of the undergraduate institution)

These variables should affect admission into a graduate school.


 The target variable, admit/don't admit, is a binary variable, which we want to characterize
and, if possible, predict with a model.

In [None]:
Admis = read_csv("Admissions.csv", delimiter=',')
Admis.head()

We will treat the variables gre and gpa as continuous.

The variable rank takes on the values 1 through 4, so we can fairly treat it as numerical (although, in rigour, it is ordinal)

In [None]:
Admis.describe()

N = Admis.shape[0]

We first split the available data into learning and test sets, randomly selecting 2/3 and 1/3 of the data respectively. We do this to obtain an honest estimate of prediction performance.

## Resampling protocol

This time we will use train and test partitions, and we will use the cross-validation score on the training partition to compare the models. 

In [None]:
np.random.seed(63)
X_train, X_test, y_train, y_test =\
        train_test_split(Admis[['gre', 'gpa', 'rank']], Admis.admit, test_size=0.33,random_state=42)

## Pre-processing

We will use a simple pre-processing. Just scaling. 

In [None]:
# Fit the scaler on the training split only and reuse its statistics for the
# test split. Fitting a second scaler on X_test would standardise the test
# data with its own mean/std, so the two splits would not share a scale and
# test-set information would enter the pre-processing.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
results_df = pd.DataFrame(index=['Logistic Regression'],columns=['Accuracy (cv)', 'Mean recall (cv)'])

## Baseline: Logistic Regression

We are going to use the logistic regression as a baseline, and we will try to improve the results given by the baseline with the other models.

In [None]:
# Baseline: logistic regression with default settings, fitted on the whole
# training split (this fitted model is used for the confusion matrix here and
# for inspecting the coefficients in the next cell).
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
# 5-fold cross-validation on the training split; no `scoring` is given and the
# mean is stored below under 'Accuracy (cv)'.
scores = cross_val_score(logreg, X_train, y_train, cv=5)
# Predictions on the same data the model was fitted on.
y_pred_lr = logreg.predict(X_train)

acc = np.mean(scores)
acc
confusion(y_train,y_pred_lr, ['noadmit','admit'])
# Mean over the 5 folds of the scoring='recall' metric.
recalls = np.mean(cross_val_score(logreg, X_train, y_train, cv=5,scoring='recall'))

# Record the baseline row in the comparison table.
results_df.loc['Logistic Regression', :] = [acc, recalls]

With just the baseline we already obtain an accuracy of about 69%. 

In [None]:
logreg.coef_
logreg.intercept_

## K-Nearest Neighbors

This model compares the sample to classify to the samples in the train set using a metric. This model has a very fast training phase, but might have a slower inference depending on the metric used or training set size. 

One remark of this model is that you can even implement your own metric adapted to your data. This can be super useful when you are working with strange shaped data. 

sklearn has already implemented the next metrics:

| identifier    | class name          | args    | distance function            |
|---------------|---------------------|---------|------------------------------|
| “euclidean”   | EuclideanDistance   |         | sqrt(sum((x - y)^2))         |
| “manhattan”   | ManhattanDistance   |         | sum(\|x - y\|)               |
| “chebyshev”   | ChebyshevDistance   |         | max(\|x - y\|)               |
| “minkowski”   | MinkowskiDistance   | p       | sum(\|x - y\|^p)^(1/p)       |
| “wminkowski”  | WMinkowskiDistance  | p, w    | sum(\|w * (x - y)\|^p)^(1/p) |
| “seuclidean”  | SEuclideanDistance  | V       | sqrt(sum((x - y)^2 / V))     |
| “mahalanobis” | MahalanobisDistance | V or VI | sqrt((x - y)' V^-1 (x - y))  |

In [None]:
# 5-nearest-neighbours classifier using the Manhattan distance.
myknn = KNeighborsClassifier(n_neighbors=5, metric='manhattan')
myknn.fit(X_train, y_train)
# 5-fold cross-validation on the training split: once with the default scorer
# and once with scoring='recall'.
scores = cross_val_score(myknn, X_train, y_train, cv=5)
scores_recall = cross_val_score(myknn, X_train, y_train, cv=5,scoring='recall')

# Predictions on the same data the model was fitted on; `y_pred` is reused by
# the recall / classification-report cells further down.
y_pred = myknn.predict(X_train)

confusion(y_train,y_pred, ['noadmit','admit'])

# Average the per-fold scores.
acc=np.mean(scores)
recalls = np.mean(scores_recall)

# Add the KNN row to the comparison table.
results_df.loc['KNN', :] = [acc, recalls]

print('Accuracy:{}\nRecalls:{}'.format(acc,recalls))

According to the accuracy, logistic regression looks like the better model, but it is misclassifying many of the samples of the minority class.

Let's check what would happen if we had a model that only ever predicted the majority class:

In [None]:
# Accuracy of a constant model that always predicts the most frequent class.
# Computed from the class frequencies instead of assuming class 0 is the majority.
y_train.value_counts(normalize=True).max()

We would obtain 68% accuracy with a totally useless model. How can we compare the two models with that much imbalance? 

In this case we will also use the mean recall, i.e. the mean of the per-class recalls: the fraction of truly admitted applicants that are predicted as admitted, and the fraction of truly not-admitted applicants that are predicted as not admitted.  

Calculated by hand, the recall of the admitted class would be:

In [None]:
33/(33+52) # True admited / Real number of admited

We can also use sklearn implementation:

In [None]:
# Recall of LR 
recall_score(y_train, y_pred_lr,average=None)

In [None]:
# Recall of KNN
recall_score(y_train, y_pred,average=None)

From this measure you can obtain the 
* macro average: The mean of the recalls for each class.
* micro average: The global metric without taking into account the classes.

In [None]:
confusion(y_train,y_pred, ['noadmit','admit'])

In [None]:
recall_score(y_train, y_pred,average='macro'), np.mean(recall_score(y_train, y_pred,average=None))

In [None]:
recall_score(y_train, y_pred,average='micro'), (33+167)/(33+52+16+167)

Now we have another metric we can use to judge how good our model is. If the model classified all the samples as not admitted, it would have a recall of 0 for the admitted class, which would discourage us from choosing it as our model.

There are other metrics we can use to evaluate how well is working our model. We will talk more about them on further labs.

In [None]:
print(classification_report(y_train, y_pred,target_names=['noadmit','admit']))

Now lets really compare our two models:

In [None]:
results_df

Here we see that, even though KNN has lower accuracy, it has better recall than LR.

## Multi Layered Perceptron (Yay! Neural Networks!)

MLP is almost the simplest of the Neural Network models. This model is based on combining linearly the input of the previous layer and applying an activation function to the result over different layers. 

When it has more than one hidden layer it is called a __Deep__ Neural Network. These hidden layers give the model complexity and expressivity, but also add more weights that you will need to train.

The most important parameters of the MLP are: 
* Architecture (number of layers and number of neurons by layer). 
* Activation function. 
* alpha (regularization term).
* Solver ‘lbfgs’, ‘sgd’, ‘adam’. 

As you can see, this model has a lot of hyperparameters.

We are going to start with a 1 neuron dummy MLP with logistic activation function and no regularization.

In [None]:
# Minimal MLP: one hidden layer with a single logistic neuron, no
# regularisation (alpha=0), trained with the lbfgs solver and a fixed seed.
model_nnet = MLPClassifier(hidden_layer_sizes=[1],
                           alpha=0,
                           activation='logistic',
                           max_iter=200,
                           solver='lbfgs',random_state=42)
model_nnet.fit(X_train,y_train);

# 5-fold cross-validation on the training split: default scorer and recall.
scores = cross_val_score(model_nnet, X_train, y_train, cv=5)
scores_recall = cross_val_score(model_nnet, X_train, y_train, cv=5,scoring='recall')

# Predictions on the same data the model was fitted on.
y_pred = model_nnet.predict(X_train)


confusion(y_train,y_pred, ['noadmit','admit'])

# Average the per-fold scores and add the row to the comparison table.
acc=np.mean(scores)
recalls = np.mean(scores_recall)
print('Accuracy:{}\nRecalls:{}'.format(acc,recalls))
results_df.loc['MLP[1]', :] = [acc, recalls]

In [None]:
results_df

We can see that this dummy MLP is already beating our other models in admit recall. Let's see what it looks like on the inside.

Here we can see this mlp weights.

In [None]:
model_nnet.coefs_
model_nnet.intercepts_

We can also look at it as a graph. Isn't this structure familiar?

In [None]:
graphMLP(Admis.columns[1:], model_nnet.coefs_, model_nnet.intercepts_)

On this graph gre, gpa, rank and ILI represent the input layer, which is fed with our training samples. 
L0N0 is our hidden neuron, L0I the extra intercept, and L1N0 our output neuron. 

Now that we know what we are doing, lets make a bigger network.

In [None]:
model_nnet = MLPClassifier(hidden_layer_sizes=[2,2],
                           alpha=0,
                           activation='logistic',
                           max_iter=200,
                           solver='lbfgs',random_state=42)
model_nnet.fit(X_train,y_train);

scores = cross_val_score(model_nnet, X_train, y_train, cv=5)
scores_recall = cross_val_score(model_nnet, X_train, y_train, cv=5,scoring='recall')

y_pred = model_nnet.predict(X_train)


confusion(y_train,y_pred, ['noadmit','admit'])

acc=np.mean(scores)
recalls = np.mean(scores_recall)
print('Accuracy:{}\nRecalls:{}'.format(acc,recalls))
results_df.loc['MLP[2,2]', :] = [acc, recalls]

In [None]:
results_df

We have a winner!

Also, our model complexity is increasing, which might increase the training time. 

In [None]:
graphMLP(Admis.columns[1:], model_nnet.coefs_, model_nnet.intercepts_)

As you can see, some weights are large (two orders of magnitude larger than others). This is not good, since it makes the model unstable (i.e., small changes in some inputs may entail significant changes in the network output, because of the large weights).

One way to avoid this is by regularizing the learning process:

In [None]:
model_nnet = MLPClassifier(hidden_layer_sizes=[2,2],
                           alpha=0.001,
                           activation='logistic',
                           max_iter=200,
                           solver='lbfgs',random_state=42)
model_nnet.fit(X_train,y_train);

scores = cross_val_score(model_nnet, X_train, y_train, cv=5)
scores_recall = cross_val_score(model_nnet, X_train, y_train, cv=5,scoring='recall')

y_pred = model_nnet.predict(X_train)


confusion(y_train,y_pred, ['noadmit','admit'])

acc=np.mean(scores)
recalls = np.mean(scores_recall)
print('Accuracy:{}\nRecalls:{}'.format(acc,recalls))
results_df.loc['MLP[2,2]-alpha', :] = [acc, recalls]

We have the master of the models. And also with small weights i.e. robust.

In [None]:
results_df

In [None]:
graphMLP(Admis.columns[1:], model_nnet.coefs_, model_nnet.intercepts_)

But how can we know that this is the best possible model? Why is alpha 0.001 and not 25? Is it better to increase the depth or the number of neurons in this case?

MLP results depend *strongly* on the hyperparameter configuration. Now we are going to see different strategies to obtain the best hyperparameters for our model.

Scikit learn has specific functions for parameter search so we can tune the parameters of a model.

 We are going to use a grid search that will use a cross validation strategy to evaluate the results for each combination of parameters. At the end the best model will be returned

 In order to find the best network architecture, we are going to explore two methods:

1. Explore different numbers of hidden units in one hidden layer, with no regularization
2. Fix a large number of hidden units in one hidden layer, and explore different regularization values (recommended)

Doing both (exploring different numbers of hidden units AND regularization values) is usually a waste of computing 
resources (but notice that the grid search would allow it).

Let's start with 1.

set desired sizes

In [None]:
sizes = [2*i for i in range(1,11)]
sizes = sizes + [[2*i,2*i] for i in range(1,5)]

len(sizes), sizes

For example, sizes = [2, [2,2]] -> 2 models -> we train each one and compute its CV metrics -> we compare them. 

In [None]:
# Time the whole search.
init_time=time()
# Base estimator: hidden_layer_sizes is left unset here because the grid
# below supplies it; regularisation is switched off (alpha=0).
model_nnet = MLPClassifier(alpha=0,
                           activation='logistic',
                           max_iter=500,
                           solver='lbfgs',
                           random_state=42)

# 10-fold grid search over the architectures in `sizes`. Both accuracy and
# recall are recorded; refit='recall' means the best configuration is chosen
# by recall and refitted on the full training split.
trc = GridSearchCV(estimator=model_nnet,
                   scoring=['accuracy', 'recall'],
                   param_grid={'hidden_layer_sizes': sizes},
                   cv=10,
                   return_train_score=True,
                   refit='recall')

model_10CV = trc.fit(X_train, y_train)
# Elapsed wall-clock time of the search.
print(timedelta(seconds=(time()-init_time)))

In [None]:
model_10CV.best_params_ , model_10CV.best_score_

Now we can use the results of the CV to decide which is the best parameter configuration.

According to the criterion of having the greatest recall, we would choose the layer configuration [4]. But if we were using the accuracy we would choose [2,2].

In [None]:
pd.DataFrame(model_10CV.cv_results_).loc[:,['param_hidden_layer_sizes', 'mean_test_accuracy','std_test_accuracy',
                                        'mean_test_recall','std_test_recall']]

Now the decays

In [None]:
decays = [10**i for i in np.arange(-5,0,0.1)]
print(decays[:10]) 

len(decays)

In [None]:
# Time the whole search.
init_time = time()
# Base estimator with the architecture fixed to one hidden layer of 4
# neurons; alpha is left unset here because the grid below supplies it.
model_nnet = MLPClassifier(activation='logistic',
                           hidden_layer_sizes=[4],
                           max_iter=500,
                           solver='lbfgs',
                           random_state=42)

# 10-fold grid search over the regularisation values in `decays`; the best
# configuration is selected by recall (refit='recall').
trc = GridSearchCV(estimator=model_nnet,
                   scoring=['accuracy', 'recall'],
                   param_grid={'alpha': decays},
                   cv=10,
                   return_train_score=True,
                   refit='recall')

# Note: this rebinds model_10CV, replacing the result of the size search.
model_10CV = trc.fit(X_train, y_train)
print(timedelta(seconds=(time() - init_time)))

In [None]:
pd.DataFrame(model_10CV.cv_results_).loc[:,['param_alpha', 'mean_test_accuracy','std_test_accuracy',
                                        'mean_test_recall','std_test_recall']]

In [None]:
model_10CV.best_params_

In [None]:
model_10CV.best_score_

So,according to our experiments the best parameter configuration would be: 
* Architecture [4]
* Alpha 1.9952623149688746e-05

Lets calculate our cross-validation scores of the best model found trained (model_10CV.best_estimator_).

In [None]:
# Re-evaluate the selected configuration with the same 5-fold protocol used
# for the other rows of results_df, so the numbers are comparable.
scores = cross_val_score(model_10CV.best_estimator_, X_train, y_train, cv=5)
scores_recall = cross_val_score(model_10CV.best_estimator_, X_train, y_train, cv=5,scoring='recall')

# Predictions of the refitted search object on the training split.
y_pred = model_10CV.predict(X_train)
confusion(y_train,y_pred, ['noadmit','admit'])

# Average the per-fold scores and add the row to the comparison table.
acc=np.mean(scores)
recalls = np.mean(scores_recall)
print('Accuracy:{}\nRecalls:{}'.format(acc,recalls))
results_df.loc['MLP[4]-reg', :] = [acc, recalls]

In [None]:
results_df

Lets see our final results on the test set of the best model:

In [None]:
# Final evaluation of the selected model on the held-out test split.
y_pred = model_10CV.predict(X_test)

acc=accuracy_score(y_test,y_pred)

confusion(y_test,y_pred, ['noadmit','admit'])
# average=None returns one recall value per class.
recalls = recall_score(y_test,y_pred,average=None)
print('Accuracy:{}\nRecalls:{}'.format(acc,recalls))

# Recall with recall_score's default averaging (no `average` argument).
recall_score(y_test,y_pred)

Finally we obtain our test metrics, which tell us that we did a good job selecting our model, as it is generalizing quite well.

# Labeled Faces in the Wild

Now we are going to try the same with a more complex dataset. 

This dataset contains black-and-white images of public personalities. The task is to classify each image with the proper name. 

Disclaimer: Even though this is an image dataset we will try it as a full numerical data. It is complex enough for showing MLP with real data and *someone* likes image data.

In [None]:
# Load the faces dataset, keeping only people with at least 70 images and
# resizing each image by a factor of 0.4.
lfw_people = fetch_lfw_people(min_faces_per_person=70, resize=0.4)

# Feature matrix and integer targets, plus the label names.
X, y = lfw_people.data, lfw_people.target
target_names = lfw_people.target_names

# Dataset dimensions: images are (n_samples, h, w); X has one column per feature.
n_samples, h, w = lfw_people.images.shape
n_features = X.shape[1]
n_classes = target_names.shape[0]

print("Total dataset size:")
print("n_samples: {}".format(n_samples))
print("n_features: {}".format(n_features))
print("n_classes: {}".format(n_classes))

## Resampling and pre-processing

We will use the same resampling and pre-processing as in the previous example.

In [None]:
# Hold out 25% of the samples as the test split (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42)

# Fit the scaler on the training split only and apply the same transformation
# to the test split. Fitting a separate scaler on X_test would standardise it
# with its own statistics, putting the two splits on different scales and
# letting test-set information into the pre-processing.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

We can see that this dataset is quite big and it has more features than samples. That might be a problem.

In [None]:
X_train.shape

Also, we can see that this dataset is quite unbalanced. We could get almost 40% accuracy with just a constant model.

In [None]:
# Class frequencies in the training split. The vector is passed as `x=`
# explicitly: recent seaborn versions no longer accept it positionally as the
# variable to count.
sn.countplot(x=y_train)

In [None]:
# Accuracy of a constant model that always predicts the most frequent class.
# Uses the largest class count instead of hard-coding the majority label.
np.bincount(y_train).max()/len(y_train)

But now we have too many classes to just look at the recall of a single one. 
We will handle this problem by taking into account the mean recall, as well as the accuracy.

Also, we will check how much *Time* is taking our models to train.

In [None]:
results_df = pd.DataFrame(index=['KNN'],columns=['Accuracy', 'Recall (mean)', 'Time(s)'])
results_df

## Baseline: KNN

We will use KNN as a baseline, because it is a very fast model to train.

In [None]:
# Time only the fit on the full training split.
init_time = time()
myknn = KNeighborsClassifier(n_neighbors=5)
myknn.fit(X_train, y_train)

training_time = time()-init_time
print(timedelta(seconds=training_time))

# 5-fold cross-validation on the training split: default scorer and
# macro-averaged recall (the problem is multi-class here).
scores = cross_val_score(myknn, X_train, y_train, cv=5)
scores_recall = cross_val_score(myknn, X_train, y_train, cv=5,scoring='recall_macro')

# Predictions on the same data the model was fitted on.
y_pred = myknn.predict(X_train)

acc=np.mean(scores)
recalls = np.mean(scores_recall)


confusion(y_train,y_pred,target_names )
print('Accuracy:{}\nRecalls:{}'.format(acc,recalls))
# Store accuracy, mean recall and fit time for the baseline.
results_df.loc['KNN',:] = [acc,recalls,training_time]

In [None]:
results_df

We obtain quite good results with this model. Lets see with the MLP. 

## MLP

In [None]:
init_time = time()
model_nnet = MLPClassifier(hidden_layer_sizes=[5],
                           alpha=0.01,
                           activation='logistic',
                           max_iter=200,
                           solver='lbfgs',random_state=42)
model_nnet.fit(X_train,y_train);

training_time = time()-init_time
print(timedelta(seconds=training_time))

scores = cross_val_score(model_nnet, X_train, y_train, cv=5)
scores_recall = cross_val_score(model_nnet, X_train, y_train, cv=5,scoring='recall_macro')

y_pred = model_nnet.predict(X_train)

acc=np.mean(scores)
recalls = np.mean(scores_recall)


confusion(y_train,y_pred,target_names )
print('Accuracy:{}\nRecalls:{}'.format(acc,recalls))
results_df.loc['MLP[5]-0.01',:] = [acc,recalls,training_time]

It gives better results, but it is several times slower than KNN. 

If we wanted to try different hyperparameters we would need a lot of time. 

In [None]:
results_df

## PCA

Lets try another strategy instead. We can use PCA to reduce the dimensionality of the data. This way the training of the model would be faster, and might reduce the noise in the data.


First lets see how many components do we need.

In [None]:
# Full PCA on the training split to inspect the explained-variance spectrum.
pca = PCA().fit(X_train)

# Smallest number of leading components whose cumulative explained variance
# reaches 99%. Counting the entries that are still below 0.99 gives the number
# of components *before* the threshold is crossed, hence the +1.
n_components = (pca.explained_variance_ratio_.cumsum() < 0.99).sum() + 1
n_components

With roughly 340 principal components we can retain about 99% of the variance of the data. That's a huge reduction from the original 1850 features.

In [None]:
pca = PCA(n_components=n_components).fit(X_train)

X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)

### KNN
Let's see how our smaller data affects the KNN results.

In [None]:
init_time = time()
myknn = KNeighborsClassifier(n_neighbors=5)
myknn.fit(X_train_pca, y_train)

training_time = time()-init_time
print(timedelta(seconds=training_time))

scores = cross_val_score(myknn, X_train_pca, y_train, cv=5,scoring='accuracy')
scores_recall = cross_val_score(myknn, X_train_pca, y_train, cv=5,scoring='recall_macro')

y_pred = myknn.predict(X_train_pca)

acc=np.mean(scores)
recalls = np.mean(scores_recall)


confusion(y_train,y_pred,target_names )
print('Accuracy:{}\nRecalls:{}'.format(acc,recalls))
results_df.loc['KNN-PCA',:] = [acc,recalls,training_time]

We are obtaining the same results with a fraction of the time.

In [None]:
results_df

### MLP

In [None]:
init_time = time()
model_nnet = MLPClassifier(hidden_layer_sizes=[5],
                           alpha=0.01,
                           activation='logistic',
                           max_iter=200,
                           solver='lbfgs',random_state=42)
model_nnet.fit(X_train_pca,y_train);

training_time = time()-init_time
print(timedelta(seconds=training_time))

scores = cross_val_score(model_nnet, X_train_pca, y_train, cv=5)
scores_recall = cross_val_score(model_nnet, X_train_pca, y_train, cv=5,scoring='recall_macro')

y_pred = model_nnet.predict(X_train_pca)

acc=np.mean(scores)
recalls = np.mean(scores_recall)


confusion(y_train,y_pred,target_names )
print('Accuracy:{}\nRecalls:{}'.format(acc,recalls))
results_df.loc['MLP[5]-0.01-PCA',:] = [acc,recalls,training_time]

In [None]:
results_df

The MLP has lost a bit on both metrics but is faster. Now we can try to improve its results using cross-validation.

We will use the same technique as before, but now we are testing fewer parameters, as the model takes longer to train. We will also reduce the number of CV partitions to 5, to go faster.

In [None]:
sizes = [2*i for i in range(1,4)]
sizes = sizes + [[2*i,2*i] for i in range(1,4)]
sizes

If we do 5-cv with 6 layer configuration and we spend 1.5 seconds with each training, we *should* spend approximately 45 seconds with the CV process. If all the architectures took the same time to train. 

If we had done it without the pca we would take more than two minutes. 

In [None]:
init_time = time()

model_nnet = MLPClassifier(alpha=0,
                           activation='logistic',
                           hidden_layer_sizes=2,
                           max_iter=500,
                           solver='lbfgs',random_state=42)

trc = GridSearchCV(estimator=model_nnet, 
                   param_grid ={'hidden_layer_sizes':sizes},
                   scoring=['accuracy', 'recall_macro'],
                   cv=5,
                   return_train_score=True,
                  refit='recall_macro')
model_5CV = trc.fit(X_train_pca,y_train)
model_5CV.best_score_
model_5CV.best_params_
print(timedelta(seconds=(time()-init_time)))

It took almost 1:21 minutes. 

The best architecture found is [6]

In [None]:
pd.DataFrame(model_5CV.cv_results_).loc[:,['mean_fit_time','param_hidden_layer_sizes', 'mean_test_accuracy','std_test_accuracy',
                                        'mean_test_recall_macro','std_test_recall_macro']]

We can see that each training took more time than the figure we used for our estimate. This might be because of the number of neurons in the architectures. 

Now lets see the decays.

In [None]:
decays = [0, 0.1]
decays

In [None]:
init_time = time()

model_nnet = MLPClassifier(alpha=0,
                           activation='logistic',
                           hidden_layer_sizes=[6],
                           max_iter=500,
                           solver='lbfgs',random_state=42)

trc = GridSearchCV(estimator=model_nnet, 
                   param_grid ={'alpha':decays},
                   scoring=['accuracy', 'recall_macro'],
                   cv=5,
                   return_train_score=True,
                  refit='recall_macro')
model_5CV = trc.fit(X_train_pca,y_train)
model_5CV.best_score_
model_5CV.best_params_
print(timedelta(seconds=(time()-init_time)))

In [None]:
pd.DataFrame(model_5CV.cv_results_).loc[:,['mean_fit_time','param_alpha', 'mean_test_accuracy','std_test_accuracy',
                                        'mean_test_recall_macro','std_test_recall_macro']]

Now that we have found our best MLP (among these configurations), let's check it.

In [None]:
# Refit the configuration chosen above ([6] hidden neurons, alpha=0.1) on the
# PCA-reduced training split, timing only the fit.
# NOTE(review): the grid searches above used max_iter=500 while this refit
# uses max_iter=200 (the value used for the other MLP rows in results_df) —
# confirm which iteration budget is intended for the final model.
init_time = time()
model_nnet = MLPClassifier(hidden_layer_sizes=[6],
                           alpha=0.1,
                           activation='logistic',
                           max_iter=200,
                           solver='lbfgs',random_state=42)
model_nnet.fit(X_train_pca,y_train);

training_time = time()-init_time
print(timedelta(seconds=training_time))

# 5-fold cross-validation on the training split: default scorer and
# macro-averaged recall.
scores = cross_val_score(model_nnet, X_train_pca, y_train, cv=5)
scores_recall = cross_val_score(model_nnet, X_train_pca, y_train, cv=5,scoring='recall_macro')

# Predictions on the same data the model was fitted on.
y_pred = model_nnet.predict(X_train_pca)

acc=np.mean(scores)
recalls = np.mean(scores_recall)


confusion(y_train,y_pred,target_names )
print('Accuracy:{}\nRecalls:{}'.format(acc,recalls))
results_df.loc['MLP[6]-0.1-PCA',:] = [acc,recalls,training_time]

In [None]:
results_df

Now that we have chosen our best model, we can check whether it generalizes using the test set.

In [None]:
# Final evaluation of the chosen MLP on the held-out, PCA-projected test split.
y_pred = model_nnet.predict(X_test_pca)

acc=accuracy_score(y_test,y_pred)

confusion(y_test,y_pred, target_names)
# average=None returns one recall value per class.
recalls = recall_score(y_test,y_pred,average=None)
print('Accuracy:{}\nRecalls:\n{}'.format(acc,recalls))

We have obtained quite good results, and the model generalizes, so it also predicts accurately on the test set.