In [1]:
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neural_network import MLPClassifier
import pandas as pd

In [2]:
# Load the digits dataset as a feature matrix X and a label vector y.
X, y = load_digits(return_X_y=True)
# Hold out a test split. Pinning random_state makes the split, and therefore
# every score computed later in the notebook, reproducible after a kernel
# restart (without it each run shuffles the data differently).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [3]:
# Baseline multi-layer perceptron with every hyperparameter left at its default;
# this is the estimator later handed to GridSearchCV.
model = MLPClassifier()

- si distingue nel ML tra parametri di un modello e iperparametri
 - i parametri vengono trovati nell'apprendimento (es. pesi)
 - gli iperparametri sono decisioni che prendiamo a monte (es. iterazioni, neuroni, profondità albero)
- la GRID SEARCH serve a trovare i valori ottimali degli iperparametri, in modo sistematico senza fare esperimenti a casaccio

In [4]:
# How do we inspect a model's hyperparameters?
# (they largely coincide with the options passed to the constructor)
model.get_params()

{'activation': 'relu',
 'alpha': 0.0001,
 'batch_size': 'auto',
 'beta_1': 0.9,
 'beta_2': 0.999,
 'early_stopping': False,
 'epsilon': 1e-08,
 'hidden_layer_sizes': (100,),
 'learning_rate': 'constant',
 'learning_rate_init': 0.001,
 'max_fun': 15000,
 'max_iter': 200,
 'momentum': 0.9,
 'n_iter_no_change': 10,
 'nesterovs_momentum': True,
 'power_t': 0.5,
 'random_state': None,
 'shuffle': True,
 'solver': 'adam',
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': False,
 'warm_start': False}

In [26]:
# Candidate values for each hyperparameter; the grid search evaluates every
# combination (3 architectures x 3 learning rates x 2 alphas = 18 candidates).
params_space = dict(
    hidden_layer_sizes=[(20,), (10, 10), (10, 5, 5)],
    learning_rate_init=[0.001, 0.01, 0.1],
    alpha=[0.05, 0.1],
)

# Teaching progression for the options that are left disabled here:
#   1. omit `refit`  -> the default (True) already refits the best candidate
#                       on the whole training set once the search is over;
#   2. refit=True    -> same behaviour, spelled out explicitly;
#   3. scoring=['accuracy', 'precision_macro'] together with refit='accuracy'
#                    -> several metrics are recorded and the named one selects
#                       the winner (with more than one scorer, refit must name
#                       the metric to use).
grid = GridSearchCV(
    estimator=model,
    param_grid=params_space,
    cv=2,  # number of folds used for cross-validation
    verbose=2,  # log every candidate/fold fit
)

In [19]:
# Run the exhaustive search: every candidate is cross-validated using the training split only.
grid.fit(X_train, y_train)

Fitting 2 folds for each of 18 candidates, totalling 36 fits
[CV] alpha=0.05, hidden_layer_sizes=(20,), learning_rate_init=0.001 ..


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.6s remaining:    0.0s


[CV]  alpha=0.05, hidden_layer_sizes=(20,), learning_rate_init=0.001, total=   0.6s
[CV] alpha=0.05, hidden_layer_sizes=(20,), learning_rate_init=0.001 ..




[CV]  alpha=0.05, hidden_layer_sizes=(20,), learning_rate_init=0.001, total=   0.7s
[CV] alpha=0.05, hidden_layer_sizes=(20,), learning_rate_init=0.01 ...
[CV]  alpha=0.05, hidden_layer_sizes=(20,), learning_rate_init=0.01, total=   0.5s
[CV] alpha=0.05, hidden_layer_sizes=(20,), learning_rate_init=0.01 ...
[CV]  alpha=0.05, hidden_layer_sizes=(20,), learning_rate_init=0.01, total=   0.5s
[CV] alpha=0.05, hidden_layer_sizes=(20,), learning_rate_init=0.1 ....
[CV]  alpha=0.05, hidden_layer_sizes=(20,), learning_rate_init=0.1, total=   0.3s
[CV] alpha=0.05, hidden_layer_sizes=(20,), learning_rate_init=0.1 ....
[CV]  alpha=0.05, hidden_layer_sizes=(20,), learning_rate_init=0.1, total=   0.2s
[CV] alpha=0.05, hidden_layer_sizes=(10, 10), learning_rate_init=0.001 




[CV]  alpha=0.05, hidden_layer_sizes=(10, 10), learning_rate_init=0.001, total=   0.8s
[CV] alpha=0.05, hidden_layer_sizes=(10, 10), learning_rate_init=0.001 




[CV]  alpha=0.05, hidden_layer_sizes=(10, 10), learning_rate_init=0.001, total=   0.8s
[CV] alpha=0.05, hidden_layer_sizes=(10, 10), learning_rate_init=0.01 
[CV]  alpha=0.05, hidden_layer_sizes=(10, 10), learning_rate_init=0.01, total=   0.6s
[CV] alpha=0.05, hidden_layer_sizes=(10, 10), learning_rate_init=0.01 
[CV]  alpha=0.05, hidden_layer_sizes=(10, 10), learning_rate_init=0.01, total=   0.6s
[CV] alpha=0.05, hidden_layer_sizes=(10, 10), learning_rate_init=0.1 .
[CV]  alpha=0.05, hidden_layer_sizes=(10, 10), learning_rate_init=0.1, total=   0.6s
[CV] alpha=0.05, hidden_layer_sizes=(10, 10), learning_rate_init=0.1 .
[CV]  alpha=0.05, hidden_layer_sizes=(10, 10), learning_rate_init=0.1, total=   0.4s
[CV] alpha=0.05, hidden_layer_sizes=(10, 5, 5), learning_rate_init=0.001 




[CV]  alpha=0.05, hidden_layer_sizes=(10, 5, 5), learning_rate_init=0.001, total=   1.0s
[CV] alpha=0.05, hidden_layer_sizes=(10, 5, 5), learning_rate_init=0.001 




[CV]  alpha=0.05, hidden_layer_sizes=(10, 5, 5), learning_rate_init=0.001, total=   0.9s
[CV] alpha=0.05, hidden_layer_sizes=(10, 5, 5), learning_rate_init=0.01 
[CV]  alpha=0.05, hidden_layer_sizes=(10, 5, 5), learning_rate_init=0.01, total=   0.8s
[CV] alpha=0.05, hidden_layer_sizes=(10, 5, 5), learning_rate_init=0.01 




[CV]  alpha=0.05, hidden_layer_sizes=(10, 5, 5), learning_rate_init=0.01, total=   1.0s
[CV] alpha=0.05, hidden_layer_sizes=(10, 5, 5), learning_rate_init=0.1 
[CV]  alpha=0.05, hidden_layer_sizes=(10, 5, 5), learning_rate_init=0.1, total=   0.5s
[CV] alpha=0.05, hidden_layer_sizes=(10, 5, 5), learning_rate_init=0.1 
[CV]  alpha=0.05, hidden_layer_sizes=(10, 5, 5), learning_rate_init=0.1, total=   0.4s
[CV] alpha=0.1, hidden_layer_sizes=(20,), learning_rate_init=0.001 ...




[CV]  alpha=0.1, hidden_layer_sizes=(20,), learning_rate_init=0.001, total=   0.7s
[CV] alpha=0.1, hidden_layer_sizes=(20,), learning_rate_init=0.001 ...




[CV]  alpha=0.1, hidden_layer_sizes=(20,), learning_rate_init=0.001, total=   0.7s
[CV] alpha=0.1, hidden_layer_sizes=(20,), learning_rate_init=0.01 ....
[CV]  alpha=0.1, hidden_layer_sizes=(20,), learning_rate_init=0.01, total=   0.6s
[CV] alpha=0.1, hidden_layer_sizes=(20,), learning_rate_init=0.01 ....
[CV]  alpha=0.1, hidden_layer_sizes=(20,), learning_rate_init=0.01, total=   0.6s
[CV] alpha=0.1, hidden_layer_sizes=(20,), learning_rate_init=0.1 .....
[CV]  alpha=0.1, hidden_layer_sizes=(20,), learning_rate_init=0.1, total=   0.2s
[CV] alpha=0.1, hidden_layer_sizes=(20,), learning_rate_init=0.1 .....
[CV]  alpha=0.1, hidden_layer_sizes=(20,), learning_rate_init=0.1, total=   0.2s
[CV] alpha=0.1, hidden_layer_sizes=(10, 10), learning_rate_init=0.001 




[CV]  alpha=0.1, hidden_layer_sizes=(10, 10), learning_rate_init=0.001, total=   0.8s
[CV] alpha=0.1, hidden_layer_sizes=(10, 10), learning_rate_init=0.001 




[CV]  alpha=0.1, hidden_layer_sizes=(10, 10), learning_rate_init=0.001, total=   0.8s
[CV] alpha=0.1, hidden_layer_sizes=(10, 10), learning_rate_init=0.01 .




[CV]  alpha=0.1, hidden_layer_sizes=(10, 10), learning_rate_init=0.01, total=   0.8s
[CV] alpha=0.1, hidden_layer_sizes=(10, 10), learning_rate_init=0.01 .
[CV]  alpha=0.1, hidden_layer_sizes=(10, 10), learning_rate_init=0.01, total=   0.7s
[CV] alpha=0.1, hidden_layer_sizes=(10, 10), learning_rate_init=0.1 ..
[CV]  alpha=0.1, hidden_layer_sizes=(10, 10), learning_rate_init=0.1, total=   0.1s
[CV] alpha=0.1, hidden_layer_sizes=(10, 10), learning_rate_init=0.1 ..
[CV]  alpha=0.1, hidden_layer_sizes=(10, 10), learning_rate_init=0.1, total=   0.2s
[CV] alpha=0.1, hidden_layer_sizes=(10, 5, 5), learning_rate_init=0.001 




[CV]  alpha=0.1, hidden_layer_sizes=(10, 5, 5), learning_rate_init=0.001, total=   0.9s
[CV] alpha=0.1, hidden_layer_sizes=(10, 5, 5), learning_rate_init=0.001 




[CV]  alpha=0.1, hidden_layer_sizes=(10, 5, 5), learning_rate_init=0.001, total=   0.9s
[CV] alpha=0.1, hidden_layer_sizes=(10, 5, 5), learning_rate_init=0.01 




[CV]  alpha=0.1, hidden_layer_sizes=(10, 5, 5), learning_rate_init=0.01, total=   1.0s
[CV] alpha=0.1, hidden_layer_sizes=(10, 5, 5), learning_rate_init=0.01 
[CV]  alpha=0.1, hidden_layer_sizes=(10, 5, 5), learning_rate_init=0.01, total=   0.6s
[CV] alpha=0.1, hidden_layer_sizes=(10, 5, 5), learning_rate_init=0.1 
[CV]  alpha=0.1, hidden_layer_sizes=(10, 5, 5), learning_rate_init=0.1, total=   0.5s
[CV] alpha=0.1, hidden_layer_sizes=(10, 5, 5), learning_rate_init=0.1 
[CV]  alpha=0.1, hidden_layer_sizes=(10, 5, 5), learning_rate_init=0.1, total=   0.8s


[Parallel(n_jobs=1)]: Done  36 out of  36 | elapsed:   22.0s finished


GridSearchCV(cv=2, estimator=MLPClassifier(),
             param_grid={'alpha': [0.05, 0.1],
                         'hidden_layer_sizes': [(20,), (10, 10), (10, 5, 5)],
                         'learning_rate_init': [0.001, 0.01, 0.1]},
             verbose=2)

In [20]:
# Hyperparameter combination that ranked first in the search.
grid.best_params_

{'alpha': 0.05, 'hidden_layer_sizes': (20,), 'learning_rate_init': 0.01}

In [21]:
# Estimator configured with the winning hyperparameters.
grid.best_estimator_

MLPClassifier(alpha=0.05, hidden_layer_sizes=(20,), learning_rate_init=0.01)

In [22]:
# Best mean cross-validated score (the rank-1 mean_test_score in cv_results_).
grid.best_score_

0.9524803682523446

In [23]:
# Evaluate the winning model on the held-out test split, which the search never saw.
best_model = grid.best_estimator_
# score() predicts on X_test internally and returns the mean accuracy, so a
# separate predict() call whose result is thrown away is not needed here; call
# best_model.predict(X_test) only when the predicted labels themselves are wanted.
best_model.score(X_test, y_test)

0.9444444444444444

In [24]:
# Full hyperparameter set of the winning model: the tuned values plus the untouched defaults.
best_model.get_params()

{'activation': 'relu',
 'alpha': 0.05,
 'batch_size': 'auto',
 'beta_1': 0.9,
 'beta_2': 0.999,
 'early_stopping': False,
 'epsilon': 1e-08,
 'hidden_layer_sizes': (20,),
 'learning_rate': 'constant',
 'learning_rate_init': 0.01,
 'max_fun': 15000,
 'max_iter': 200,
 'momentum': 0.9,
 'n_iter_no_change': 10,
 'nesterovs_momentum': True,
 'power_t': 0.5,
 'random_state': None,
 'shuffle': True,
 'solver': 'adam',
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': False,
 'warm_start': False}

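- ATTENZIONE: durante la ricerca ogni candidato viene addestrato e valutato solo sulle fold della cross validation
- **refit=True** è il default di GridSearchCV: a ricerca conclusa il modello migliore viene riaddestrato in automatico su tutto il training set ed esposto come `best_estimator_` (con `refit=False` l'attributo `best_estimator_` non è disponibile)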

In [25]:
# Tabulate the search log: one row per candidate, with timings, per-fold scores and rank.
df = pd.DataFrame(grid.cv_results_)
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_hidden_layer_sizes,param_learning_rate_init,params,split0_test_score,split1_test_score,mean_test_score,std_test_score,rank_test_score
0,0.655564,0.026068,0.002274,0.000325,0.05,"(20,)",0.001,"{'alpha': 0.05, 'hidden_layer_sizes': (20,), '...",0.931751,0.936107,0.933929,0.002178,3
1,0.485635,0.029545,0.001703,1.6e-05,0.05,"(20,)",0.01,"{'alpha': 0.05, 'hidden_layer_sizes': (20,), '...",0.961424,0.943536,0.95248,0.008944,1
2,0.207257,0.049581,0.001574,2.1e-05,0.05,"(20,)",0.1,"{'alpha': 0.05, 'hidden_layer_sizes': (20,), '...",0.206231,0.197623,0.201927,0.004304,14
3,0.751563,0.004329,0.002503,0.000407,0.05,"(10, 10)",0.001,"{'alpha': 0.05, 'hidden_layer_sizes': (10, 10)...",0.885757,0.881129,0.883443,0.002314,6
4,0.616319,0.020566,0.001794,1.4e-05,0.05,"(10, 10)",0.01,"{'alpha': 0.05, 'hidden_layer_sizes': (10, 10)...",0.816024,0.860327,0.838175,0.022152,9
5,0.413933,0.039281,0.057791,0.056066,0.05,"(10, 10)",0.1,"{'alpha': 0.05, 'hidden_layer_sizes': (10, 10)...",0.307122,0.106984,0.207053,0.100069,13
6,0.93186,0.045915,0.002122,8.8e-05,0.05,"(10, 5, 5)",0.001,"{'alpha': 0.05, 'hidden_layer_sizes': (10, 5, ...",0.609792,0.736999,0.673395,0.063603,12
7,0.894108,0.075525,0.001963,6.8e-05,0.05,"(10, 5, 5)",0.01,"{'alpha': 0.05, 'hidden_layer_sizes': (10, 5, ...",0.480712,0.887073,0.683892,0.20318,11
8,0.465982,0.034944,0.001806,1.4e-05,0.05,"(10, 5, 5)",0.1,"{'alpha': 0.05, 'hidden_layer_sizes': (10, 5, ...",0.106825,0.106984,0.106904,7.9e-05,16
9,0.713997,0.010929,0.001968,0.000188,0.1,"(20,)",0.001,"{'alpha': 0.1, 'hidden_layer_sizes': (20,), 'l...",0.921365,0.943536,0.932451,0.011086,4


In [15]:
# Top candidates by mean cross-validated score, best first (head() keeps the first 5 rows).
df.sort_values(by='mean_test_score', ascending=False).head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_hidden_layer_sizes,param_learning_rate_init,params,split0_test_score,split1_test_score,mean_test_score,std_test_score,rank_test_score
1,0.356334,0.025611,0.001761,5.1e-05,0.05,"(20,)",0.01,"{'alpha': 0.05, 'hidden_layer_sizes': (20,), '...",0.968843,0.955423,0.962133,0.00671,1
10,0.451223,0.011498,0.00181,8.8e-05,0.1,"(20,)",0.01,"{'alpha': 0.1, 'hidden_layer_sizes': (20,), 'l...",0.95549,0.959881,0.957685,0.002196,2
0,0.658903,0.022093,0.00202,0.000182,0.05,"(20,)",0.001,"{'alpha': 0.05, 'hidden_layer_sizes': (20,), '...",0.948071,0.943536,0.945804,0.002267,3
9,0.68947,0.004162,0.001986,2.4e-05,0.1,"(20,)",0.001,"{'alpha': 0.1, 'hidden_layer_sizes': (20,), 'l...",0.924332,0.946508,0.93542,0.011088,4
12,0.795776,0.009247,0.002097,7.3e-05,0.1,"(10, 10)",0.001,"{'alpha': 0.1, 'hidden_layer_sizes': (10, 10),...",0.925816,0.878158,0.901987,0.023829,5


In [16]:
# Summary statistics over the numeric columns of the results table.
df.describe()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,split0_test_score,split1_test_score,mean_test_score,std_test_score,rank_test_score
count,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0
mean,0.560641,0.063787,0.001974,0.0001334614,0.600561,0.613918,0.607239,0.030697,9.5
std,0.234117,0.077961,0.000308,0.0002166169,0.363876,0.377108,0.365929,0.049536,5.338539
min,0.13848,0.003014,0.001564,5.960464e-07,0.102374,0.098068,0.100221,7.9e-05,1.0
25%,0.400904,0.013011,0.001767,1.591444e-05,0.193991,0.106984,0.147516,0.002452,5.25
50%,0.589079,0.028774,0.001934,6.347895e-05,0.81454,0.836553,0.803253,0.009665,9.5
75%,0.770717,0.094986,0.002086,0.0001327693,0.912092,0.890416,0.897724,0.042199,13.75
max,0.907367,0.304476,0.002682,0.0008713007,0.968843,0.959881,0.962133,0.204625,18.0


- si può attivare una metrica custom (vedi scoring al costruttore della Grid)
 - di default è accuracy per i classificatori e r2 per i regressori