# Validation

In [1]:
import numpy as np
np.random.seed(90)

import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn import preprocessing, metrics
from sklearn.model_selection import KFold

%matplotlib inline

In [2]:
from chemml.datasets import load_organic_density
smiles, density, features = load_organic_density()

X = features.values
y = density.values

## Train-Test Split

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)
X_train.shape, X_test.shape

((450, 200), (50, 200))

In [4]:
scaler = preprocessing.StandardScaler()
Xtr = scaler.fit_transform(X_train)
Xte = scaler.transform(X_test)
ytr = scaler.fit_transform(y_train).reshape(-1)

mlp = MLPRegressor(hidden_layer_sizes = (20,10), activation= 'relu', alpha = 1, early_stopping=True)
mlp.fit(Xtr, ytr)

MLPRegressor(activation='relu', alpha=1, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=True, epsilon=1e-08,
       hidden_layer_sizes=(20, 10), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [5]:
# metrics
yte_pred = mlp.predict(Xte)
yte_pred = scaler.inverse_transform(yte_pred)
print ('MAE (kg/m3):', metrics.mean_absolute_error(y_test, yte_pred))
density.describe()

MAE (kg/m3): 18.961619687853226


Unnamed: 0,density_Kg/m3
count,500.0
mean,1268.83838
std,90.310547
min,1005.6
25%,1207.17
50%,1264.46
75%,1327.3525
max,1614.83


## Kfold Cross-validation and model selection
<img src="images/Kfold.png">

https://sebastianraschka.com/blog/2016/model-evaluation-selection-part3.html


## implementation of grid search
Note: make sure you always have an independent test set for your final model. 

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)
X_train.shape, X_test.shape

kf = KFold(n_splits=3, random_state=7)
alpha_grid = [10, 3, 1, .3, .1, .03, .01]
for alpha in alpha_grid:
    cv_results = []
    for train_index, test_index in kf.split(X_train):
        Xtr = X_train[train_index]
        Xte = X_train[test_index]
        ytr = y_train[train_index]
        yte = y_train[test_index]
        
        scaler = preprocessing.StandardScaler()
        Xtr = scaler.fit_transform(Xtr)
        Xte = scaler.transform(Xte)
        ytr = scaler.fit_transform(ytr).reshape(-1)

        mlp = MLPRegressor(hidden_layer_sizes = (20,10), activation= 'relu', alpha = alpha, early_stopping=True, max_iter=500)
        mlp.fit(Xtr, ytr)
        ytr_pred = mlp.predict(Xtr)
        ytr_pred = scaler.inverse_transform(ytr_pred)
        ytr = scaler.inverse_transform(ytr)
        yte_pred = mlp.predict(Xte)
        yte_pred = scaler.inverse_transform(yte_pred)
        l = []
        l.append(metrics.mean_absolute_error(ytr, ytr_pred))
        l.append(metrics.mean_absolute_error(yte, yte_pred))
        cv_results.append(l)
    print ('alpha:', alpha, ' ===> (MAE train, MAE test):', np.mean(np.array(cv_results), axis = 0))



alpha: 10  ===> (MAE train, MAE test): [14.17736864 17.29318111]
alpha: 3  ===> (MAE train, MAE test): [18.57442272 22.53750452]
alpha: 1  ===> (MAE train, MAE test): [ 8.16424376 15.27596448]
alpha: 0.3  ===> (MAE train, MAE test): [17.43580967 23.08626169]
alpha: 0.1  ===> (MAE train, MAE test): [12.01743333 19.22461546]
alpha: 0.03  ===> (MAE train, MAE test): [13.70854506 20.72719069]
alpha: 0.01  ===> (MAE train, MAE test): [15.29214149 20.99748323]


## scikit-learn: cross validated grid search 

In [7]:
from sklearn.model_selection import GridSearchCV

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)
scaler = preprocessing.StandardScaler()
Xtr = scaler.fit_transform(X_train)
ytr = scaler.fit_transform(y_train).reshape(-1)



alpha_grid = [10, 3, 1, .3, .1, .03, .01]
mlp = MLPRegressor(hidden_layer_sizes = (20,10), activation= 'relu', early_stopping=True, max_iter=500)
scorer = metrics.make_scorer(score_func = metrics.mean_absolute_error,
                            greater_is_better = False)
kf = KFold(n_splits=3, random_state=7)

gs = GridSearchCV(estimator = mlp, 
                  param_grid = {'alpha':alpha_grid},
                  scoring = scorer,
                  return_train_score=True,
                  cv = kf)
gs.fit(Xtr, ytr)
pd.DataFrame(gs.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,0.213708,0.094247,0.00032,2.4e-05,10.0,{'alpha': 10},-0.205322,-0.185248,-0.265235,-0.218602,0.033978,6,-0.156637,-0.159079,-0.201874,-0.17253,0.020773
1,0.208094,0.06497,0.000395,0.000126,3.0,{'alpha': 3},-0.190365,-0.164742,-0.193118,-0.182742,0.012777,1,-0.141007,-0.117927,-0.125593,-0.128176,0.009598
2,0.156837,0.027488,0.000321,2.9e-05,1.0,{'alpha': 1},-0.184723,-0.174284,-0.228579,-0.195862,0.023524,2,-0.109041,-0.119802,-0.130641,-0.119828,0.008818
3,0.129603,0.054193,0.00032,8e-06,0.3,{'alpha': 0.3},-0.224636,-0.159303,-0.23729,-0.207076,0.034174,4,-0.184037,-0.096916,-0.113903,-0.131619,0.037709
4,0.21177,0.074176,0.000403,0.000132,0.1,{'alpha': 0.1},-0.18825,-0.206289,-0.254589,-0.216376,0.028007,5,-0.087378,-0.098079,-0.144734,-0.110064,0.024902
5,0.182829,0.030477,0.000304,3e-06,0.03,{'alpha': 0.03},-0.213768,-0.204696,-0.199155,-0.205873,0.006024,3,-0.126113,-0.112465,-0.109116,-0.115898,0.007351
6,0.110698,0.045026,0.000303,6e-06,0.01,{'alpha': 0.01},-0.217031,-0.241508,-0.206548,-0.221696,0.014649,7,-0.118992,-0.216457,-0.11232,-0.149256,0.047596


## scikit-learn: cross validated randomized search 

In [8]:
from sklearn.model_selection import RandomizedSearchCV

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)
scaler = preprocessing.StandardScaler()
Xtr = scaler.fit_transform(X_train)
ytr = scaler.fit_transform(y_train).reshape(-1)


alpha_dist = np.random.normal(2,1,100)
mlp = MLPRegressor(hidden_layer_sizes = (20,10), activation= 'relu', early_stopping=True, max_iter=500)
scorer = metrics.make_scorer(score_func = metrics.mean_absolute_error,
                            greater_is_better = False)
kf = KFold(n_splits=3, random_state=7)

rs = RandomizedSearchCV(estimator = mlp, 
                          param_distributions = {'alpha':alpha_dist},
                          n_iter= 10,
                          return_train_score=True,
                          scoring = scorer,
                          cv = kf)
rs.fit(Xtr, ytr)
pd.DataFrame(rs.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,0.136138,0.049947,0.000309,2e-06,1.12395,{'alpha': 1.1239548412039424},-0.155818,-0.161117,-0.248893,-0.188609,0.042682,3,-0.106183,-0.103974,-0.202041,-0.137399,0.045718
1,0.238516,0.064086,0.000307,7e-06,1.38893,{'alpha': 1.3889347609605385},-0.174202,-0.149629,-0.166693,-0.163508,0.010282,1,-0.110219,-0.092155,-0.094481,-0.098952,0.008024
2,0.206273,0.122217,0.000375,2.2e-05,1.19359,{'alpha': 1.19359232170804},-0.181412,-0.186981,-0.244756,-0.204383,0.028638,6,-0.098933,-0.130793,-0.180543,-0.136756,0.033583
3,0.175457,0.076346,0.00033,2.1e-05,2.56737,{'alpha': 2.567370785497177},-0.193538,-0.153554,-0.219177,-0.188756,0.027003,4,-0.121059,-0.108779,-0.138213,-0.122684,0.012071
4,0.154707,0.036532,0.000371,6e-05,3.00063,{'alpha': 3.0006266499121046},-0.250335,-0.147231,-0.216142,-0.20457,0.04288,7,-0.195236,-0.107401,-0.123518,-0.142051,0.038178
5,0.162723,0.091528,0.000355,7.5e-05,0.130372,{'alpha': 0.13037169308180374},-0.292615,-0.189775,-0.212139,-0.23151,0.044162,10,-0.209221,-0.087171,-0.099411,-0.131934,0.054878
6,0.130724,0.034481,0.000309,8e-06,1.301,{'alpha': 1.3009950328986095},-0.202129,-0.168227,-0.257202,-0.209186,0.036665,8,-0.140968,-0.119711,-0.195768,-0.152149,0.032041
7,0.272766,0.133756,0.000402,9e-05,1.91151,{'alpha': 1.9115101512279595},-0.166796,-0.226049,-0.177833,-0.190226,0.025729,5,-0.098979,-0.192665,-0.095412,-0.129019,0.045028
8,0.238461,0.022403,0.000343,1.9e-05,1.33296,{'alpha': 1.3329561353436719},-0.177581,-0.143274,-0.183687,-0.16818,0.017787,2,-0.106701,-0.112089,-0.101589,-0.106793,0.004287
9,0.119021,0.025597,0.000466,5e-05,1.78854,{'alpha': 1.788538621284949},-0.263973,-0.187728,-0.206425,-0.219375,0.032446,9,-0.190863,-0.153055,-0.113283,-0.152401,0.031675
