# Validation

In [1]:
import numpy as np
np.random.seed(90)

import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn import preprocessing, metrics
from sklearn.model_selection import KFold

%matplotlib inline

In [2]:
from chemml.datasets import load_organic_density
smiles, density, features = load_organic_density()

X = features.values
y = density.values

## Train-Test Split

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)
X_train.shape, X_test.shape

((450, 200), (50, 200))

In [5]:
scaler = preprocessing.StandardScaler()
Xtr = scaler.fit_transform(X_train)
Xte = scaler.transform(X_test)
ytr = scaler.fit_transform(y_train).reshape(-1)

mlp = MLPRegressor(hidden_layer_sizes = (20,10), activation= 'relu', alpha = 1, early_stopping=True)
mlp.fit(Xtr, ytr)

MLPRegressor(activation='relu', alpha=1, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=True, epsilon=1e-08,
       hidden_layer_sizes=(20, 10), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [8]:
# metrics
yte_pred = mlp.predict(Xte)
yte_pred = scaler.inverse_transform(yte_pred)
print ('MAE (kg/m3):', metrics.mean_absolute_error(y_test, yte_pred))
density.describe()

MAE (kg/m3): 18.238263088413763


Unnamed: 0,density_Kg/m3
count,500.0
mean,1268.83838
std,90.310547
min,1005.6
25%,1207.17
50%,1264.46
75%,1327.3525
max,1614.83


## Kfold Cross-validation and model selection
<img src="images/Kfold.png">

https://sebastianraschka.com/blog/2016/model-evaluation-selection-part3.html


## implementation of grid search
Note: make sure you always have an independent test set for your final model. 

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)
X_train.shape, X_test.shape

kf = KFold(n_splits=3, random_state=7)
alpha_grid = [10, 3, 1, .3, .1, .03, .01]
for alpha in alpha_grid:
    cv_results = []
    for train_index, test_index in kf.split(X_train):
        Xtr = X_train[train_index]
        Xte = X_train[test_index]
        ytr = y_train[train_index]
        yte = y_train[test_index]
        
        scaler = preprocessing.StandardScaler()
        Xtr = scaler.fit_transform(Xtr)
        Xte = scaler.transform(Xte)
        ytr = scaler.fit_transform(ytr).reshape(-1)

        mlp = MLPRegressor(hidden_layer_sizes = (20,10), activation= 'relu', alpha = alpha, early_stopping=True, max_iter=500)
        mlp.fit(Xtr, ytr)
        ytr_pred = mlp.predict(Xtr)
        ytr_pred = scaler.inverse_transform(ytr_pred)
        ytr = scaler.inverse_transform(ytr)
        yte_pred = mlp.predict(Xte)
        yte_pred = scaler.inverse_transform(yte_pred)
        l = []
        l.append(metrics.mean_absolute_error(ytr, ytr_pred))
        l.append(metrics.mean_absolute_error(yte, yte_pred))
        cv_results.append(l)
    print ('alpha:', alpha, ' ===> (MAE train, MAE test):', np.mean(np.array(cv_results), axis = 0))



alpha: 10  ===> (MAE train, MAE test): [15.23751792 18.84963112]




alpha: 3  ===> (MAE train, MAE test): [11.91337799 16.92277875]
alpha: 1  ===> (MAE train, MAE test): [10.20280647 16.37784094]
alpha: 0.3  ===> (MAE train, MAE test): [13.51471202 20.05170823]
alpha: 0.1  ===> (MAE train, MAE test): [13.38289304 20.02478504]
alpha: 0.03  ===> (MAE train, MAE test): [10.04085307 18.49331825]
alpha: 0.01  ===> (MAE train, MAE test): [12.3448298  18.31721839]


## scikit-learn: cross validated grid search 

In [14]:
from sklearn.model_selection import GridSearchCV

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)
scaler = preprocessing.StandardScaler()
Xtr = scaler.fit_transform(X_train)
ytr = scaler.fit_transform(y_train).reshape(-1)



alpha_grid = [10, 3, 1, .3, .1, .03, .01]
mlp = MLPRegressor(hidden_layer_sizes = (20,10), activation= 'relu', early_stopping=True, max_iter=500)
scorer = metrics.make_scorer(score_func = metrics.mean_absolute_error,
                            greater_is_better = False)
kf = KFold(n_splits=3, random_state=7)

gs = GridSearchCV(estimator = mlp, 
                  param_grid = {'alpha':alpha_grid},
                  scoring = scorer,
                  return_train_score=True,
                  cv = kf)
gs.fit(Xtr, ytr)
pd.DataFrame(gs.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,0.17095,0.005329,0.000306,7e-06,10.0,{'alpha': 10},-0.19287,-0.179316,-0.210757,-0.194314,0.012876,3,-0.155945,-0.151966,-0.161627,-0.156512,0.003964
1,0.148687,0.03413,0.000333,3.3e-05,3.0,{'alpha': 3},-0.200266,-0.198535,-0.191272,-0.196691,0.003896,4,-0.130684,-0.149744,-0.124202,-0.134877,0.010841
2,0.243442,0.071376,0.000312,5e-06,1.0,{'alpha': 1},-0.215888,-0.186293,-0.200674,-0.200952,0.012084,5,-0.128911,-0.09568,-0.086731,-0.103774,0.018146
3,0.187723,0.021972,0.000308,6e-06,0.3,{'alpha': 0.3},-0.198911,-0.186157,-0.1861,-0.190389,0.006026,1,-0.098797,-0.110471,-0.083577,-0.097615,0.011011
4,0.117338,0.047101,0.000317,2e-06,0.1,{'alpha': 0.1},-0.214137,-0.187089,-0.178348,-0.193191,0.015235,2,-0.178167,-0.131883,-0.099267,-0.136439,0.032371
5,0.164962,0.025825,0.000306,1e-05,0.03,{'alpha': 0.03},-0.237846,-0.169177,-0.249014,-0.218679,0.035299,6,-0.111898,-0.095329,-0.110511,-0.105913,0.007505
6,0.0617,0.008326,0.000332,2.7e-05,0.01,{'alpha': 0.01},-0.330433,-0.23494,-0.264195,-0.276523,0.039947,7,-0.287433,-0.158001,-0.16715,-0.204195,0.058977


## scikit-learn: cross validated randomized search 

In [16]:
from sklearn.model_selection import RandomizedSearchCV

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)
scaler = preprocessing.StandardScaler()
Xtr = scaler.fit_transform(X_train)
ytr = scaler.fit_transform(y_train).reshape(-1)


alpha_dist = np.random.normal(2,1,100)
mlp = MLPRegressor(hidden_layer_sizes = (20,10), activation= 'relu', early_stopping=True, max_iter=500)
scorer = metrics.make_scorer(score_func = metrics.mean_absolute_error,
                            greater_is_better = False)
kf = KFold(n_splits=3, random_state=7)

rs = RandomizedSearchCV(estimator = mlp, 
                          param_distributions = {'alpha':alpha_dist},
                          n_iter= 10,
                          return_train_score=True,
                          scoring = scorer,
                          cv = kf)
rs.fit(Xtr, ytr)
pd.DataFrame(rs.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,0.137174,0.062394,0.000337,3.9e-05,2.84606,{'alpha': 2.84605952174712},-0.16062,-0.308431,-0.196659,-0.221903,0.062928,8,-0.1031,-0.266873,-0.126784,-0.165586,0.072271
1,0.11357,0.034573,0.000308,8e-06,2.11888,{'alpha': 2.1188751416137106},-0.227057,-0.17697,-0.272988,-0.225672,0.039211,9,-0.156759,-0.141537,-0.199603,-0.165966,0.024583
2,0.152839,0.05194,0.000311,9e-06,1.69016,{'alpha': 1.6901565332922655},-0.166856,-0.187615,-0.233242,-0.195904,0.027729,5,-0.097618,-0.127302,-0.142998,-0.122639,0.018817
3,0.201984,0.005474,0.000315,4e-06,2.27638,{'alpha': 2.2763757152988484},-0.156041,-0.167543,-0.167747,-0.163777,0.005471,1,-0.09967,-0.118857,-0.100775,-0.106434,0.008796
4,0.135434,0.042849,0.000305,7e-06,1.7632,{'alpha': 1.7631960347332827},-0.185088,-0.180653,-0.201461,-0.189067,0.008949,4,-0.115208,-0.152562,-0.121055,-0.129608,0.016405
5,0.193934,0.038605,0.000313,1e-06,1.82029,{'alpha': 1.8202905214998897},-0.185369,-0.181012,-0.180566,-0.182316,0.002166,2,-0.108608,-0.105897,-0.119722,-0.111409,0.005981
6,0.151731,0.104668,0.000309,4e-06,1.14247,{'alpha': 1.1424733755186098},-0.212865,-0.2315,-0.182193,-0.208853,0.020329,7,-0.14737,-0.178812,-0.081592,-0.135925,0.040507
7,0.099629,0.068515,0.000323,1.3e-05,2.00163,{'alpha': 2.0016347028039596},-0.347418,-0.18449,-0.29747,-0.276459,0.068154,10,-0.26596,-0.12492,-0.252032,-0.214304,0.063459
8,0.158105,0.024431,0.000305,6e-06,3.11995,{'alpha': 3.1199501094962585},-0.186773,-0.195555,-0.239059,-0.207129,0.022861,6,-0.129467,-0.149035,-0.156767,-0.14509,0.011489
9,0.261061,0.05322,0.000333,2.7e-05,1.72914,{'alpha': 1.7291407829914973},-0.198973,-0.163569,-0.193435,-0.185326,0.015549,3,-0.112176,-0.1075,-0.106768,-0.108815,0.002395


## chemml: evolutionary algorithm