In [1]:
import pandas as pd # data frames
import numpy as np

In [2]:
# Load the Boston housing table from the course repository and preview the first rows.
DATA_URL = "https://raw.githubusercontent.com/askoshiyama/mli-cohort3/master/boston.csv"
df = pd.read_csv(DATA_URL)
df.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,T1
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


Repeated K Fold CV (5 outer, 5 inner, repeat 5 times):
RepeatedKFold repeats K-Fold n times. Use it when you need to run KFold n times with different splits in each repetition.
https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation-iterators

In [3]:
# Column names: thirteen predictors V1..V13 and the single response column T1.
input_vars = [f"V{idx}" for idx in range(1, 14)]
target = ["T1"]

In [4]:
# Feature matrix: select the predictor columns explicitly rather than
# "everything except the target", so that any extra column present in the CSV
# (for example a saved index) can never slip in as a feature.
X = df[input_vars]
X.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33


In [5]:
# Target frame: select the response column explicitly rather than
# "everything except the inputs". `target` is a list, so `y` stays a
# one-column DataFrame and `y.values.ravel()` yields one value per row.
y = df[target]
y.head()

Unnamed: 0,T1
0,24.0
1,21.6
2,34.7
3,33.4
4,36.2


In [20]:
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from interpret.glassbox import ExplainableBoostingRegressor 
from sklearn.svm import SVR

# Candidate models for selection, each paired with the hyperparameter grid that
# is searched for it. Every entry is `(estimator, param_grid)`; grid keys are
# written as "<entry name>__<parameter>" because each estimator is later placed
# in a pipeline step named after its dictionary key.
# NOTE: the MLP entry at the bottom of this cell is commented out, so no neural
# network currently takes part in the selection.

# Regularisation strengths shared by the ridge and kernel-ridge grids.
_alpha_grid = np.linspace(0.00001, 0.8, num=50)

models_and_parameters = {
    'linReg': (LinearRegression(), {}),
    'ridge': (Ridge(),
              {'ridge__alpha': _alpha_grid}),
    'svr': (SVR(kernel='rbf'),
              {'svr__C': [0.01, 0.05, 0.1, 0.5, 1],
               'svr__gamma': [0.001, 0.0001, 0.00001]}),
    # `degree` only affects the polynomial kernel, so it is searched for "poly"
    # alone. Crossing it with "rbf" would refit the same RBF model once per
    # degree value without changing any score.
    'krr': (KernelRidge(),
              [{'krr__kernel': ["poly"],
                'krr__degree': [2, 3, 4],
                'krr__alpha': _alpha_grid},
               {'krr__kernel': ["rbf"],
                'krr__alpha': _alpha_grid}]),
    'dtr': (DecisionTreeRegressor(),
              {"dtr__min_samples_split": [2, 10, 20, 40],
               "dtr__max_depth": [2, 6, 8],
               "dtr__min_samples_leaf": [1, 20, 40, 100],
               "dtr__max_leaf_nodes": [None, 5, 20, 100]}),
    'rf': (RandomForestRegressor(n_estimators=100),
              {'rf__max_depth': [5, 10, 50, 100, 200, 500]}),
    'gbr': (GradientBoostingRegressor(),
              {"gbr__max_depth": [2, 6, 8],
               "gbr__learning_rate": [0.1, 0.01, 0.001],
               "gbr__n_estimators": [50, 100, 150]}),
    # No grid: the explainable boosting regressor is evaluated with its defaults.
    'ebr': (ExplainableBoostingRegressor(), {}),
}
# Disabled MLP entry, kept for reference; move it inside the dict above to
# include it in the selection.
#     'mlp': (MLPRegressor(early_stopping=True),
#                {'mlp__hidden_layer_sizes': [(2,),(4,),(8,),(16,),(32,),(64,),(128,)],
#                 'mlp__activation': ['relu'],
#                 'mlp__solver':['lbfgs', 'adam', 'sgd'],
#                 'mlp__alpha':[0.0001],
#                 'mlp__batch_size':['auto'],
#                 'mlp__learning_rate':['constant'],
#                 'mlp__learning_rate_init':[0.01, 0.001, 0.0001],
#                 'mlp__max_iter':[800]})

In [24]:
# based on: https://datascience.stackexchange.com/questions/13185/nested-cross-validation-and-selecting-the-best-regression-model-is-this-the-ri
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
import numpy as np
from sklearn import metrics



#######################################
# configuration
REPEAT = 1  # how many times the whole nested CV is run for every model
SEED = None  # will ensure different results on each iteration
K_FOLD_NUM = 5
METRIC = 'neg_mean_squared_error'
#######################################

# Nested cross-validation: the outer splitter produces the K_FOLD_NUM folds on
# which the generalization error of each (tuned) model is estimated.
outer_cv = KFold(n_splits=K_FOLD_NUM, random_state=SEED, shuffle=True)

# Inside every outer training fold, a second K-fold split is used by the grid
# search to choose hyperparameters.
inner_cv = KFold(n_splits=K_FOLD_NUM, random_state=SEED, shuffle=True)


def _build_grid_search(name, model, params):
    """Return an unfitted grid search over a scale-then-model pipeline.

    Parameters
    ----------
    name : str
        Name given to the model's pipeline step; the keys of `params` must be
        prefixed with ``"<name>__"`` to address that step.
    model : estimator
        Regressor placed after the scaler.
    params : dict or list of dict
        Hyperparameter grid handed to ``GridSearchCV`` unchanged (may be empty).

    Returns
    -------
    GridSearchCV
        Search that tunes on `inner_cv` and scores with `METRIC`.
    """
    # first scale the data, e.g. required for SVM, Ridge
    pipeline = Pipeline([('scale', StandardScaler()), (name, model)])
    # The former `iid=False` argument is deliberately not passed: it was
    # removed from GridSearchCV in scikit-learn 0.24 and now raises TypeError.
    return GridSearchCV(
        estimator=pipeline,
        param_grid=params,
        cv=inner_cv,
        scoring=METRIC,
    )


# One list of per-repetition scores for every candidate model.
avg_scores = {name: {'scores': []} for name in models_and_parameters}

for i in range(1, REPEAT + 1):
    for name, (model, params) in models_and_parameters.items():
        print(f"{i} - {name}")

        # choose best hyperparameters for model using KFold inner_cv
        model_with_gridsearch = _build_grid_search(name, model, params)

        # estimate generalization error on the K-fold splits of the data:
        # each outer fold re-runs the grid search on its training part and
        # scores the tuned model on its held-out part
        scores_across_outer_folds = cross_val_score(
            model_with_gridsearch,
            X, y.values.ravel(),
            cv=outer_cv,
            scoring=METRIC)

        avg_score_model = np.mean(scores_across_outer_folds)
        avg_scores[name]['scores'].append(avg_score_model)

print()
# Report the configured number of repetitions instead of a hard-coded "5".
print(f"Avg scores per model over {REPEAT} repetition(s):")
for name, vals in avg_scores.items():
    scores = vals['scores']
    avg_score = np.mean(scores)
    print(f"{name}:\t{avg_score}")
    avg_scores[name]['avg_score'] = avg_score

many_stars = '\n' + '*' * 20 + '\n'
print(many_stars + 'choose the best model and refit on the whole dataset' + many_stars)

print(many_stars + '\n' + str(avg_scores) + '\n' + many_stars)

# METRIC is a negated error, so the largest average score is the best model.
# `best_model_avg_score` is that model's whole entry: {'scores': [...], 'avg_score': ...}.
best_model_name, best_model_avg_score = max(
    avg_scores.items(),
    key=(lambda name_s_as: name_s_as[1]['avg_score']))

print(f"Best model: {best_model_name}\nBest avg score: {best_model_avg_score['avg_score']}")

best_model, best_model_params = models_and_parameters[best_model_name]

# now we refit this best model on the whole dataset so that we can start
# making predictions on other data; the nested-CV average above serves as the
# estimate of this model's generalization error
final_regressor = _build_grid_search(best_model_name, best_model, best_model_params)

final_regressor.fit(X, y.values.ravel())

print('Best model: \n\t{}'.format(best_model), end='\n\n')
# Print the averaged score itself rather than the whole per-model dict.
print('Estimation of its generalization error (negative mean squared error):\n\t{}'.format(
    best_model_avg_score['avg_score']), end='\n\n')
print('Best parameter choice for this model: \n\t{params}'
      '\n(according to cross-validation `{cv}` on the whole dataset).'.format(
      params=final_regressor.best_params_, cv=inner_cv))



1 - linReg
1 - ridge
1 - svr
1 - krr
1 - dtr
1 - rf
1 - gbr
1 - ebr

Avg scores per model over 5 repetitions:
linReg:	-24.74043147037029
ridge:	-23.356545040361322
svr:	-64.20733923678839
krr:	-12.977789298691993
dtr:	-24.103768310299422
rf:	-11.828537843586556
gbr:	-10.309008689793977
ebr:	-13.778974373615373

********************
choose the best model and refit on the whole dataset
********************


********************

{'linReg': {'scores': [-24.74043147037029], 'avg_score': -24.74043147037029}, 'ridge': {'scores': [-23.356545040361322], 'avg_score': -23.356545040361322}, 'svr': {'scores': [-64.20733923678839], 'avg_score': -64.20733923678839}, 'krr': {'scores': [-12.977789298691993], 'avg_score': -12.977789298691993}, 'dtr': {'scores': [-24.103768310299422], 'avg_score': -24.103768310299422}, 'rf': {'scores': [-11.828537843586556], 'avg_score': -11.828537843586556}, 'gbr': {'scores': [-10.309008689793977], 'avg_score': -10.309008689793977}, 'ebr': {'scores': [-13.778974373615

ValueError: For multi-metric scoring, the parameter refit must be set to a scorer key or a callable to refit an estimator with the best parameter setting on the whole data and make the best_* attributes available for that metric. If this is not needed, refit should be set to False explicitly. True was passed.