<a href="https://colab.research.google.com/github/msrepo/ml-mscise-2023/blob/master/model_evaluation/nested_cross_validation_for_algorithm_selection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Original Notebook by: **Sebastian Raschka**

## A "nested cross-validation for algorithm selection" example using scikit-learn

In [1]:
%%capture
!pip install mlxtend

In [29]:
import warnings
from sklearn.utils._testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning
from scipy.optimize.linesearch import LineSearchWarning

warnings.simplefilter("ignore",ConvergenceWarning)
warnings.simplefilter("ignore",LineSearchWarning)

import numpy as np
from mlxtend.data import mnist_data
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
import random

np.random.seed(1)
random.seed(1)

In [30]:
# load and split the dataset
# Note that this is a small (stratified) subset
# of MNIST; it consists of 5000 samples only, that is,
# 10% of the original MNIST dataset
X, y = mnist_data()
X = X.astype(np.float32)
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    train_size =0.8,
                                                    random_state=1,
                                                    stratify=y)
# Initializing Classifiers
clf1 = LogisticRegression(multi_class='multinomial',
                          solver='newton-cg',
                          max_iter=1000,
                          random_state=1)
clf2 = KNeighborsClassifier(algorithm='ball_tree',
                            leaf_size=50)
clf3 = DecisionTreeClassifier(random_state=1)
clf4 = SVC(random_state=1)

# build pipeline
pipe1 = Pipeline([('std',StandardScaler()),
                  ('clf1',clf1)])
pipe2 = Pipeline([('std', StandardScaler()),
                  ('clf2',clf2)])
pipe4 = Pipeline([('std',StandardScaler()),
                  ('clf4',clf4)])

# setup the parameters grids
param_grid1 = [{'clf1__penalty':['l2'],
                'clf1__C':np.power(10.,np.arange(-4,4))}]

param_grid2 = [{'clf2__n_neighbors':list(range(1,10)),
                'clf2__p':[1,2]}]

# set up multiple GridSearchCV objects, 1 for each algorithm
gridcvs = {}

for pgrid, est, name in zip((param_grid1,param_grid2),
                            (pipe1,pipe2),
                            ('Logistic','KNN')):
  gcv = GridSearchCV(estimator=est,
                     param_grid=pgrid,
                     scoring='accuracy',
                     n_jobs=1,
                     cv=2,
                     verbose=0,
                     refit=True)
  gridcvs[name] = gcv



In [31]:
cv_scores = {name: [] for name, gs_est in gridcvs.items()}

skfold = StratifiedKFold(n_splits=5,shuffle=True,random_state=1)

# the outer loop for algorithm selection
c = 1
for outer_train_idx, outer_valid_idx in skfold.split(X_train, y_train):
  for name, gs_est in sorted(gridcvs.items()):
    print(f'outer fold {c}/5 | tuning {name:8s}',end='')

    # The inner loop for hyperparameter tuning
    gs_est.fit(X_train[outer_train_idx],y_train[outer_train_idx])
    y_pred = gs_est.predict(X_train[outer_valid_idx])
    acc = accuracy_score(y_true=y_train[outer_valid_idx], y_pred=y_pred)
    print(f' | inner ACC {gs_est.best_score_:.2f} | outer ACC {acc * 100:.2f}')
    cv_scores[name].append(acc)
  c += 1


outer fold 1/5 | tuning KNN      | inner ACC 0.88 | outer ACC 91.62
outer fold 1/5 | tuning Logistic | inner ACC 0.89 | outer ACC 90.00
outer fold 2/5 | tuning KNN      | inner ACC 0.89 | outer ACC 91.88
outer fold 2/5 | tuning Logistic | inner ACC 0.89 | outer ACC 91.00
outer fold 3/5 | tuning KNN      | inner ACC 0.89 | outer ACC 90.88
outer fold 3/5 | tuning Logistic | inner ACC 0.89 | outer ACC 90.00
outer fold 4/5 | tuning KNN      | inner ACC 0.89 | outer ACC 90.88
outer fold 4/5 | tuning Logistic | inner ACC 0.89 | outer ACC 90.75
outer fold 5/5 | tuning KNN      | inner ACC 0.88 | outer ACC 90.25
outer fold 5/5 | tuning Logistic | inner ACC 0.89 | outer ACC 89.50


In [37]:
# looking at the results
for name in cv_scores.keys():
  print(f'{name:8s} | outer CV acc: {100* np.mean(cv_scores[name]):.2f} +/- {100*np.std(cv_scores[name]):.3f}')

print(f'\n Logistic Regression Best parameters {gridcvs["Logistic"].best_params_}')

Logistic | outer CV acc: 90.25 +/- 0.548
KNN      | outer CV acc: 91.10 +/- 0.583

 Logistic Regression Best parameters {'clf1__C': 0.01, 'clf1__penalty': 'l2'}


In [38]:
# Fitting the models to the whole training set
# using the 'best' hyperparameters
for name in cv_scores.keys():
  name_best_algo = gridcvs[name]

  name_best_algo.fit(X_train, y_train)
  train_acc = accuracy_score(y_true=y_train, 
                             y_pred=name_best_algo.predict(X_train))
  test_acc = accuracy_score(y_true=y_test, 
                            y_pred=name_best_algo.predict(X_test))

  print(f'Algorithm: {name:8s} Accuracy: {100 * name_best_algo.best_score_:.2f} (avg over CV test folds)')
  print(f'Training Accuracy: {100 * train_acc:.2f}')
  print(f'Test Accuracy: {100 * test_acc:.2f}\n')


Logistic Accuracy: 89.22 (avg over CV test folds)
Training Accuracy: 99.50
Test Accuracy: 89.30
KNN      Accuracy: 89.33 (avg over CV test folds)
Training Accuracy: 100.00
Test Accuracy: 91.80
