In [1]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides warnings raised by estimators (convergence,
# deprecation, ...) - confirm that is intended.
warnings.filterwarnings('ignore')

import numpy as np
# Print NumPy floats with 5 digits of precision.
np.set_printoptions(precision=5)

import pandas as pd
# Use the same 5-digit precision when pandas objects are displayed.
pd.set_option('display.precision', 5)

In [2]:
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression

# Load the iris dataset; the following cells read its `data` and `target` attributes.
iris = load_iris()

In [3]:
# Cross-validate a default logistic regression with cv=5, scoring accuracy.
# Training-fold scores and the fitted per-fold estimators are requested as well.
clf = LogisticRegression()
scores = cross_validate(
    estimator=clf,
    X=iris.data,
    y=iris.target,
    scoring='accuracy',
    cv=5,
    return_train_score=True,
    return_estimator=True,
)

In [4]:
# Display the per-fold accuracies: training folds first, held-out folds second.
scores['train_score'], scores['test_score']

(array([0.96667, 0.96667, 0.98333, 0.98333, 0.975  ]),
 array([0.96667, 1.     , 0.93333, 0.96667, 1.     ]))

In [5]:
# Display the estimators returned because return_estimator=True was passed.
scores['estimator']

[LogisticRegression(),
 LogisticRegression(),
 LogisticRegression(),
 LogisticRegression(),
 LogisticRegression()]

In [6]:
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.linear_model import LogisticRegression

iris = load_iris()
# Stratified 5-fold splitter; shuffling with a fixed random_state keeps the
# fold assignment the same on every run.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

In [7]:
# Same evaluation as with cv=5, but the folds now come from the explicit
# shuffled StratifiedKFold splitter held in `kfold`.
clf = LogisticRegression()
scores = cross_validate(
    estimator=clf,
    X=iris.data,
    y=iris.target,
    scoring='accuracy',
    cv=kfold,
    return_train_score=True,
    return_estimator=True,
)

In [8]:
# Display the per-fold accuracies obtained with the shuffled stratified folds.
scores['train_score'], scores['test_score']

(array([0.975  , 0.98333, 0.98333, 0.96667, 0.975  ]),
 array([0.96667, 1.     , 0.93333, 1.     , 0.9    ]))

In [9]:
from sklearn.tree import DecisionTreeClassifier

# Cross-validate a decision tree on the same folds as the logistic regression
# so the two models can be compared fold by fold.
# random_state is pinned because tree fitting permutes the features at each
# split; without a seed, tied splits can be resolved differently from run to
# run and the reported scores would not be reproducible.
clf2 = DecisionTreeClassifier(random_state=0)
scores2 = cross_validate(clf2, iris.data, iris.target, scoring='accuracy', cv=kfold,
                         return_train_score=True, return_estimator=True)

In [10]:
# Display the decision tree's per-fold training and held-out accuracies.
scores2['train_score'], scores2['test_score']

(array([1., 1., 1., 1., 1.]),
 array([0.93333, 1.     , 0.9    , 1.     , 0.86667]))

In [11]:
from sklearn.datasets import load_iris
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

iris = load_iris()
# Same splitter configuration (5 folds, shuffle, random_state=1) as the
# cross_validate run above, so the manual loop below iterates over the same folds.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

In [12]:
# Hand-written cross-validation: fit one logistic regression per fold and
# record its accuracy on the training part and on the held-out part.
score_train, score_test = [], []
for train_idx, test_idx in kfold.split(iris.data, iris.target):
    # Slice features and labels for this fold.
    X_train, X_test = iris.data[train_idx], iris.data[test_idx]
    y_train, y_test = iris.target[train_idx], iris.target[test_idx]

    # A fresh estimator per fold, so nothing learned on one fold carries over.
    clf = LogisticRegression().fit(X_train, y_train)

    y_train_hat = clf.predict(X_train)
    y_test_hat = clf.predict(X_test)
    score_train.append(accuracy_score(y_train, y_train_hat))
    score_test.append(accuracy_score(y_test, y_test_hat))

In [13]:
# Display the accuracies collected by the manual fold loop (train list, test list).
score_train, score_test

([0.975, 0.9833333333333333, 0.9833333333333333, 0.9666666666666667, 0.975],
 [0.9666666666666667, 1.0, 0.9333333333333333, 1.0, 0.9])

In [14]:
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

iris = load_iris()
# A single scaler object; the fold loop below re-fits it on every training fold.
scaler = StandardScaler()
# Stratified 5-fold splitter with a fixed seed for reproducible folds.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

In [15]:
# Manual cross-validation of an MLP with per-fold standardisation.
score_train, score_test = [], []

for train_idx, test_idx in kfold.split(iris.data, iris.target):
    X_train, X_test = iris.data[train_idx], iris.data[test_idx]
    y_train, y_test = iris.target[train_idx], iris.target[test_idx]

    # Scaling statistics are learned from the training fold only and then
    # applied unchanged to the held-out fold.
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Seeded network so each fold's fit is repeatable.
    clf = MLPClassifier(max_iter=1000, random_state=0).fit(X_train_scaled, y_train)

    y_train_hat = clf.predict(X_train_scaled)
    y_test_hat = clf.predict(X_test_scaled)
    score_train.append(accuracy_score(y_train, y_train_hat))
    score_test.append(accuracy_score(y_test, y_test_hat))

In [16]:
# Display the MLP's per-fold training and held-out accuracies.
score_train, score_test

([0.9833333333333333, 0.9833333333333333, 1.0, 0.975, 0.9833333333333333],
 [0.9, 1.0, 0.9666666666666667, 1.0, 0.9])

In [17]:
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

iris = load_iris()

# Single seeded train/test split using train_test_split's default proportions.
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=0)

# Learn the scaling on the training part, then reuse it for the test part.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [18]:
# Exhaustive search over (gamma, C) pairs for an SVC.
# NOTE(review): each candidate is scored on X_test_scaled / y_test and the
# maximum is kept, so `best_score` was selected on the test set and is not an
# independent estimate of generalisation performance.
param_values = [0.001, 0.01, 0.1, 1, 10, 100]
# gamma varies slowest and C fastest, matching a gamma-outer / C-inner nesting.
param_pairs = [(g, c) for g in param_values for c in param_values]

best_score = 0
for gamma, C in param_pairs:
    clf = SVC(gamma=gamma, C=C).fit(X_train_scaled, y_train)

    y_test_hat = clf.predict(X_test_scaled)
    score = accuracy_score(y_test, y_test_hat)

    # Strict comparison: on ties the earliest pair in the sweep is kept.
    if score > best_score:
        best_score, best_hyperparameters = score, {'C': C, 'gamma': gamma}

print('Best score: {:.5f}'.format(best_score))
print('Best hyperparameters: {}'.format(best_hyperparameters))

Best score: 0.97368
Best hyperparameters: {'C': 100, 'gamma': 0.001}


In [19]:
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

iris = load_iris()

# First split: hold out 25% of the data as the final test set.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    iris.data, iris.target, test_size=0.25, random_state=0)
# Second split: take 25% of the remaining pool as the validation set.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_trainval, y_trainval, test_size=0.25, random_state=1)

# Number of samples in each of the three parts, in reporting order.
split_sizes = [len(part) for part in (X_train, X_valid, X_test)]
print('Size of training set: {} size of validation set: {} size of test set:'
      ' {}\n'.format(*split_sizes))

Size of training set: 84 size of validation set: 28 size of test set: 38



In [20]:
# Fit the scaler on the training part only; the validation part is transformed
# with those same statistics.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)

In [21]:
# Exhaustive search over (gamma, C) pairs, each scored on the validation set.
param_values = [0.001, 0.01, 0.1, 1, 10, 100]
# gamma varies slowest and C fastest, matching a gamma-outer / C-inner nesting.
param_pairs = [(g, c) for g in param_values for c in param_values]

best_score = 0
for gamma, C in param_pairs:
    clf = SVC(gamma=gamma, C=C).fit(X_train_scaled, y_train)

    y_valid_hat = clf.predict(X_valid_scaled)
    score = accuracy_score(y_valid, y_valid_hat)

    # Strict comparison: on ties the earliest pair in the sweep is kept.
    if score > best_score:
        best_score, best_hyperparameters = score, {'C': C, 'gamma': gamma}

print('Best score on validation set: {:.5f}'.format(best_score))
print('Best hyperparameters: {}'.format(best_hyperparameters))

Best score on validation set: 0.92857
Best hyperparameters: {'C': 100, 'gamma': 0.001}


In [22]:
# Final model: re-fit the scaler on train+validation, then train an SVC with
# the selected hyperparameters on that same pool.
X_trainval_scaled = scaler.fit_transform(X_trainval)
X_test_scaled = scaler.transform(X_test)

clf = SVC(**best_hyperparameters).fit(X_trainval_scaled, y_trainval)

# Score once on the held-out test set.
y_test_hat = clf.predict(X_test_scaled)
test_score = accuracy_score(y_test, y_test_hat)
print('Test set score with best hyperparameters: {:.5f}'.format(test_score))

Test set score with best hyperparameters: 0.97368


In [23]:
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

iris = load_iris()

# Hold out 25% as the test set; the rest is the train+validation pool.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    iris.data, iris.target, test_size=0.25, random_state=0)

# The scaler is fitted on the whole train+validation pool.
# NOTE(review): any cross-validation run on X_trainval_scaled therefore uses
# validation folds whose samples contributed to the scaling statistics.
scaler = StandardScaler()
X_trainval_scaled = scaler.fit_transform(X_trainval)
X_test_scaled = scaler.transform(X_test)

In [24]:
# Grid search over C and gamma for an SVC, scored by accuracy on seeded
# stratified 5-fold splits. refit=True is passed, and the fitted search object
# is used directly for prediction in the next cell.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2)
hyperparam_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'gamma': [0.001, 0.01, 0.1, 1, 10, 100],
}
grid_search = GridSearchCV(
    estimator=SVC(),
    param_grid=hyperparam_grid,
    scoring='accuracy',
    refit=True,
    cv=kfold,
)
grid_search.fit(X_trainval_scaled, y_trainval)

print('Best score on validation set: {:.5f}'.format(grid_search.best_score_))
print('Best hyperparameters: {}'.format(grid_search.best_params_))

Best score on validation set: 0.97312
Best hyperparameters: {'C': 10, 'gamma': 0.1}


In [25]:
# Predict the held-out test set through the fitted search object and report accuracy.
y_test_hat = grid_search.predict(X_test_scaled)
test_score = accuracy_score(y_true=y_test, y_pred=y_test_hat)
print('Test set score with best hyperparameters: {:.5f}'.format(test_score))

Test set score with best hyperparameters: 0.97368


In [26]:
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

iris = load_iris()

# Hold out 25% as the test set; the rest is the train+validation pool.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    iris.data, iris.target, test_size=0.25, random_state=0)

# The scaler is fitted on the whole train+validation pool.
# NOTE(review): any cross-validation run on X_trainval_scaled therefore uses
# validation folds whose samples contributed to the scaling statistics.
scaler = StandardScaler()
X_trainval_scaled = scaler.fit_transform(X_trainval)
X_test_scaled = scaler.transform(X_test)

In [27]:
# Hand-written grid search: every (gamma, C) pair is scored by its mean
# held-out accuracy over the same seeded stratified 5-fold split.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2)

param_values = [0.001, 0.01, 0.1, 1, 10, 100]
# gamma varies slowest and C fastest, matching a gamma-outer / C-inner nesting.
param_pairs = [(g, c) for g in param_values for c in param_values]

best_score = 0
for gamma, C in param_pairs:
    clf = SVC(gamma=gamma, C=C)
    scores = cross_validate(clf, X_trainval_scaled, y_trainval,
                            scoring='accuracy', cv=kfold,
                            return_train_score=True)
    score = scores['test_score'].mean()

    # Strict comparison: on ties the earliest pair in the sweep is kept.
    if score > best_score:
        best_score, best_hyperparameters = score, {'C': C, 'gamma': gamma}

print('Best score on validation set: {:.5f}'.format(best_score))
print('Best hyperparameters: {}'.format(best_hyperparameters))

Best score on validation set: 0.97312
Best hyperparameters: {'C': 100, 'gamma': 0.01}


In [28]:
# Train one SVC with the selected hyperparameters on the full train+validation
# pool, then score it on the held-out test set.
clf = SVC(**best_hyperparameters).fit(X_trainval_scaled, y_trainval)

y_test_hat = clf.predict(X_test_scaled)
test_score = accuracy_score(y_true=y_test, y_pred=y_test_hat)
print('Test set score with best hyperparameters: {:.5f}'.format(test_score))

Test set score with best hyperparameters: 0.97368


In [29]:
# Same hand-written grid search, additionally remembering the per-fold fitted
# estimators of the winning configuration in `best_models`.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2)

param_values = [0.001, 0.01, 0.1, 1, 10, 100]
# gamma varies slowest and C fastest, matching a gamma-outer / C-inner nesting.
param_pairs = [(g, c) for g in param_values for c in param_values]

best_score = 0
for gamma, C in param_pairs:
    clf = SVC(gamma=gamma, C=C)
    scores = cross_validate(clf, X_trainval_scaled, y_trainval,
                            scoring='accuracy', cv=kfold,
                            return_train_score=True, return_estimator=True)
    score = scores['test_score'].mean()

    # Strict comparison: on ties the earliest pair in the sweep is kept.
    if score > best_score:
        best_score = score
        best_models = scores['estimator']
        best_hyperparameters = {'C': C, 'gamma': gamma}

print('Best score on validation set: {:.5f}'.format(best_score))
print('Best hyperparameters: {}'.format(best_hyperparameters))

Best score on validation set: 0.97312
Best hyperparameters: {'C': 100, 'gamma': 0.01}


In [30]:
from scipy.stats import mode

# Hard-voting ensemble of the per-fold estimators kept in `best_models`:
# each model predicts the test set and the most frequent label per sample wins.
y_test_hats = []
for baseclf in best_models:
    y_test_hats.append(baseclf.predict(X_test_scaled))

# Vote across models (axis 0). Depending on the SciPy release, mode() either
# drops the reduced axis (shape (n_samples,)) or keeps it (shape (1, n_samples)).
# The latter makes accuracy_score fail on a length mismatch, so the result is
# flattened to one label per sample regardless of version.
y_test_hat = np.ravel(mode(y_test_hats, axis=0).mode)
test_score = accuracy_score(y_test, y_test_hat)
print('Test set score with best hyperparameters: {:.5f}'.format(test_score))

Test set score with best hyperparameters: 0.97368


In [31]:
# Ensemble by averaging decision-function values over the per-fold estimators.
# Labels are looked up in the first model's classes_;
# assumes every fold model exposes the same class ordering - TODO confirm.
classes = best_models[0].classes_

y_test_hats = [baseclf.decision_function(X_test_scaled) for baseclf in best_models]

# Average over models, then pick the highest-scoring column for each sample.
mean_decision = np.mean(y_test_hats, axis=0)
winning_column = np.argmax(mean_decision, axis=1)
y_test_hat = classes[winning_column]

test_score = accuracy_score(y_test, y_test_hat)
print('Test set score with best hyperparameters: {:.5f}'.format(test_score))

Test set score with best hyperparameters: 0.97368


In [32]:
from sklearn.datasets import load_iris
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.svm import SVC

# Reload the dataset for the nested cross-validation cells that follow.
# NOTE(review): those cells also use cross_validate, accuracy_score and np,
# which are not imported here; they rely on imports made in earlier cells.
iris = load_iris()

In [33]:
# Nested cross-validation: a GridSearchCV (inner folds) is itself evaluated by
# cross_validate (outer folds) on the raw iris features.
param_values = [0.001, 0.01, 0.1, 1, 10, 100]
rbf_grid = {'kernel': ['rbf'],
            'C': list(param_values),
            'gamma': list(param_values)}
linear_grid = {'kernel': ['linear'],
               'C': list(param_values)}
hyperparam_grid = [rbf_grid, linear_grid]

# Both splitters are configured identically (5 folds, shuffle, random_state=2).
inner_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2)
outer_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2)

grid_search = GridSearchCV(
    estimator=SVC(),
    param_grid=hyperparam_grid,
    scoring='accuracy',
    refit=True,
    cv=inner_kfold,
)
scores = cross_validate(grid_search, iris.data, iris.target,
                        scoring='accuracy', cv=outer_kfold,
                        return_train_score=True, return_estimator=True)

outer_mean = scores['test_score'].mean()
print('Outer cross-validation score: %.5f' % outer_mean)

Outer cross-validation score: 0.96667


In [34]:
from sklearn.preprocessing import StandardScaler

# Nested cross-validation written out by hand so the features can be
# standardised inside every outer fold before the inner grid search runs.
param_values = [0.001, 0.01, 0.1, 1, 10, 100]
rbf_grid = {'kernel': ['rbf'],
            'C': list(param_values),
            'gamma': list(param_values)}
linear_grid = {'kernel': ['linear'],
               'C': list(param_values)}
hyperparam_grid = [rbf_grid, linear_grid]

inner_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2)
outer_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2)

scaler = StandardScaler()
score_test = []
for trainval_idx, test_idx in outer_kfold.split(iris.data, iris.target):
    X_trainval, X_test = iris.data[trainval_idx], iris.data[test_idx]
    y_trainval, y_test = iris.target[trainval_idx], iris.target[test_idx]

    # Scaling statistics come from the outer training part only.
    # NOTE(review): the scaler is fitted before GridSearchCV creates its inner
    # folds, so the inner validation folds contributed to those statistics;
    # a Pipeline inside the search would avoid that if strict isolation is wanted.
    X_trainval_scaled = scaler.fit_transform(X_trainval)
    X_test_scaled = scaler.transform(X_test)

    # Inner loop: pick hyperparameters on the outer training part (refit=True).
    grid_search = GridSearchCV(SVC(), hyperparam_grid, scoring='accuracy',
                               refit=True, cv=inner_kfold)
    grid_search.fit(X_trainval_scaled, y_trainval)

    # Outer loop: score the fitted search on the untouched outer test fold.
    y_test_hat = grid_search.predict(X_test_scaled)
    score_test.append(accuracy_score(y_test, y_test_hat))

print('Outer cross-validation score: %.5f'%np.mean(score_test))

Outer cross-validation score: 0.94667
