# README
- [sklearn.datasets.load_breast_cancer](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_breast_cancer.html#sklearn.datasets.load_breast_cancer) を使い、StratifiedKFold による交差検証を行う
- EDAは省略


In [1]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Suppress every warning for the rest of the session so cell output stays clean.
# NOTE(review): this also hides any warnings raised by the estimators used
# below -- confirm that silencing them is intended.
warnings.filterwarnings('ignore')

%precision 3
%matplotlib inline

# データの準備

In [2]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

# Load the bundled breast-cancer dataset and separate features from labels.
cancer = load_breast_cancer()
X, y = cancer.data, cancer.target

# Reserve 20% of the samples as a final test set; the remaining 80% is what
# the cross-validation cells below operate on.
# NOTE(review): no `stratify=` is passed, so this hold-out split itself is
# not class-stratified.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, train_size=0.80, random_state=0,
)

# 交差検証


In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

In [44]:
# Shared splitter used by every cross-validation cell below:
# 7 shuffled, class-stratified folds with a fixed seed.
skf = StratifiedKFold(
    n_splits=7,
    shuffle=True,
    random_state=0,
)

## 単純な交差検証その1: `cross_validate()`


In [45]:
# Cross-validate a default LogisticRegression, recording accuracy and recall
# for each fold.
cv_logreg = cross_validate(
    estimator=LogisticRegression(),
    X=X_trainval,
    y=y_trainval,
    scoring=['accuracy', 'recall'],
    cv=skf,  # an int is also accepted; StratifiedKFold is then set up for you
)

cv_logreg

{'fit_time': array([0.003, 0.003, 0.003, 0.004, 0.004, 0.004, 0.003]),
 'score_time': array([0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001]),
 'test_accuracy': array([0.894, 0.939, 0.924, 0.954, 0.969, 0.984, 0.969]),
 'test_recall': array([0.976, 0.929, 0.976, 0.976, 0.976, 0.976, 1.   ])}

In [46]:
# Cross-validate a default DecisionTreeClassifier, recording accuracy and
# recall for each fold.
cv_dt = cross_validate(
    estimator=DecisionTreeClassifier(),
    X=X_trainval,
    y=y_trainval,
    scoring=['accuracy', 'recall'],
    cv=skf,  # an int is also accepted; StratifiedKFold is then set up for you
)

cv_dt

{'fit_time': array([0.006, 0.005, 0.005, 0.004, 0.005, 0.005, 0.005]),
 'score_time': array([0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001]),
 'test_accuracy': array([0.909, 0.924, 0.909, 0.908, 0.922, 0.969, 0.922]),
 'test_recall': array([0.976, 0.905, 0.905, 0.927, 0.927, 0.951, 0.927])}

In [47]:
# Cross-validate a default KNeighborsClassifier, recording accuracy and
# recall for each fold.
cv_knn = cross_validate(
    estimator=KNeighborsClassifier(),
    X=X_trainval,
    y=y_trainval,
    scoring=['accuracy', 'recall'],
    cv=skf,  # an int is also accepted; StratifiedKFold is then set up for you
)

cv_knn

{'fit_time': array([0., 0., 0., 0., 0., 0., 0.]),
 'score_time': array([0.006, 0.005, 0.006, 0.006, 0.005, 0.005, 0.005]),
 'test_accuracy': array([0.894, 0.939, 0.939, 0.938, 0.938, 0.922, 0.906]),
 'test_recall': array([0.976, 0.952, 0.976, 0.951, 0.976, 0.927, 0.927])}

## 単純な交差検証その2(StratifiedKFoldを利用して自作)

In [48]:
# Hand-rolled cross-validation: fit a fresh LogisticRegression on each
# training fold and record its accuracy on the matching validation fold.
valid_scores = []

for fit_rows, eval_rows in skf.split(X_trainval, y_trainval):
    # `fit` returns the estimator itself, so construction and training chain.
    clf = LogisticRegression().fit(X_trainval[fit_rows], y_trainval[fit_rows])
    valid_scores.append(clf.score(X_trainval[eval_rows], y_trainval[eval_rows]))

# Per-fold accuracies, followed by their mean and standard deviation.
print(valid_scores)
print(f'Mean: {np.mean(valid_scores):.3f} Std: {np.std(valid_scores):.3f}')

[0.8939393939393939, 0.9393939393939394, 0.9242424242424242, 0.9538461538461539, 0.96875, 0.984375, 0.96875]
Mean: 0.948 Std: 0.029


## パラメータチューニングあり交差検証その1(自作)

In [49]:
from sklearn.model_selection import ParameterGrid
from sklearn.svm import SVC

# Candidate hyper-parameters for the SVC; ParameterGrid expands this dict
# into every (C, gamma) combination.
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'gamma': [0.001, 0.01, 0.1, 1, 10, 100],
}

best_params = {}
# Start below any reachable accuracy so the first candidate always becomes
# the initial best, even if its mean accuracy is 0.
best_mean_acc = -np.inf

# Per-candidate summaries are gathered in plain lists and converted once at
# the end; calling np.append inside the loop would copy the whole array on
# every iteration.
acc_per_candidate = []
std_per_candidate = []

# The splitter has a fixed random_state, so the folds are the same for every
# candidate: compute them once instead of once per parameter combination.
folds = list(skf.split(X_trainval, y_trainval))

for params in ParameterGrid(param_grid):
    valid_scores = []

    for train_index, valid_index in folds:
        X_train, X_valid = X_trainval[train_index], X_trainval[valid_index]
        y_train, y_valid = y_trainval[train_index], y_trainval[valid_index]

        # `**` unpacks the dict into keyword arguments
        clf = SVC(**params)
        clf.fit(X_train, y_train)

        acc_valid = clf.score(X_valid, y_valid)
        valid_scores.append(acc_valid)

    # Mean and spread of the fold accuracies for this candidate.
    mean_acc = np.mean(valid_scores)
    std_acc = np.std(valid_scores)
    acc_per_candidate.append(mean_acc)
    std_per_candidate.append(std_acc)

    # Strict `>` keeps the earliest candidate when several tie.
    if mean_acc > best_mean_acc:
        best_mean_acc = mean_acc
        best_params = params

# Same arrays the cell has always exposed: one entry per candidate, in
# ParameterGrid order.
all_acc = np.array(acc_per_candidate, dtype=np.float64)
all_std = np.array(std_per_candidate, dtype=np.float64)

print(f'Best params: {best_params}')
print(f'Best acc: {best_mean_acc:.3f}')

Best params: {'C': 1, 'gamma': 0.001}
Best acc: 0.916


In [50]:
# Refit on the complete train+validation data using the best parameters
# found by the search; `fit` returns the estimator, so the calls chain.
retrained_model = SVC(**best_params).fit(X_trainval, y_trainval)

# Accuracy on the held-out test set.
final_score = retrained_model.score(X_test, y_test)
print(f'Test acc: {final_score:.3f}')

Test acc: 0.921


## パラメータチューニングあり交差検証その2(`GridSearchCV`)

In [51]:
from sklearn.model_selection import GridSearchCV

# Same (C, gamma) grid as the hand-written search.
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'gamma': [0.001, 0.01, 0.1, 1, 10, 100],
}

gs = GridSearchCV(
    estimator=SVC(),
    param_grid=param_grid,
    scoring='accuracy',
    cv=skf,
    n_jobs=-1,  # use every available core
)

# Runs the cross-validation and also retrains with the winning parameters.
gs.fit(X_trainval, y_trainval)

print(f'Best params: {gs.best_params_}')
print(f'Best acc: {gs.best_score_:.3f}')

Best params: {'C': 1, 'gamma': 0.001}
Best acc: 0.916


In [52]:
# Evaluate the fitted grid search on the held-out test set.
final_score = gs.score(X_test, y_test)
print(f'Test acc: {final_score:.3f}')

Test acc: 0.921


In [53]:
# Detailed results
gs.cv_results_

{'mean_fit_time': array([0.008, 0.007, 0.007, 0.008, 0.008, 0.01 , 0.014, 0.011, 0.01 ,
        0.009, 0.01 , 0.009, 0.008, 0.011, 0.01 , 0.01 , 0.009, 0.01 ,
        0.009, 0.011, 0.012, 0.009, 0.009, 0.011, 0.01 , 0.012, 0.012,
        0.01 , 0.01 , 0.011, 0.01 , 0.008, 0.009, 0.008, 0.007, 0.008]),
 'std_fit_time': array([0.   , 0.001, 0.001, 0.001, 0.001, 0.002, 0.01 , 0.002, 0.001,
        0.001, 0.002, 0.   , 0.001, 0.002, 0.001, 0.001, 0.   , 0.002,
        0.   , 0.001, 0.003, 0.001, 0.001, 0.002, 0.001, 0.   , 0.001,
        0.001, 0.001, 0.001, 0.002, 0.   , 0.003, 0.002, 0.   , 0.001]),
 'mean_score_time': array([0.002, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.005, 0.002,
        0.001, 0.001, 0.001, 0.002, 0.002, 0.001, 0.001, 0.001, 0.002,
        0.001, 0.002, 0.002, 0.001, 0.002, 0.001, 0.001, 0.002, 0.001,
        0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001]),
 'std_score_time': array([6.364e-04, 8.772e-05, 1.022e-04, 4.028e-04, 1.682e-04, 4.323e-04

おわり