# README
- [sklearn.datasets.load_breast_cancer](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_breast_cancer.html#sklearn.datasets.load_breast_cancer)
 を使い、交差検証を行う
- EDAは省略


In [1]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Suppress every warning so that the cell outputs stay compact.
# NOTE(review): this also hides any warnings raised by the estimators fitted
# later in the notebook - consider narrowing the filter.
warnings.filterwarnings('ignore')

# IPython magic: show floating-point values with 3 decimal places.
%precision 3
# IPython magic: render matplotlib figures inline in the notebook.
%matplotlib inline

# データの準備

In [2]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

# Load the breast-cancer dataset and separate features from labels.
cancer = load_breast_cancer()
X, y = cancer.data, cancer.target

# Keep 80% of the samples for training/validation and hold the rest out as
# the final test set; the seed makes the split reproducible.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, train_size=0.80, random_state=0)

# 交差検証


In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_validate
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

In [71]:
# Fold splitter reused by the cross-validation cells: 7 shuffled folds,
# seeded so the same splits are produced on every run.
kfold = KFold(
    n_splits=7,
    shuffle=True,
    random_state=0,
)

## 単純な交差検証その1: `cross_validate()`


In [72]:
# Cross-validate a default logistic regression on the train+validation data,
# recording accuracy and recall for every fold.
cv_logreg = cross_validate(
    estimator=LogisticRegression(),
    X=X_trainval,
    y=y_trainval,
    # An int may be passed instead; StratifiedKFold is then used.
    cv=kfold,
    scoring=['accuracy', 'recall'],
)

# Show the per-fold timings and scores as the cell result.
cv_logreg

{'fit_time': array([0.003, 0.003, 0.003, 0.003, 0.003, 0.003, 0.003]),
 'score_time': array([0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001]),
 'test_accuracy': array([0.923, 0.954, 0.985, 0.938, 0.969, 0.985, 0.923]),
 'test_recall': array([0.905, 0.974, 1.   , 1.   , 0.974, 1.   , 0.947])}

In [73]:
# Cross-validate a decision tree on the train+validation data, recording
# accuracy and recall for every fold.
# The tree is seeded like the data split and the fold splitter: an unseeded
# DecisionTreeClassifier permutes the features randomly at each split, so its
# scores would otherwise change from run to run.
cv_dt = cross_validate(DecisionTreeClassifier(random_state=0),
                       X_trainval,
                       y_trainval,
                       cv=kfold,  # an int may be passed instead; StratifiedKFold is then used
                       scoring=['accuracy', 'recall'])

# Show the per-fold timings and scores as the cell result.
cv_dt

{'fit_time': array([0.004, 0.003, 0.004, 0.003, 0.004, 0.004, 0.004]),
 'score_time': array([0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001]),
 'test_accuracy': array([0.862, 0.892, 0.892, 0.969, 0.877, 0.892, 0.908]),
 'test_recall': array([0.833, 0.895, 0.867, 0.978, 0.872, 0.905, 0.947])}

In [74]:
# Cross-validate a default k-nearest-neighbours classifier on the
# train+validation data, recording accuracy and recall for every fold.
cv_knn = cross_validate(
    estimator=KNeighborsClassifier(),
    X=X_trainval,
    y=y_trainval,
    # An int may be passed instead; StratifiedKFold is then used.
    cv=kfold,
    scoring=['accuracy', 'recall'],
)

# Show the per-fold timings and scores as the cell result.
cv_knn

{'fit_time': array([0.001, 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ]),
 'score_time': array([0.006, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005]),
 'test_accuracy': array([0.954, 0.892, 0.923, 0.923, 0.954, 0.908, 0.923]),
 'test_recall': array([0.952, 0.895, 0.956, 0.978, 0.974, 0.976, 0.921])}

## 単純な交差検証その2(KFoldを利用して自作)

In [75]:
# Validation accuracy of each fold, in the order the splitter yields them.
valid_scores = []

for train_index, valid_index in kfold.split(X_trainval, y_trainval):
    # Pick this fold's training rows and validation rows.
    X_train, y_train = X_trainval[train_index], y_trainval[train_index]
    X_valid, y_valid = X_trainval[valid_index], y_trainval[valid_index]

    # fit() returns the fitted estimator, so creation and training chain.
    clf = LogisticRegression().fit(X_train, y_train)

    # Mean accuracy on the part of the data the model has not seen.
    acc_valid = clf.score(X_valid, y_valid)
    valid_scores.append(acc_valid)

# Report the raw fold scores followed by their mean and standard deviation.
print(valid_scores)
print(f'Mean: {np.mean(valid_scores):.3f} Std: {np.std(valid_scores):.3f}')

[0.9230769230769231, 0.9538461538461539, 0.9846153846153847, 0.9384615384615385, 0.9692307692307692, 0.9846153846153847, 0.9230769230769231]
Mean: 0.954 Std: 0.025


## パラメータチューニングあり交差検証その1(自作)

In [76]:
from sklearn.model_selection import ParameterGrid
from sklearn.svm import SVC

# Candidate values for the SVC parameters C and gamma; every combination
# is evaluated with the shared fold splitter.
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'gamma': [0.001, 0.01, 0.1, 1, 10, 100],
}

# Best combination seen so far and its mean validation accuracy.
best_params = {}
best_mean_acc = 0

# Mean and standard deviation of the fold accuracies per combination, in
# ParameterGrid order. They are gathered in plain lists and converted to
# float64 arrays once after the loop, because np.append returns a fresh copy
# of the whole array on every call.
all_acc = []
all_std = []

for params in ParameterGrid(param_grid):
    valid_scores = []

    for train_index, valid_index in kfold.split(X_trainval, y_trainval):
        X_train, X_valid = X_trainval[train_index], X_trainval[valid_index]
        y_train, y_valid = y_trainval[train_index], y_trainval[valid_index]

        # `**` unpacks the dict straight into keyword arguments.
        clf = SVC(**params)
        clf.fit(X_train, y_train)

        acc_valid = clf.score(X_valid, y_valid)
        valid_scores.append(acc_valid)

    mean_acc = np.mean(valid_scores)
    # Despite the name, this is the standard deviation of the fold accuracies.
    mean_std = np.std(valid_scores)
    all_acc.append(mean_acc)
    all_std.append(mean_std)

    # Strict comparison: on a tie the earlier combination is kept.
    if mean_acc > best_mean_acc:
        best_mean_acc = mean_acc
        best_params = params

all_acc = np.array(all_acc, dtype=np.float64)
all_std = np.array(all_std, dtype=np.float64)

print(f'Best params: {best_params}')
print(f'Best acc: {best_mean_acc:.3f}')

Best params: {'C': 1, 'gamma': 0.001}
Best acc: 0.916


In [77]:
# Train a fresh SVC with the best parameters found, this time on the whole
# train+validation set (fit() returns the fitted estimator).
retrained_model = SVC(**best_params).fit(X_trainval, y_trainval)

# Accuracy on the held-out test set.
final_score = retrained_model.score(X_test, y_test)
print(f'Test acc: {final_score:.3f}')

Test acc: 0.921


## パラメータチューニングあり交差検証その2(`GridSearchCV`)

In [78]:
from sklearn.model_selection import GridSearchCV

# Candidate values for the SVC parameters C and gamma.
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'gamma': [0.001, 0.01, 0.1, 1, 10, 100],
}

# Exhaustive search over the grid, scored by accuracy on the shared folds.
gs = GridSearchCV(
    estimator=SVC(),
    param_grid=param_grid,
    cv=kfold,
    scoring='accuracy',
    n_jobs=-1,  # use every available core
)

# Runs the cross-validation and then retrains with the winning parameters.
gs.fit(X_trainval, y_trainval)

print(f'Best params: {gs.best_params_}')
print(f'Best acc: {gs.best_score_:.3f}')

Best params: {'C': 1, 'gamma': 0.001}
Best acc: 0.916


In [79]:
# Accuracy of the fitted grid search on the held-out test set.
final_score = gs.score(X=X_test, y=y_test)
print(f'Test acc: {final_score:.3f}')

Test acc: 0.921


In [80]:
# Detailed results of the grid search (timings and scores per candidate).
gs.cv_results_

{'mean_fit_time': array([0.008, 0.007, 0.006, 0.008, 0.008, 0.007, 0.008, 0.008, 0.007,
        0.006, 0.007, 0.008, 0.007, 0.009, 0.01 , 0.009, 0.008, 0.009,
        0.009, 0.01 , 0.01 , 0.009, 0.009, 0.009, 0.01 , 0.011, 0.01 ,
        0.011, 0.011, 0.01 , 0.008, 0.011, 0.011, 0.008, 0.011, 0.01 ]),
 'std_fit_time': array([0.   , 0.001, 0.   , 0.001, 0.   , 0.   , 0.001, 0.001, 0.001,
        0.   , 0.001, 0.   , 0.   , 0.001, 0.001, 0.   , 0.001, 0.001,
        0.   , 0.001, 0.001, 0.   , 0.001, 0.002, 0.001, 0.   , 0.001,
        0.   , 0.001, 0.002, 0.001, 0.   , 0.001, 0.002, 0.001, 0.001]),
 'mean_score_time': array([0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001,
        0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001,
        0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001,
        0.001, 0.001, 0.001, 0.001, 0.002, 0.001, 0.001, 0.001, 0.001]),
 'std_score_time': array([2.565e-05, 1.654e-04, 3.050e-05, 2.585e-05, 1.435e-04, 1.537e-04

おわり