# 汎化性能の検証
## k-Fold CV

In [2]:
import numpy as np
import seaborn as sns
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Load the "tips" practice dataset that seaborn provides.
# https://github.com/mwaskom/seaborn-data
df = sns.load_dataset("tips")

# Target variable: tip (the model predicts tip from the other data).
y_col = "tip"
y = df[y_col]

# Single feature: total_bill. Selecting with a list of columns gives a
# 2-D array of shape (n_samples, 1) directly, so no reshape is needed.
X = df[["total_bill"]].to_numpy()

In [5]:
# Cross-validation setup: shuffle the rows and split them into k = 5 folds;
# the validation cycle is then run once per fold.
k = 5
cv = KFold(
    n_splits=k,
    shuffle=True,
    random_state=0,  # fixed seed for the shuffle
)

In [6]:
model = LinearRegression()
# Test-set MSE of each cross-validation cycle is collected here
mse_list = []
for train_index, test_index in cv.split(X):
    # cv.split() yields *positional* row indices for this cycle's train/test
    # partitions. X is a NumPy array, so plain indexing is positional; y is a
    # pandas Series, so .iloc is required -- y[train_index] would be a label
    # lookup that only works while the index is the default 0..n-1.
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Only one feature is used, so no standardization is applied

    # Fit the model on the training partition
    model.fit(X_train, y_train)
    # Predict the held-out partition
    y_pred = model.predict(X_test)
    # Mean squared error of this cycle
    mse = mean_squared_error(y_test, y_pred)
    mse_list.append(mse)

In [9]:
# k = 5, so five per-fold MSE values are printed, followed by their mean and
# standard deviation.
print(mse_list)
mse_mean = np.mean(mse_list)
mse_std = np.std(mse_list)
print(f"MSE({k}-FoldCV):{mse_mean}")
print(f"std:{mse_std}")

[0.8213090642766285, 1.0745842125927976, 1.0880123892600388, 1.3323867714930204, 1.084763004349474]
MSE(5-FoldCV):1.080211088394392
std:0.1617010050703952


上のfor文の処理は、以下のようにcross_val_scoreを用いることで、自分で実装しなくても行える

In [10]:
from sklearn.model_selection import cross_val_score
# Same 5-fold configuration as the manual loop; cross_val_score runs the
# fit / predict / score cycle for every fold.
k = 5
cv = KFold(n_splits=k, shuffle=True, random_state=0)
# The "neg_mean_squared_error" scorer returns the MSE with its sign flipped.
scores = cross_val_score(
    model,
    X,
    y,
    scoring="neg_mean_squared_error",
    cv=cv,
)

# Negate the mean to report a positive MSE; the standard deviation does not
# depend on the sign.
print(f"MSE({k}-FoldCV):{-np.mean(scores)}")
print(f"std:{np.std(scores)}")

MSE(5-FoldCV):1.080211088394392
std:0.1617010050703952


### Repeated k-Fold CV
分割の仕方を変えながらk-Fold CVを複数回繰り返すことで、分割による偏りを減らす

In [13]:
from sklearn.model_selection import RepeatedKFold

k = 5
n_repeats = 3
# k-fold CV repeated n_repeats times
cv = RepeatedKFold(n_splits=k, n_repeats=n_repeats, random_state=0)

model = LinearRegression()
# Test-set MSE of each cross-validation cycle is collected here
mse_list = []
for train_index, test_index in cv.split(X):
    # cv.split() yields *positional* row indices for this cycle's train/test
    # partitions. X is a NumPy array, so plain indexing is positional; y is a
    # pandas Series, so .iloc is required -- y[train_index] would be a label
    # lookup that only works while the index is the default 0..n-1.
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Only one feature is used, so no standardization is applied

    # Fit the model on the training partition
    model.fit(X_train, y_train)
    # Predict the held-out partition
    y_pred = model.predict(X_test)
    # Mean squared error of this cycle
    mse = mean_squared_error(y_test, y_pred)
    mse_list.append(mse)

print(mse_list)  # 5 * 3 = 15 MSE values are obtained
print(f"MSE({n_repeats} repeated {k}-FoldCV):{np.mean(mse_list)}")
print(f"std:{np.std(mse_list)}")

[0.8213090642766285, 1.0745842125927976, 1.0880123892600388, 1.3323867714930204, 1.084763004349474, 1.1587839131131425, 1.6042084002514578, 1.0307086207441927, 0.7120290668798744, 0.8472985410140895, 0.8856103319481907, 1.5248521639391936, 0.6332659028150582, 1.2003542002626073, 1.121414266809207]
MSE(3 repeated 5-FoldCV):1.0746387233165984
std:0.26517178540898434


In [15]:
# The manual repeated k-fold loop collapses into one cross_val_score call;
# the "neg_mean_squared_error" scorer returns the MSE with its sign flipped.
scores = cross_val_score(
    model,
    X,
    y,
    scoring="neg_mean_squared_error",
    cv=cv,
)

# Negate the mean to report a positive MSE; the standard deviation does not
# depend on the sign.
print(f"MSE({n_repeats} repeated {k}-FoldCV):{-np.mean(scores)}")
print(f"std:{np.std(scores)}")

MSE(3 repeated 5-FoldCV):1.0746387233165984
std:0.26517178540898434


## 標準化をk-Fold CVに組み込む
**Pipeline**オブジェクトを使うことで、 
- 標準化
- モデル学習
- 評価 

などの複数の処理をまとめることが可能

In [16]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Chain standardization and linear regression into a single estimator object.
pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("model", LinearRegression()),
    ]
)
# Bare last expression so the notebook displays the pipeline
pipeline

In [23]:
# 5-fold CV of the scaler + regression pipeline. The pipeline is passed as
# the estimator, so the whole chain (scaler included) is fitted on each
# cycle's training partition.
k = 5
cv = KFold(n_splits=k, shuffle=True, random_state=0)
# The "neg_mean_squared_error" scorer returns the MSE with its sign flipped.
scores = cross_val_score(pipeline, X, y, cv=cv, scoring="neg_mean_squared_error")

# Negate the mean to report a positive MSE; the standard deviation does not
# depend on the sign.
print(f"MSE({k}-FoldCV):{-np.mean(scores)}")
print(f"std:{np.std(scores)}")

MSE(5-FoldCV):1.0802110883943916
std:0.1617010050703952


参考として、シンプルなhold-out法で、Pipelineを使う場合と使わない場合の処理を比較する

In [20]:
# pipelineなしの標準化 + 線形回帰

from sklearn.model_selection import train_test_split

# Hold-out split: 30% of the rows are kept aside as the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
model = LinearRegression()
scaler = StandardScaler()
# The scaler is fitted on the training data only; the test data is merely
# transformed with the fitted scaler.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)
# Display the predictions so they can be compared with the pipeline version
y_pred

array([2.71486884, 2.78639251, 2.90900452, 1.65836207, 2.57999564,
       1.50509707, 2.74858715, 3.30136293, 2.77208778, 4.45800284,
       3.50060744, 3.49345507, 2.35520697, 2.24587793, 2.28879213,
       4.02375199, 1.77075641, 2.3480546 , 2.83645908, 3.2778623 ,
       3.98901192, 3.05511716, 2.55240794, 2.45431834, 2.29798803,
       2.59327861, 2.16004953, 3.96244599, 3.50162921, 2.5289073 ,
       2.42264357, 2.19274606, 2.49314547, 1.99963215, 2.78639251,
       2.28572683, 2.64743224, 1.97306622, 5.85577969, 2.55036441,
       1.79425705, 2.18763723, 2.52073317, 3.96755482, 2.22135553,
       2.65151931, 2.78128368, 3.12255376, 2.66173698, 3.66409011,
       4.2567148 , 2.74552185, 3.01118119, 5.83943142, 1.89847725,
       2.14676656, 3.97572896, 3.03161652, 2.37462053, 2.21113786,
       3.70496078, 2.53299437, 3.07963956, 3.47199797, 3.99718606,
       2.5043849 , 2.60043097, 4.2720413 , 1.97306622, 3.87763935,
       2.4890584 , 1.99145802, 3.43010554, 2.37972937])

In [24]:
# Hold-out evaluation with a Pipeline
pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("model", LinearRegression()),
    ]
)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)
# pipeline.fit can be pictured as running the scaler's fit and then the
# model's fit, in that order; the fitted pipeline then predicts the test set.
y_pred_p = pipeline.fit(X_train, y_train).predict(X_test)
y_pred_p

array([2.71486884, 2.78639251, 2.90900452, 1.65836207, 2.57999564,
       1.50509707, 2.74858715, 3.30136293, 2.77208778, 4.45800284,
       3.50060744, 3.49345507, 2.35520697, 2.24587793, 2.28879213,
       4.02375199, 1.77075641, 2.3480546 , 2.83645908, 3.2778623 ,
       3.98901192, 3.05511716, 2.55240794, 2.45431834, 2.29798803,
       2.59327861, 2.16004953, 3.96244599, 3.50162921, 2.5289073 ,
       2.42264357, 2.19274606, 2.49314547, 1.99963215, 2.78639251,
       2.28572683, 2.64743224, 1.97306622, 5.85577969, 2.55036441,
       1.79425705, 2.18763723, 2.52073317, 3.96755482, 2.22135553,
       2.65151931, 2.78128368, 3.12255376, 2.66173698, 3.66409011,
       4.2567148 , 2.74552185, 3.01118119, 5.83943142, 1.89847725,
       2.14676656, 3.97572896, 3.03161652, 2.37462053, 2.21113786,
       3.70496078, 2.53299437, 3.07963956, 3.47199797, 3.99718606,
       2.5043849 , 2.60043097, 4.2720413 , 1.97306622, 3.87763935,
       2.4890584 , 1.99145802, 3.43010554, 2.37972937])