### hold-out

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
# Load seaborn's example "tips" dataset; the tip amount is the regression target.
df = sns.load_dataset('tips')
y_col = 'tip'
# Feature frame: every column except the target.
X = df.drop(columns=y_col)
# Record the numeric feature names now (before one-hot encoding) so that only
# these columns are standardized later.
numeric_cols = list(X.select_dtypes(include='number').columns)

In [3]:
# Display the loaded DataFrame as this cell's output.
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [4]:
# Show the numeric feature column names collected for standardization.
numeric_cols

['total_bill', 'size']

In [5]:
# Target vector.
y = df[y_col]
# One-hot encode the categorical features; drop_first removes one level per
# category so the dummy columns are not perfectly collinear.
X = pd.get_dummies(X, drop_first=True)
# Hold-out split: 70% train / 30% test, seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [6]:
# Check the number of rows in each partition (train first, then test).
print(len(X_train), len(X_test), sep="\n")

170
74


In [7]:
# Standardization.
# The scaler is fitted only after the split, on the training rows alone, so
# that no statistics from the test rows leak into preprocessing.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Learn mean / standard deviation from the numeric training columns.
scaler.fit(X_train[numeric_cols])
# Work on a copy so X_train itself stays unscaled; only the numeric columns
# are replaced, the one-hot columns are left as they are.
X_train_scaled = X_train.copy()
X_train_scaled[numeric_cols] = scaler.transform(X_train[numeric_cols])
X_train_scaled

Unnamed: 0,total_bill,size,sex_Female,smoker_No,day_Fri,day_Sat,day_Sun,time_Dinner
61,-0.682321,-0.616436,0,0,0,1,0,1
146,-0.152575,0.463915,1,1,0,0,0,0
52,1.620922,1.544266,1,1,0,0,1,1
66,-0.392771,-0.616436,1,1,0,1,0,1
26,-0.730580,-0.616436,0,1,0,1,0,1
...,...,...,...,...,...,...,...,...
67,-1.860266,-1.696786,1,0,0,1,0,1
192,0.922272,-0.616436,0,0,0,0,0,0
117,-1.028905,-0.616436,1,1,0,0,0,0
47,1.356598,1.544266,0,1,0,0,1,1


In [8]:
# Scale the test features with the mean / variance learned from the training
# data: the scaler is already fitted, so only transform() is called here
# (never fit on the test set).
X_test_scaled = X_test.copy()
X_test_scaled[numeric_cols] = scaler.transform(X_test.loc[:, numeric_cols])

In [9]:
# Train a linear regression model on the scaled training data
from sklearn.linear_model import LinearRegression
# fit() returns the estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(X_train_scaled, y_train)
# Predict tips for the held-out test rows.
y_pred = model.predict(X_test_scaled)

In [10]:
# Evaluate the model on the hold-out set with mean squared error (MSE)
from sklearn.metrics import mean_squared_error
mean_squared_error(y_true=y_test, y_pred=y_pred)  # same as np.mean(np.square(y_test - y_pred))

0.955080898861715

### LOOCV

In [11]:
# Reload the tips dataset for the LOOCV section and display it.
df = sns.load_dataset('tips')
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [12]:
# Data preparation: a single feature (total_bill) as an (n_samples, 1) array,
# which is the 2-D shape scikit-learn estimators expect, and tip as the target.
X = df[['total_bill']].to_numpy()
y = df['tip']

In [13]:
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import LeaveOneOut
# Leave-one-out splitter: each split holds out a single sample as the test set.
loo = LeaveOneOut()

In [14]:
# Materialize the splits and show the first 10 (train_indices, test_indices) pairs.
list(loo.split(X))[:10]

[(array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
          14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
          27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
          40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
          53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
          66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
          79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
          92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104,
         105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117,
         118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130,
         131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
         144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156,
         157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
         170, 171, 172, 173, 174, 175,

In [15]:
# Manual LOOCV: refit the model once per held-out sample and record that
# sample's prediction error.
model = LinearRegression()
mse_list = []
for train_index, test_index in loo.split(X):
    X_train, X_test = X[train_index], X[test_index]
    # split() yields positional indices. X is a NumPy array, so X[...] is
    # already positional; y is a pandas Series, where y[...] would look the
    # integers up as index labels and is only correct while the Series keeps
    # its default 0..n-1 index. Select by position explicitly with .iloc.
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Fit on every sample except the held-out one
    model.fit(X_train, y_train)
    # Predict the held-out sample
    y_pred = model.predict(X_test)
    # With a single test sample, the MSE is just that sample's squared error
    mse = mean_squared_error(y_test, y_pred)
    mse_list.append(mse)

In [16]:
# Report the LOOCV estimate (mean of the per-sample errors) and their spread.
print(
    f"MSE(LOOCV):{np.mean(mse_list)}",
    f"std:{np.std(mse_list)}",
    sep="\n",
)

MSE(LOOCV):1.0675673489857438
std:2.0997944551776313


In [17]:
# cross_val_score runs the same cross-validation loop in a single call
from sklearn.model_selection import cross_val_score
cv = LeaveOneOut()
# The 'neg_mean_squared_error' scorer reports negated MSE (so that larger is
# better); flip the sign back for the mean. The standard deviation is not
# affected by the sign.
scores = cross_val_score(
    estimator=model,
    X=X,
    y=y,
    scoring='neg_mean_squared_error',
    cv=cv,
)
print(f"MSE(LOOCV):{-scores.mean()}")
print(f"std:{scores.std()}")

MSE(LOOCV):1.0675673489857438
std:2.0997944551776313


### k-Fold CV

In [18]:
from sklearn.model_selection import KFold, RepeatedKFold
# k folds per repetition, repeated n_repeats times with different shuffles,
# giving k * n_repeats train/test splits in total.
k, n_repeats = 5, 3
# For a single, non-repeated run, KFold(n_splits=k, shuffle=True, random_state=0)
# can be used here instead.
cv = RepeatedKFold(n_repeats=n_repeats, n_splits=k, random_state=0)
# Peek at the first two (train_indices, test_indices) pairs.
list(cv.split(X))[:2]

[(array([  0,   1,   2,   3,   4,   6,   7,   9,  10,  11,  13,  14,  16,
          17,  19,  20,  21,  23,  24,  25,  26,  27,  28,  29,  30,  31,
          32,  33,  34,  35,  36,  38,  39,  40,  41,  42,  43,  46,  47,
          48,  49,  50,  51,  52,  53,  54,  56,  57,  58,  59,  60,  61,
          62,  65,  66,  67,  68,  69,  70,  72,  75,  77,  78,  79,  80,
          81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  93,  94,
          95,  96,  97,  98,  99, 100, 101, 102, 103, 105, 106, 109, 112,
         113, 114, 115, 116, 117, 119, 120, 121, 122, 123, 126, 127, 128,
         129, 130, 131, 132, 133, 134, 135, 137, 139, 140, 141, 142, 143,
         144, 146, 147, 148, 149, 151, 152, 153, 154, 156, 157, 159, 160,
         161, 162, 163, 164, 165, 166, 167, 169, 170, 171, 172, 173, 174,
         175, 176, 177, 178, 179, 182, 183, 184, 185, 186, 187, 190, 191,
         192, 193, 194, 195, 196, 197, 200, 201, 202, 204, 205, 206, 207,
         208, 211, 213, 214, 215, 216,

In [19]:
# Manual (repeated) k-fold CV: refit the model on each training fold and
# record the MSE on the corresponding test fold.
model = LinearRegression()
mse_list = []
for train_index, test_index in cv.split(X):
    X_train, X_test = X[train_index], X[test_index]
    # split() yields positional indices. X is a NumPy array, so X[...] is
    # already positional; y is a pandas Series, where y[...] would look the
    # integers up as index labels and is only correct while the Series keeps
    # its default 0..n-1 index. Select by position explicitly with .iloc.
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Fit on the training folds
    model.fit(X_train, y_train)
    # Predict the held-out fold
    y_pred = model.predict(X_test)
    # MSE on the held-out fold
    mse = mean_squared_error(y_test, y_pred)
    mse_list.append(mse)

In [20]:
# Per-split MSE values collected by the loop above (one entry per train/test split).
mse_list

[0.8213090642766285,
 1.0745842125927976,
 1.0880123892600384,
 1.3323867714930204,
 1.084763004349474,
 1.158783913113142,
 1.6042084002514578,
 1.0307086207441924,
 0.7120290668798743,
 0.8472985410140897,
 0.8856103319481908,
 1.5248521639391936,
 0.6332659028150582,
 1.200354200262607,
 1.121414266809207]

In [21]:
# Report the cross-validated MSE (mean over all splits) and its spread.
print(
    f"MSE({k}FoldCV): {np.mean(mse_list)}",
    f"std: {np.std(mse_list)}",
    sep="\n",
)

MSE(5FoldCV): 1.0746387233165982
std: 0.2651717854089844


In [22]:
# The same repeated k-fold evaluation can be done in one call with cross_val_score.
# Scores come back as negated MSE, so the mean is negated again for display.
scores = cross_val_score(
    estimator=model,
    X=X,
    y=y,
    scoring='neg_mean_squared_error',
    cv=cv,
)
print(f"MSE({k}FoldCV): {-scores.mean()}")
print(f"std: {scores.std()}")

MSE(5FoldCV): 1.0746387233165982
std: 0.2651717854089844


## Pipeline

### Pipeline + KFold

In [23]:
from sklearn.pipeline import Pipeline
# Chain standardization and linear regression into a single estimator, so the
# scaler is fitted only on whatever data the pipeline itself is fitted on.
pipeline = Pipeline(
    [
        ('scaler', StandardScaler()),
        ('model', LinearRegression()),
    ]
)
pipeline

In [24]:
# Shuffled 5-fold CV with a fixed seed; the whole pipeline (scaler + model) is
# refitted on each training fold.
cv = KFold(n_splits=5, random_state=0, shuffle=True)
scores = cross_val_score(
    estimator=pipeline,
    X=X,
    y=y,
    scoring='neg_mean_squared_error',
    cv=cv,
)
scores

array([-0.82130906, -1.07458421, -1.08801239, -1.33238677, -1.084763  ])

In [25]:
# Without a Pipeline:
# standardization + linear regression, wired together by hand.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)
# Fit the scaler on the training data only, then apply it to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# fit() returns the estimator, so the model can be created and fitted in one step.
model = LinearRegression().fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

In [26]:
## With a Pipeline: the same split, but scaling and regression run as one estimator.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)
scale_then_regress = [('scaler', StandardScaler()), ('model', LinearRegression())]
# fit() returns the pipeline itself, so it can be built and fitted in one expression.
pipeline = Pipeline(steps=scale_then_regress).fit(X_train, y_train)
# predict() applies the fitted scaler to X_test before calling the model.
y_pred_p = pipeline.predict(X_test)

In [27]:
# Both approaches give the same predictions.
# Compare with a floating-point tolerance rather than exact ==, so the check
# does not depend on bit-identical results from the two code paths.
np.allclose(y_pred_p, y_pred)

True