# 管線測試

In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_wine
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
# NOTE: the cross-validation loop below indexes the data positionally
# (e.g. X_train[train]); that works on NumPy arrays but not on a DataFrame
# (a DataFrame would need .iloc), so the data is loaded as arrays here.
X, y = load_wine(return_X_y=True) # returns NumPy arrays directly
# wine = load_wine(as_frame=True)   # alternative: DataFrame-based result
# print(wine.DESCR)
# print(wine.data.head())
# print(wine.target.head())

In [3]:
# Hold out 20% of the samples as the test set. The fixed random_state makes
# the shuffle -- and therefore every downstream score -- reproducible across
# kernel restarts (the same seed value the classifier already uses).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

In [4]:
# Pipeline definition: standardise the features, project them onto the first
# two principal components, then fit a logistic-regression classifier.
pipe_lr = make_pipeline(
    StandardScaler(),
    PCA(n_components=2),
    LogisticRegression(random_state=1),
)

In [None]:
# Stratified 5-fold cross-validation on the training split: the data is cut
# into 5 parts and the model is trained/evaluated 5 times, with every fold
# keeping roughly the same class proportions as the whole training set.
#
# `.split()` returns a one-shot generator of (train_idx, test_idx) pairs.
# It is materialised into a list so the folds can be iterated again (e.g. when
# only the loop below is re-run) instead of silently yielding nothing.
#
# Alternative splitters:
#   KFold(n_splits=5)                                # plain K-fold, no shuffling
#   KFold(n_splits=5, shuffle=True, random_state=0)  # shuffle before splitting
kfold = list(StratifiedKFold(n_splits=5).split(X_train, y_train))

scores = []  # validation accuracy of each fold
for k, (train, test) in enumerate(kfold):
    # k is the 0-based fold number; `train` / `test` are integer index arrays
    # into X_train / y_train for this fold.
    print(train.shape, test.shape)
    pipe_lr.fit(X_train[train], y_train[train])  # refit on this fold's training part
    score = pipe_lr.score(X_train[test], y_train[test])  # accuracy on the held-out part
    scores.append(score)

    # np.bincount: per-class sample counts of this fold's training labels
    print('Fold: %2d, Class dist.: %s, Acc: %.3f' % (k+1,
          np.bincount(y_train[train]), score))

# CV = cross-validation; report mean accuracy and its spread across folds.
print('\nCV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

(113,) (29,)
Fold:  1, Class dist.: [34 47 32], Acc: 0.966
(113,) (29,)


Fold:  2, Class dist.: [34 47 32], Acc: 0.966
(114,) (28,)
Fold:  3, Class dist.: [35 47 32], Acc: 0.964
(114,) (28,)
Fold:  4, Class dist.: [35 47 32], Acc: 0.964
(114,) (28,)
Fold:  5, Class dist.: [34 48 32], Acc: 0.929

CV accuracy: 0.958 +/- 0.015


std（標準差）代表各折準確率的離散程度，可用來衡量模型表現的穩定性。

std 愈大，表示各折準確率的起伏愈大，模型表現愈不穩定。