# <1. 교차검증(Cross Validation)>

- 과적합 : 모델이 학습 데이터에만 과도하게 최적화되어 일반화된 데이터에서는 예측 성능이 과하게 떨어지는 현상
- 교차검증 : 주어진 데이터를 여러 번 나누어 학습/검증을 반복함으로써, 모델의 성능을 더 정확하게 평가하는 데 유용한 방법

### 1. 예시

In [15]:
import numpy as np
from sklearn.model_selection import KFold

# Toy dataset: four samples with two features each, and one label per sample.
X = np.arange(1, 9).reshape(4, 2)
y = np.arange(1, 5)

# Show the features, a blank line, then the labels.
print(X, y, sep="\n\n")

[[1 2]
 [3 4]
 [5 6]
 [7 8]]

[1 2 3 4]


In [12]:
# Splitter that cuts the samples into two folds.
kf = KFold(n_splits=2)

# Report the number of splits first, then the splitter's own configuration.
print(kf.get_n_splits(X), kf, sep="\n")

2
KFold(n_splits=2, random_state=None, shuffle=False)


In [16]:
# KFold.split yields index arrays (not the data itself) for every fold.

for fit_rows, held_rows in kf.split(X):
    print('***** idx *****')
    print('train_idx : ', fit_rows)
    print('test_idx : ', held_rows)
    # Use the index arrays to pull the matching rows out of X.
    for title, rows in (('--- train data', fit_rows), ('--- test data', held_rows)):
        print(title)
        print(X[rows])
    print()

***** idx *****
train_idx :  [2 3]
test_idx :  [0 1]
--- train data
[[5 6]
 [7 8]]
--- test data
[[1 2]
 [3 4]]

***** idx *****
train_idx :  [0 1]
test_idx :  [2 3]
--- train data
[[1 2]
 [3 4]]
--- test data
[[5 6]
 [7 8]]



### 2. 와인 데이터에 적용
- 와인 맛 분류 

In [24]:
# Load the red and white wine-quality tables.
import pandas as pd

red_url = (
    "https://raw.githubusercontent.com/PinkWink/ML_tutorial"
    "/master/dataset/winequality-red.csv"
)
white_url = (
    "https://raw.githubusercontent.com/PinkWink/ML_tutorial"
    "/master/dataset/winequality-white.csv"
)

# Both files are read with ';' as the field separator.
red_wine = pd.read_csv(red_url, sep=';')
white_wine = pd.read_csv(white_url, sep=';')

# Tag each table with its color (1 = red, 0 = white), then stack them
# row-wise into a single frame.
red_wine = red_wine.assign(color=1)
white_wine = white_wine.assign(color=0)

wine = pd.concat([red_wine, white_wine])

In [25]:
# Binary target: 1.0 when the quality grade is above 5, otherwise 0.0.
# The values are assigned positionally (as a plain array), row by row.
wine['taste'] = (wine['quality'] > 5).astype(float).to_numpy()

# Features: every column except the target and the grade it is derived from.
X = wine.drop(columns=['taste', 'quality'])
y = wine['taste']

In [26]:
# Baseline: train and evaluate with a single train/test split.

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=13
)

# fit() returns the estimator itself, so construction and fitting can be chained.
wine_tree = DecisionTreeClassifier(max_depth=2, random_state=13).fit(X_train, y_train)

# Predict on the training rows and on the held-out rows.
y_pred_tr, y_pred_test = (wine_tree.predict(part) for part in (X_train, X_test))

print("Train Acc : ", accuracy_score(y_train, y_pred_tr))
print("Test Acc : ", accuracy_score(y_test, y_pred_test))

Train Acc :  0.7294593034442948
Test Acc :  0.7161538461538461


- 이때, "데이터를 저렇게 분리하는 게 최선인가? 저 acc를 어떻게 신뢰할 수 있는가?" 라고 누가 묻는다면...

In [27]:
from sklearn.model_selection import KFold

# Shallow tree that is re-fitted on each fold.
wine_tree_cv = DecisionTreeClassifier(max_depth=2, random_state=13)

# Five consecutive folds; shuffle=False is KFold's default, spelled out here.
kfold = KFold(n_splits=5, shuffle=False)

In [29]:
# KFold.split yields (train indices, test indices); print the size of each part.

for fold in kfold.split(X):
    print(*map(len, fold))

5197 1300
5197 1300
5198 1299
5198 1299
5198 1299


In [30]:
# K-fold training: fit on each fold's training rows and record the accuracy
# measured on that fold's held-out rows.

cv_acc = []

for fit_rows, eval_rows in kfold.split(X):
    # The splitter returns positions, hence iloc.
    X_train, y_train = X.iloc[fit_rows], y.iloc[fit_rows]
    X_test, y_test = X.iloc[eval_rows], y.iloc[eval_rows]

    # fit() returns the estimator, so predict() can be chained onto it.
    pred = wine_tree_cv.fit(X_train, y_train).predict(X_test)
    cv_acc.append(accuracy_score(y_test, pred))

cv_acc

[0.6007692307692307,
 0.6884615384615385,
 0.7090069284064665,
 0.7628945342571208,
 0.7867590454195535]

In [31]:
# Mean accuracy over the folds.

np.asarray(cv_acc).mean()

0.709578255462782

- 교차검증 결과, 평균 약 71% acc 가 나왔다.

### 3. Stratified KFold

In [33]:
from sklearn.model_selection import StratifiedKFold

# Same experiment as the plain K-fold loop, but the splitter also receives
# the labels so it can stratify the folds on them.
wine_tree_cv = DecisionTreeClassifier(max_depth=2, random_state=13)
skfold = StratifiedKFold(n_splits=5)

cv_acc_s = []

for fit_rows, eval_rows in skfold.split(X, y):
    # The splitter returns positions, hence iloc.
    X_train, y_train = X.iloc[fit_rows], y.iloc[fit_rows]
    X_test, y_test = X.iloc[eval_rows], y.iloc[eval_rows]

    # fit() returns the estimator, so predict() can be chained onto it.
    pred = wine_tree_cv.fit(X_train, y_train).predict(X_test)
    cv_acc_s.append(accuracy_score(y_test, pred))

cv_acc_s

[0.5523076923076923,
 0.6884615384615385,
 0.7143956889915319,
 0.7321016166281755,
 0.7567359507313318]

In [35]:
# Mean accuracy over the stratified folds.

np.asarray(cv_acc_s).mean()

0.6888004974240539

- 평균값이 더 낮아졌다.

### 4. 간단한 방법으로 교차검증 하기

In [36]:
from sklearn.model_selection import cross_val_score

# cross_val_score runs the whole split / fit / score loop in a single call.
wine_tree_cv = DecisionTreeClassifier(max_depth=2, random_state=13)
skfold = StratifiedKFold(n_splits=5)

# scoring=None: no explicit metric is requested.
cross_val_score(estimator=wine_tree_cv, X=X, y=y, cv=skfold, scoring=None)

array([0.55230769, 0.68846154, 0.71439569, 0.73210162, 0.75673595])

In [37]:
# Repeat the cross validation with max_depth raised to 5.

wine_tree_cv = DecisionTreeClassifier(max_depth=5, random_state=13)
skfold = StratifiedKFold(n_splits=5)

# scoring=None: no explicit metric is requested.
cross_val_score(estimator=wine_tree_cv, X=X, y=y, cv=skfold, scoring=None)

array([0.50076923, 0.62615385, 0.69745958, 0.7582756 , 0.74903772])

- depth가 높다고 무조건 acc가 좋아지는 것도 아니다!

### 5. train score와 함께 보기

In [38]:
from sklearn.model_selection import cross_validate

# cross_validate also reports fit/score times and, with
# return_train_score=True, the score on each fold's training rows.
cross_validate(
    estimator=wine_tree_cv,
    X=X,
    y=y,
    cv=skfold,
    scoring=None,
    return_train_score=True,
)

{'fit_time': array([0.03091502, 0.04089475, 0.03291178, 0.03390861, 0.03491116]),
 'score_time': array([0.00299954, 0.00896478, 0.00399208, 0.00498843, 0.00398445]),
 'test_score': array([0.50076923, 0.62615385, 0.69745958, 0.7582756 , 0.74903772]),
 'train_score': array([0.78795459, 0.78045026, 0.77568295, 0.76356291, 0.76279338])}

- 과적합 문제도 함께 나타나고 있다.
- 이것만 가지고 모델의 좋고 나쁨을 판단할 수는 없다.

----
# <2. 하이퍼파라미터 튜닝>
* 모델의 성능을 확보하기 위해 조절하는 설정 값 (ex. max_depth)

In [39]:
# Load the red and white wine-quality tables.
import pandas as pd

red_url = (
    "https://raw.githubusercontent.com/PinkWink/ML_tutorial"
    "/master/dataset/winequality-red.csv"
)
white_url = (
    "https://raw.githubusercontent.com/PinkWink/ML_tutorial"
    "/master/dataset/winequality-white.csv"
)

# Both files are read with ';' as the field separator.
red_wine = pd.read_csv(red_url, sep=';')
white_wine = pd.read_csv(white_url, sep=';')

# Tag each table with its color (1 = red, 0 = white), then stack them
# row-wise into a single frame.
red_wine = red_wine.assign(color=1)
white_wine = white_wine.assign(color=0)

wine = pd.concat([red_wine, white_wine])

# Binary target: 1.0 when the quality grade is above 5, otherwise 0.0.
# The values are assigned positionally (as a plain array), row by row.
wine['taste'] = (wine['quality'] > 5).astype(float).to_numpy()

# Features: every column except the target and the grade it is derived from.
X = wine.drop(columns=['taste', 'quality'])
y = wine['taste']

### 1. GridSearch CV
- cv : cross validation
- 수정하고 싶은 파라미터를 dict(딕셔너리)로 정의하면 된다.

In [40]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Base estimator; the grid search tries each max_depth candidate on it.
wine_tree = DecisionTreeClassifier(max_depth=2, random_state=13)

# Candidate values, keyed by the parameter name to tune.
params = dict(max_depth=[2, 4, 7, 10])

# cv=5: every candidate is scored with 5-fold cross validation.
gs = GridSearchCV(wine_tree, params, cv=5)
gs.fit(X, y)

GridSearchCV(cv=5,
             estimator=DecisionTreeClassifier(max_depth=2, random_state=13),
             param_grid={'max_depth': [2, 4, 7, 10]})

- estimator : 사용하는 모델
- param_grid : 수정하는 파라미터
- cv : 5겹

- fit 시킬 때, train-test split을 하지 않아도 GridSearchCV가 알아서 해줌. 개꿀

In [42]:
# Inspect the per-candidate cross-validation results.
import pprint

pp = pprint.PrettyPrinter(indent=4)

# pformat() builds the same text that pp.pprint() would write.
print(pp.pformat(gs.cv_results_))

{   'mean_fit_time': array([0.01885824, 0.0216455 , 0.03071671, 0.04072104]),
    'mean_score_time': array([0.00418229, 0.00399008, 0.00279326, 0.00300007]),
    'mean_test_score': array([0.6888005 , 0.66356523, 0.65340854, 0.64401587]),
    'param_max_depth': masked_array(data=[2, 4, 7, 10],
             mask=[False, False, False, False],
       fill_value='?',
            dtype=object),
    'params': [   {'max_depth': 2},
                  {'max_depth': 4},
                  {'max_depth': 7},
                  {'max_depth': 10}],
    'rank_test_score': array([1, 2, 3, 4]),
    'split0_test_score': array([0.55230769, 0.51230769, 0.50846154, 0.51615385]),
    'split1_test_score': array([0.68846154, 0.63153846, 0.60307692, 0.60076923]),
    'split2_test_score': array([0.71439569, 0.72363356, 0.68360277, 0.66743649]),
    'split3_test_score': array([0.73210162, 0.73210162, 0.73672055, 0.71054657]),
    'split4_test_score': array([0.75673595, 0.7182448 , 0.73518091, 0.72517321]),
    'std

- 'rank_test_score': array([1, 2, 3, 4]) : max_depth 2, 4, 7, 10 순서대로 1, 2, 3, 4등

In [49]:
# Summarize the best candidate found by the grid search.

best_summary = (
    ("gs.best_estimator_ : ", gs.best_estimator_),
    ("gs.best_score_     : ", gs.best_score_),
    ("gs.best_params_    : ", gs.best_params_),
    ("gs.best_index_     : ", gs.best_index_),
)

for label, value in best_summary:
    print(label, value)

gs.best_estimator_ :  DecisionTreeClassifier(max_depth=2, random_state=13)
gs.best_score_     :  0.6888004974240539
gs.best_params_    :  {'max_depth': 2}
gs.best_index_     :  0


### 2. Pipeline에 적용

In [50]:
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler

# Two named steps: standardize the features, then classify with a tree.
# The step names are what grid-search parameter keys refer to.
scaler_step = ('scaler', StandardScaler())
tree_step = ('clf', DecisionTreeClassifier(random_state=13))
estimators = [scaler_step, tree_step]

pipe = Pipeline(steps=estimators)

In [51]:
# Pipeline parameters are addressed as "<step name>__<parameter>";
# note the double underscore.
param_pipe = [{'clf__max_depth': [2, 4, 7, 10]}]

# cv=5: every candidate is scored with 5-fold cross validation.
gs_p = GridSearchCV(pipe, param_pipe, cv=5)
gs_p.fit(X, y)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('clf',
                                        DecisionTreeClassifier(random_state=13))]),
             param_grid=[{'clf__max_depth': [2, 4, 7, 10]}])

In [52]:
# Pretty-print the cross-validation results of the pipeline grid search.
pp = pprint.PrettyPrinter(indent=4)

# pformat() builds the same text that pp.pprint() would write.
print(pp.pformat(gs_p.cv_results_))

{   'mean_fit_time': array([0.03570304, 0.02752566, 0.03311954, 0.04388156]),
    'mean_score_time': array([0.00877643, 0.00359135, 0.00298481, 0.00379219]),
    'mean_test_score': array([0.6888005 , 0.66356523, 0.6534083 , 0.64401563]),
    'param_clf__max_depth': masked_array(data=[2, 4, 7, 10],
             mask=[False, False, False, False],
       fill_value='?',
            dtype=object),
    'params': [   {'clf__max_depth': 2},
                  {'clf__max_depth': 4},
                  {'clf__max_depth': 7},
                  {'clf__max_depth': 10}],
    'rank_test_score': array([1, 2, 3, 4]),
    'split0_test_score': array([0.55230769, 0.51230769, 0.50846154, 0.51615385]),
    'split1_test_score': array([0.68846154, 0.63153846, 0.60461538, 0.60230769]),
    'split2_test_score': array([0.71439569, 0.72363356, 0.68206313, 0.66589684]),
    'split3_test_score': array([0.73210162, 0.73210162, 0.73672055, 0.71054657]),
    'split4_test_score': array([0.75673595, 0.7182448 , 0.7351809

In [53]:
# Summarize the best candidate found by the pipeline grid search.
best_summary_p = (
    ("gs_p.best_estimator_ : ", gs_p.best_estimator_),
    ("gs_p.best_score_     : ", gs_p.best_score_),
    ("gs_p.best_params_    : ", gs_p.best_params_),
    ("gs_p.best_index_     : ", gs_p.best_index_),
)

for label, value in best_summary_p:
    print(label, value)

gs_p.best_estimator_ :  Pipeline(steps=[('scaler', StandardScaler()),
                ('clf', DecisionTreeClassifier(max_depth=2, random_state=13))])
gs_p.best_score_     :  0.6888004974240539
gs_p.best_params_    :  {'clf__max_depth': 2}
gs_p.best_index_     :  0


### 3. 결과를 표로 정리하기

In [54]:
import pandas as pd

# Turn the cv_results_ dict into a table and keep only the columns that
# compare the candidates: parameters, rank, mean score and its spread.
score_df = pd.DataFrame(gs_p.cv_results_)
summary_cols = ['params', 'rank_test_score', 'mean_test_score', 'std_test_score']
score_df.loc[:, summary_cols]

Unnamed: 0,params,rank_test_score,mean_test_score,std_test_score
0,{'clf__max_depth': 2},1,0.6888,0.071799
1,{'clf__max_depth': 4},2,0.663565,0.083905
2,{'clf__max_depth': 7},3,0.653408,0.086993
3,{'clf__max_depth': 10},4,0.644016,0.076915
