### k-fold

In [1]:
# import package
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
import numpy as np
from sklearn.datasets import load_iris

# Load the iris data set and create the model to be evaluated
iris = load_iris()
X, y = iris.data, iris.target
dt_clf = DecisionTreeClassifier(random_state=12345)

# 5-fold CV
# shuffle=False: the rows are split in their stored order. split() only
# yields index arrays, and the same indices are applied to X and y below,
# so features and labels always stay paired.
kfold = KFold(n_splits=5, shuffle=False)
cv_index = kfold.split(X)  # yields one (train indices, test indices) pair per fold

cv_accuracy = []

# Train on each fold's training part, score on its held-out part, and report
for ind, (train_index, test_index) in enumerate(cv_index, start=1):
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

    dt_clf.fit(X_train, y_train)
    # Same value as dt_clf.score(X_test, y_test): mean accuracy on the test fold
    accuracy = accuracy_score(y_test, dt_clf.predict(X_test))
    cv_accuracy.append(accuracy)

    print(f'{ind}번째 Cross Validation 정확도: {accuracy:.2%}')

# Summary: mean accuracy over all folds
print(f'''-------------------------------------------
Cross Validation 정확도 평균: {np.mean(cv_accuracy):.2%}''')

1번째 Cross Validation 정확도: 100.00%
2번째 Cross Validation 정확도: 100.00%
3번째 Cross Validation 정확도: 83.33%
4번째 Cross Validation 정확도: 93.33%
5번째 Cross Validation 정확도: 73.33%
-------------------------------------------
Cross Validation 정확도 평균: 90.00%


### stratified K-fold

In [2]:
# 일반 K_fold와 stratified k_fold 비교

import pandas as pd
from sklearn.model_selection import StratifiedKFold

# Input data: iris features as a DataFrame, with the class label in 'species'
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)
iris_df['species'] = iris.target


def print_fold_label_distribution(cv_splits, title, labels):
    """Print the class-label counts of the training and test part of each fold.

    Parameters
    ----------
    cv_splits : iterable of (train_index, test_index)
        Index pairs as yielded by a cross-validation splitter's ``split()``.
    title : str
        Text placed after "<fold number>번째 " inside the bold header line.
    labels : pandas.Series
        Class labels, in the same row order as the data that was split.
    """
    for ind, (train_index, test_index) in enumerate(cv_splits, start=1):
        # The splitter hands back row positions, so select by position
        # (.iloc); .loc would only work while the index is the default 0..n-1.
        train_label = labels.iloc[train_index]
        test_label = labels.iloc[test_index]
        print(
f'''\033[1m[{ind}번째 {title}]\033[0m
* training set 데이터 분포
{train_label.value_counts()}
* test set 데이터 분포
{test_label.value_counts()}'''
        )


# 3-fold CV without stratification
kfold = KFold(n_splits=3, shuffle=False)
print_fold_label_distribution(
    kfold.split(X), '3-Fold Cross Validation', iris_df['species']
)

print('-'*50)


# Stratified 3-fold CV: split() also receives y so it can balance the classes
skfold = StratifiedKFold(n_splits=3)
print_fold_label_distribution(
    skfold.split(X, y), 'Stratified 3-Fold Cross Validation', iris_df['species']
)

[1m[1번째 3-Fold Cross Validation][0m
* training set 데이터 분포
species
1    50
2    50
Name: count, dtype: int64
* test set 데이터 분포
species
0    50
Name: count, dtype: int64
[1m[2번째 3-Fold Cross Validation][0m
* training set 데이터 분포
species
0    50
2    50
Name: count, dtype: int64
* test set 데이터 분포
species
1    50
Name: count, dtype: int64
[1m[3번째 3-Fold Cross Validation][0m
* training set 데이터 분포
species
0    50
1    50
Name: count, dtype: int64
* test set 데이터 분포
species
2    50
Name: count, dtype: int64
--------------------------------------------------
[1m[1번째 Stratified 3-Fold Cross Validation][0m
* training set 데이터 분포
species
2    34
0    33
1    33
Name: count, dtype: int64
* test set 데이터 분포
species
0    17
1    17
2    16
Name: count, dtype: int64
[1m[2번째 Stratified 3-Fold Cross Validation][0m
* training set 데이터 분포
species
1    34
0    33
2    33
Name: count, dtype: int64
* test set 데이터 분포
species
0    17
2    17
1    16
Name: count, dtype: int64
[1m[3번째 Stratified 3-Fold Cr

### Stratified K-fold에서는 각 fold의 training/test set에 세 클래스가 비교적 균일하게 들어가는 것을 볼 수 있다.

In [3]:
# Fresh classifier with a fixed seed so the run is repeatable
dt_clf = DecisionTreeClassifier(random_state=12345)

# Stratified 5-fold CV: split() is given y as well as X
skfold = StratifiedKFold(n_splits=5, shuffle=False)
cv_index = skfold.split(X, y)

cv_accuracy = []

# Train on each fold's training part, score on its held-out part, and report
for ind, (train_index, test_index) in enumerate(cv_index, start=1):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    dt_clf.fit(X_train, y_train)
    accuracy = dt_clf.score(X_test, y_test)

    print(f'{ind}번째 Cross Validation 정확도: {accuracy:.2%}')
    cv_accuracy.append(accuracy)

# Summary: mean accuracy over all folds
print(f'''-------------------------------------------
Cross Validation 정확도 평균: {np.mean(cv_accuracy):.2%}''')

1번째 Cross Validation 정확도: 96.67%
2번째 Cross Validation 정확도: 96.67%
3번째 Cross Validation 정확도: 90.00%
4번째 Cross Validation 정확도: 96.67%
5번째 Cross Validation 정확도: 100.00%
-------------------------------------------
Cross Validation 정확도 평균: 96.00%


### 클래스 쏠림이 없으므로 일반 K-fold(평균 90.00%, fold별 73.33%~100.00%)보다 정확도 평균이 더 높게 나오며(96.00%), fold 간 정확도 편차도 더 작다(90.00%~100.00%).

`cross_val_score`를 사용하면 for문을 직접 돌리지 않아도 fold별 정확도를 구할 수 있다.