### train_test_split()

In [1]:
# Import scikit-learn and display the installed version string.
import sklearn
sklearn.__version__

'1.2.1'

In [6]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Load the iris dataset and hold out 30% of the samples for testing.
# random_state pins the shuffle so the same split is produced on every run.
iris = load_iris()
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=121)

# Build the model. The tree is seeded as well: scikit-learn permutes the
# candidate features at every split, so equally good splits can be resolved
# differently (and the score can change) between runs unless random_state
# is fixed.
dt_clf = DecisionTreeClassifier(random_state=256)

# Fit on the training split, then predict labels for the held-out split.
dt_clf.fit(X_train, y_train)
pred = dt_clf.predict(X_test)

# Accuracy on the held-out split.
print("predict accuracy : {0:.3f}".format(accuracy_score(y_test, pred)))


predict accuracy : 0.956


#### 넘파이 ndarray뿐만 아니라 판다스 DataFrame/Series도 train_test_split()으로 분할 가능

In [15]:
import pandas as pd

# Same experiment with pandas inputs: the features go in as a DataFrame and
# the labels as a Series instead of NumPy arrays.
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)
iris_df['target'] = iris.target

# The label is the last column; every other column is a feature.
y = iris_df.iloc[:, -1]
X = iris_df.iloc[:, :-1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=121)

# Build and fit the model. The tree is seeded so the fit is deterministic:
# without random_state, equally good splits can be resolved differently
# between runs and the printed accuracy can change.
dt_clf = DecisionTreeClassifier(random_state=256)
dt_clf.fit(X_train, y_train)
pred = dt_clf.predict(X_test)

# Accuracy on the held-out split.
print('predict accuracy: {0:.3f}'.format(accuracy_score(y_test, pred)))

predict accuracy: 0.956


### cross validation

In [2]:
from sklearn.model_selection import KFold
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

# Load iris and keep the feature matrix and the label vector under their own names.
iris = load_iris()
iris_data, iris_target = iris.data, iris.target

# 5-fold splitter; no shuffle argument is passed, so KFold's default is used.
kf = KFold(n_splits=5)

# Decision tree with a fixed seed so repeated fits give the same model.
dt_clf = DecisionTreeClassifier(random_state=256)

In [5]:
# A closer look at kf.split: the data is divided into 5 folds, and each
# iteration yields a pair of index arrays (training rows, validation rows).
for train_idx, val_idx in kf.split(iris_data):
    print(train_idx, val_idx)

[ 30  31  32  33  34  35  36  37  38  39  40  41  42  43  44  45  46  47
  48  49  50  51  52  53  54  55  56  57  58  59  60  61  62  63  64  65
  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80  81  82  83
  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 101
 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119
 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137
 138 139 140 141 142 143 144 145 146 147 148 149] [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29]
[  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  60  61  62  63  64  65
  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80  81  82  83
  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 101
 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119
 120 121 122 123 124 125 126 127 128 129 130 131 132 1

In [9]:
import numpy as np
# Accuracy of every fold, in the order kf.split yields the folds.
cv_accuracys = []

# kf.split yields index arrays (not data): training rows, then validation rows.
for train_idx, val_idx in kf.split(iris_data):
    # Select this fold's rows from the feature matrix and the label vector.
    X_train, y_train = iris_data[train_idx], iris_target[train_idx]
    X_val, y_val = iris_data[val_idx], iris_target[val_idx]

    # Refit on the training rows, then score on the held-out rows.
    dt_clf.fit(X_train, y_train)
    pred = dt_clf.predict(X_val)

    # Each fold's score is stored rounded to 4 decimal places.
    accuracy = np.round(accuracy_score(y_val, pred), 4)
    cv_accuracys.append(accuracy)

print("Each accuracy", cv_accuracys)
print("Average accuracy :{0:.3f}".format(np.mean(cv_accuracys)))

Each accuracy [1.0, 1.0, 0.8333, 0.9333, 0.7667]
Average accuracy :0.907


### Stratified K-Fold

In [21]:
# Compare plain KFold with stratified K-fold. This cell prints the label
# distribution that plain KFold produces with 3 splits.
import pandas as pd
kf = KFold(n_splits=3)

# Put features and labels in one DataFrame so the labels can be counted per fold.
iris_df = pd.DataFrame(iris_data, columns=iris.feature_names)
iris_df["target"] = iris_target

# The fold counter is named fold_idx rather than iter so that the built-in
# iter() is not rebound in the notebook namespace.
for fold_idx, (train_idx, test_idx) in enumerate(kf.split(iris_df)):
    label_train, label_test = iris_df['target'].iloc[train_idx], iris_df['target'].iloc[test_idx]
    print("{}번째 fold".format(fold_idx))
    # Class counts in the training part, then in the held-out part.
    print(label_train.value_counts())
    print(label_test.value_counts())

0번째 fold
1    50
2    50
Name: target, dtype: int64
0    50
Name: target, dtype: int64
1번째 fold
0    50
2    50
Name: target, dtype: int64
1    50
Name: target, dtype: int64
2번째 fold
0    50
1    50
Name: target, dtype: int64
2    50
Name: target, dtype: int64


In [24]:
from sklearn.model_selection import StratifiedKFold
# 3-fold stratified splitter. Unlike KFold.split, its split() is given the
# labels as a second argument.
str_fold = StratifiedKFold(n_splits=3)

# The fold counter is named fold_idx rather than iter so that the built-in
# iter() is not rebound in the notebook namespace.
for fold_idx, (train_idx, test_idx) in enumerate(str_fold.split(iris_df, iris_df['target'])):
    label_train, label_test = iris_df['target'].iloc[train_idx], iris_df['target'].iloc[test_idx]
    print("{}번째 fold".format(fold_idx))
    # Class counts in the training part, then in the held-out part.
    print(label_train.value_counts())
    print(label_test.value_counts())

0번째 fold
2    34
0    33
1    33
Name: target, dtype: int64
0    17
1    17
2    16
Name: target, dtype: int64
1번째 fold
1    34
0    33
2    33
Name: target, dtype: int64
0    17
2    17
1    16
Name: target, dtype: int64
2번째 fold
0    34
1    33
2    33
Name: target, dtype: int64
1    17
2    17
0    16
Name: target, dtype: int64
