In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
import numpy as np
from scipy.stats import uniform, randint

In [2]:
# Load the wine dataset (CSV fetched over the network via a short link).
wine = pd.read_csv('https://bit.ly/wine_csv_data')

# Model inputs are three columns; the 'class' column is the target.
data = wine[['alcohol', 'sugar', 'pH']]
target = wine['class']

# Hold out 20% of the rows as the test set (seeded split).
train_input, test_input, train_target, test_target = train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=42,
)

# Split the training portion once more: 80% sub-train, 20% validation.
sub_input, val_input, sub_target, val_target = train_test_split(
    train_input,
    train_target,
    test_size=0.2,
    random_state=42,
)

# Show the resulting shapes of the sub-train and validation inputs.
print(sub_input.shape, val_input.shape)

(4157, 3) (1040, 3)


In [3]:
# Train a decision tree (default hyperparameters, fixed seed) on the sub-train
# split. fit() hands back the estimator, so construction and training are chained.
dt = DecisionTreeClassifier(random_state=42).fit(sub_input, sub_target)

# Accuracy on the data the tree was trained on, then on the validation split;
# comparing the two numbers shows how much the tree overfits.
print(dt.score(sub_input, sub_target))
print(dt.score(val_input, val_target))

0.9971133028626413
0.864423076923077


In [4]:
# Cross-validate the tree on the whole training set with the default splitter
# (the recorded output holds five per-fold entries).
scores = cross_validate(estimator=dt, X=train_input, y=train_target)
print(scores)

# Average of the per-fold validation accuracies.
print(scores['test_score'].mean())

{'fit_time': array([0.01369357, 0.00920892, 0.00658846, 0.00493431, 0.00462294]), 'score_time': array([0.00320411, 0.00208807, 0.00105286, 0.0011251 , 0.00100279]), 'test_score': array([0.86923077, 0.84615385, 0.87680462, 0.84889317, 0.83541867])}
0.855300214703487


In [5]:
# Repeat the cross-validation with an explicitly constructed StratifiedKFold().
# No shuffling is requested here (shuffle is not set), so the folds are NOT
# re-mixed by this call.
# NOTE(review): the recorded mean is identical to the previous cell's, which
# suggests cross_validate already uses this splitter by default for a
# classifier — confirm against the scikit-learn docs.
scores = cross_validate(dt, train_input, train_target, cv=StratifiedKFold())
print(np.mean(scores['test_score']))

0.855300214703487


In [6]:
# 10-fold stratified cross-validation with shuffling enabled; the seed keeps
# the fold assignment repeatable.
splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_validate(
    estimator=dt,
    X=train_input,
    y=train_target,
    cv=splitter,
)

# Mean validation accuracy over the ten folds.
print(scores['test_score'].mean())

0.8574181117533719


In [None]:
# Hyperparameter grid: parameter name -> list of candidate values to try.
params = {
    'min_impurity_decrease': [0.0001, 0.0002, 0.0003, 0.0004, 0.0005],
}

# Grid search over `params`; n_jobs=-1 requests all available CPU cores.
gs = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=params,
    n_jobs=-1,
)

# Run the search on the full training set.
gs.fit(train_input, train_target)

# Take the estimator built with the best-scoring setting and report its
# accuracy on the training set.
dt = gs.best_estimator_
print(dt.score(train_input, train_target))

0.9615162593804117


In [11]:
# Best hyperparameter setting found by the grid search.
print(gs.best_params_)

# Mean cross-validation score of each candidate setting.
grid_results = gs.cv_results_
print(grid_results['mean_test_score'])

# The same best setting, looked up through the winning candidate's index.
print(grid_results['params'][gs.best_index_])

{'min_impurity_decrease': 0.0001}
[0.86819297 0.86453617 0.86492226 0.86780891 0.86761605]
{'min_impurity_decrease': 0.0001}


In [None]:
# Grid size: 9 (min_impurity_decrease) x 15 (max_depth) x 10 (min_samples_split)
# = 1350 candidate settings; with 5-fold cross-validation that is 6750 model fits.
params = {
    'min_impurity_decrease': np.arange(0.0001, 0.001, 0.0001),
    'max_depth': range(5, 20),
    'min_samples_split': range(2, 100, 10),
}

# Grid search over the full grid, using all available CPU cores.
gs = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=params,
    n_jobs=-1,
)
gs.fit(train_input, train_target)

# Best setting among all candidates.
print(gs.best_params_)

# Its mean cross-validation accuracy (the highest mean score in the results).
print(gs.cv_results_['mean_test_score'].max())

{'max_depth': 14, 'min_impurity_decrease': np.float64(0.0004), 'min_samples_split': 12}
0.8683865773302731


In [19]:
# Integer sampling demo: randint(0, 10) draws integers from 0 through 9
# (the upper bound is exclusive).
rgen = randint(0, 10)

# Seed the draw, as every other stochastic step in this notebook does with
# random_state=42, so that re-running the notebook shows the same ten values.
int_samples = rgen.rvs(10, random_state=42)
int_samples

array([4, 9, 0, 1, 1, 7, 2, 2, 4, 6])

In [20]:
# Real-valued sampling demo: uniform(0, 1) is uniform(loc=0, scale=1), i.e.
# floats drawn from the interval [0, 1].
ugen = uniform(0, 1)

# Seed the draw, as every other stochastic step in this notebook does with
# random_state=42, so that re-running the notebook shows the same ten values.
float_samples = ugen.rvs(10, random_state=42)
float_samples

array([0.83211161, 0.1849398 , 0.199189  , 0.38806451, 0.45587391,
       0.88244616, 0.22494827, 0.52197575, 0.00683399, 0.51414003])

In [31]:
# Integer sampling with replacement: draw 1000 values from rgen (the integer
# generator built with randint earlier in the notebook) and count how many
# times each distinct value occurs.
np.unique(rgen.rvs(1000), return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([ 89, 105,  94, 107,  91, 102,  94, 115, 102, 101]))

In [None]:
# Random search: each hyperparameter is sampled from a distribution rather
# than taken from a fixed list.
# NOTE: scipy's uniform(loc, scale) spans [loc, loc + scale], so
# min_impurity_decrease is drawn from [0.0001, 0.0011]; randint(low, high)
# excludes `high`.
params = {
    'min_impurity_decrease': uniform(0.0001, 0.001),
    'max_depth': randint(20, 50),
    'min_samples_split': randint(2, 25),
    'min_samples_leaf': randint(1, 25),
}

# Try 100 sampled settings, using all CPU cores; the seed fixes which
# settings get sampled.
rs = RandomizedSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_distributions=params,
    n_iter=100,
    n_jobs=-1,
    random_state=42,
)
rs.fit(train_input, train_target)

# Best sampled setting.
print(rs.best_params_)

# Its mean cross-validation accuracy (the highest mean score in the results).
print(rs.cv_results_['mean_test_score'].max())

{'max_depth': 39, 'min_impurity_decrease': np.float64(0.00034102546602601173), 'min_samples_leaf': 7, 'min_samples_split': 13}
0.8695428296438884


In [35]:
# Score the random-search winner on the held-out test set.
dt = rs.best_estimator_

test_accuracy = dt.score(test_input, test_target)
print(test_accuracy)

0.86


In [None]:
# === Exercise ===

# Repeat the random search (100 sampled settings) with a tree that uses
# splitter='random'. This reuses the `params` distributions from the earlier
# random-search cell and rebinds `gs`, which previously held a GridSearchCV.
gs = RandomizedSearchCV(
    estimator=DecisionTreeClassifier(splitter='random', random_state=42),
    param_distributions=params,
    n_iter=100,
    n_jobs=-1,
    random_state=42,
)
gs.fit(train_input, train_target)

# Best sampled setting.
print(gs.best_params_)
# Highest mean cross-validation accuracy among the sampled settings.
print(gs.cv_results_['mean_test_score'].max())

# Test-set accuracy of the best estimator from this search (rebinds `dt`).
dt = gs.best_estimator_
print(dt.score(test_input, test_target))

{'max_depth': 43, 'min_impurity_decrease': np.float64(0.00011407982271508446), 'min_samples_leaf': 19, 'min_samples_split': 18}
0.8458726956392981
0.786923076923077
