# 교차 검증과 그리드 서치

In [28]:
import pandas as pd
wine = pd.read_csv('https://bit.ly/wine_csv_data')

data = wine[['alcohol', 'sugar', 'pH']]
target = wine['class']

from  sklearn.model_selection import train_test_split

train_input, test_input, train_target, test_target = train_test_split(
    data, target, test_size =0.2, random_state =42
)

# 검증 set 만들기
sub_input, val_input, sub_target, val_target = train_test_split(
    train_input, train_target, test_size = 0.2, random_state=42
)

In [29]:
# 크기 확인
print(train_input.shape, test_input.shape)
print(sub_input.shape, val_input.shape)

(5197, 3) (1300, 3)
(4157, 3) (1040, 3)


In [30]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(random_state=42)
dt.fit(sub_input, sub_target)

print(dt.score(sub_input, sub_target))
print(dt.score(val_input, val_target))

# 과대적합

0.9971133028626413
0.864423076923077


In [31]:
# 교차검증
from sklearn.model_selection import cross_validate

scores = cross_validate(dt, train_input, train_target)
print(scores)

{'fit_time': array([0.01093793, 0.0089817 , 0.00997972, 0.00853562, 0.0074532 ]), 'score_time': array([0.00238395, 0.00367022, 0.00210667, 0.00203085, 0.00381422]), 'test_score': array([0.86923077, 0.84615385, 0.87680462, 0.84889317, 0.83541867])}


In [32]:
import numpy as np
print(np.mean(scores['test_score']))
#검증 테스트 점수 출력

0.855300214703487


In [33]:
from sklearn.model_selection import StratifiedKFold #StratifiedKFold :분류 모델의 타깃 클래스를 골고루 나누기 위해
scores = cross_validate(dt, train_input, train_target, cv=StratifiedKFold())
print(np.mean(scores['test_score']))


0.855300214703487


In [34]:
splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
score = cross_validate(dt, train_input, train_target, cv=splitter)
print(np.mean(scores['test_score']))

0.855300214703487


### 하이퍼 파라미터 튜닝
#### grid search이용하여 하이퍼파라미터 탐색, 교차 검증 한번에 수행
두 매개변수를 동시에 바꿔가며 최적의 값 찾기

In [35]:
from sklearn.model_selection import GridSearchCV
params = {'min_impurity_decrease':[0.0001, 0.0002, 0.0003, 0.0004, 0.0005]}

In [36]:
gs = GridSearchCV(DecisionTreeClassifier(random_state=42), params, n_jobs=-1)
gs.fit(train_input, train_target)
# gs객체의 best_estimator_속성에 저장

0,1,2
,estimator,DecisionTreeC...ndom_state=42)
,param_grid,"{'min_impurity_decrease': [0.0001, 0.0002, ...]}"
,scoring,
,n_jobs,-1
,refit,True
,cv,
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0001


In [37]:
dt = gs.best_estimator_
print(dt.score(train_input, train_target))
# 최적의 매개변수는 best_params_속성에 저장

0.9615162593804117


In [38]:
print(gs.best_params_)
# 0.0001이 최적의 매개변수

{'min_impurity_decrease': 0.0001}


In [39]:
print(gs.cv_results_['mean_test_score']) 
# 인덱스 이용해서 최적값 구하기
print(gs.cv_results_['params'][gs.best_index_])

[0.86819297 0.86453617 0.86492226 0.86780891 0.86761605]
{'min_impurity_decrease': 0.0001}


In [45]:
params = {'min_impurity_decrease':np.arange(0.0001, 0.001, 0.0001),
          'max_depth':range(5,20,1),
          'min_samples_split':range(2, 100, 10)}

In [46]:
gs = GridSearchCV(DecisionTreeClassifier(random_state=42), params, n_jobs=-1)
gs.fit(train_input, train_target)

0,1,2
,estimator,DecisionTreeC...ndom_state=42)
,param_grid,"{'max_depth': range(5, 20), 'min_impurity_decrease': array([0.0001... 0.0009]), 'min_samples_split': range(2, 100, 10)}"
,scoring,
,n_jobs,-1
,refit,True
,cv,
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,14
,min_samples_split,12
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,np.float64(0.0004)


In [48]:
print(gs.best_params_)

{'max_depth': 14, 'min_impurity_decrease': np.float64(0.0004), 'min_samples_split': 12}


In [50]:
print(np.max(gs.cv_results_['mean_test_score']))

0.8683865773302731


### 랜덤 서치
매개변수의 값의 범위나 간격을 미리 정하기 어려울 때


In [51]:
from scipy.stats import uniform, randint
rgen = randint(0, 10)
rgen.rvs(10)
#0~10까지의 정수 10개 뽑기

array([7, 1, 0, 0, 0, 3, 2, 5, 6, 0])

In [53]:
#0~10까지 정수 1000개 뽑을 때, 각 뽑힌 숫자의 개수
np.unique(rgen.rvs(1000), return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([102,  91,  99, 111,  98,  86, 100, 101, 100, 112]))

In [55]:
#실수 추출
ugen = uniform(0,1)
ugen.rvs(10)


array([0.18570397, 0.21715654, 0.05520003, 0.8639741 , 0.11379991,
       0.1219423 , 0.81150847, 0.98805742, 0.57339906, 0.25442582])

In [59]:
#탐색할 매개변수 범위
params = {'min_impurity_decrease' : uniform(0.0001, 0.001),
          'max_depth':randint(20,50),
          'min_samples_split': randint(2,25),
          'min_samples_leaf': randint(1,25),}

In [60]:
from sklearn.model_selection import RandomizedSearchCV
rs = RandomizedSearchCV(DecisionTreeClassifier(random_state=42), params, 
                        n_iter=100, n_jobs=-1, random_state=42)
rs.fit(train_input, train_target)

0,1,2
,estimator,DecisionTreeC...ndom_state=42)
,param_distributions,"{'max_depth': <scipy.stats....x799613797a00>, 'min_impurity_decrease': <scipy.stats....x79961326f970>, 'min_samples_leaf': <scipy.stats....x79961206cac0>, 'min_samples_split': <scipy.stats....x79961326e7d0>}"
,n_iter,100
,scoring,
,n_jobs,-1
,refit,True
,cv,
,verbose,0
,pre_dispatch,'2*n_jobs'
,random_state,42

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,39
,min_samples_split,13
,min_samples_leaf,7
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,np.float64(0....2546602601173)


In [63]:
print(rs.best_estimator_)

DecisionTreeClassifier(max_depth=39,
                       min_impurity_decrease=np.float64(0.00034102546602601173),
                       min_samples_leaf=7, min_samples_split=13,
                       random_state=42)


In [65]:
print(np.max(rs.cv_results_['mean_test_score']))

0.8695428296438884


In [66]:
dt = rs.best_estimator_
print(dt.score(test_input, test_target))

0.86


# 랜덤 포레스트


In [69]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
wine = pd.read_csv('http://bit.ly/wine_csv_data')
data = wine[['alcohol', 'sugar', 'pH']]
target = wine['class']
train_input, test_input, train_target, test_target = train_test_split(
    data, target, test_size = 0.02, random_state=42
)

In [71]:
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_jobs=-1, random_state=42)
scores = cross_validate(rf, train_input, train_target,
                        return_train_score=True, n_jobs =-1)
print(np.mean(scores['train_score']), np.mean(score['test_score']))
# 과대적합

0.9973692306992568 0.8574181117533719


In [None]:
#특성 중요도 
rf.fit(train_input, train_target)
print(rf.feature_importances_)
# sugar특성이 나머지보다 더 높음, 그러나 하나의 특성에 과도하게 집중되었다고 보기는 어려움


[0.2333135  0.49919961 0.26748689]


In [76]:
#oob: 부트스트랩 샘플에 포함되지 않고 남는 샘플
#oob를 이용하여 부트스트랩 샘플로 훈련한 결정 트리 평가
rf = RandomForestClassifier(oob_score=True, n_jobs=-1, random_state=42)
rf.fit(train_input, train_target)
print(rf.oob_score_)

0.9040364378828334


In [79]:
#엑스트라 트리 : 랜덤 포레스트와의 차이) 부트스트랩 샘플을 이용하지 않는다.
from sklearn.ensemble import ExtraTreesClassifier
et = ExtraTreesClassifier(n_jobs=-1, random_state =42)
scores = cross_validate(et, train_input, train_target,
                        return_train_score=True, n_jobs=-1)
print(np.mean(scores['train_score']), np.mean(scores['test_score']))
#랜덤 포레스트와 비슷한 결과

0.9973692306992568 0.8960279984856351


In [81]:
et.fit(train_input, train_target)
print(et.feature_importances_)

[0.19126873 0.52373139 0.28499988]


# 그레이디언트 부스팅 
깊이야 얕은 결정 트리를 사용하여 이전 트리의 오차를 보완하는 방식

In [86]:
from sklearn.ensemble import GradientBoostingClassifier
gb = GradientBoostingClassifier(random_state=42)
scores = cross_validate(gb, train_input, train_target, return_train_score = True, n_jobs=-1)
print(np.mean(scores['train_score']), np.mean(scores['test_score']))

0.8827942013916111 0.8690147132633946


In [92]:
gb = GradientBoostingClassifier(n_estimators=500, learning_rate=0.2,
                                 random_state = 42)
scores = cross_validate(gb, train_input, train_target,
                        return_train_score = True, n_jobs=-1)
print(np.mean(scores['train_score']), np.mean(scores['test_score']))

0.9356054727957132 0.8776516492148856


In [93]:
gb.fit(train_input, train_target)
print(gb.feature_importances_)

[0.15467136 0.68687889 0.15844975]


# 히스토그램 기반 그레이디언트 부스팅
그레디언트 부스팅의 속도와 성능을 개선한 것

In [None]:
from sklearn.ensemble import HistGradientBoostingClassifier
hgb =HistGradientBoostingClassifier(random_state=42)
scores = cross_validate(hgb, train_input, train_target,
                        return_train_score=True)
print(np.mean(scores['train_score']), np.mean(scores['test_score']))
#과대적합을 잘 억제하면서 그레이디언트 부스팅보다 조금 더 높은 성능 제공

0.9253965445693995 0.8756107095687391


In [100]:
#특성 중요도 확인
from sklearn.inspection import permutation_importance
hgb.fit(train_input, train_target)
result = permutation_importance(hgb, train_input, train_target,
                                n_repeats=10, random_state=42, n_jobs=-1)
print(result.importances_mean)

result = permutation_importance(hgb, test_input, test_target,
                                n_repeats = 10, random_state = 42, n_jobs =-1)
print(result.importances_mean)

[0.08708968 0.23604523 0.0784671 ]
[0.04923077 0.24307692 0.04      ]


In [102]:
#테스트 세트에서의 성능
hgb.score(test_input, test_target)

0.8692307692307693

In [110]:
#XGBoost이용, 와인 데이터의 교차 검증 점수 확인
from xgboost import XGBClassifier
xgb = XGBClassifier(tree_method = 'hist', random_state=42)
scores = cross_validate(xgb, train_input, train_target,
                        return_train_score=True, n_jobs=-1)
print(np.mean(scores['train_score']), np.mean(scores['test_score']))

0.9489165980759445 0.8809519287804554


In [112]:
from lightgbm import LGBMClassifier
lgb = LGBMClassifier(random_state=42)
scores = cross_validate(lgb, train_input, train_target,
                        return_train_score = True, n_jobs=-1)
print(np.mean(scores['train_score']), np.mean(scores['test_score']))

[LightGBM] [Info] Number of positive: 3846, number of negative: 1247
[LightGBM] [Info] Number of positive: 3846, number of negative: 1247
[LightGBM] [Info] Number of positive: 3846, number of negative: 1248
[LightGBM] [Info] Number of positive: 3847, number of negative: 1247
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014994 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 386
[LightGBM] [Info] Number of data points in the train set: 5093, number of used features: 3
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.039523 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 382
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.755154 -> initscore=1.126293
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018123 seconds.
You can set `force_row_wise=true` to remove the overhead.
An