<a href="https://colab.research.google.com/github/ljw-0108/machine-learning-practice/blob/main/250416_cross_validation_hyperparameter_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 교차 검증과 그리드 서치

## 검증 데이터셋

In [11]:
import pandas as pd

wine = pd.read_csv('https://bit.ly/wine-date')

### 문제 1 : wine 데이터 확인

In [12]:
# wine 처음 5개 행 데이터 확인
wine.head()

Unnamed: 0,alcohol,sugar,pH,class
0,9.4,1.9,3.51,0.0
1,9.8,2.6,3.2,0.0
2,9.8,2.3,3.26,0.0
3,9.8,1.9,3.16,0.0
4,9.4,1.9,3.51,0.0


In [13]:
# wine 전체 행의 개수 확인
print(wine.head)

<bound method NDFrame.head of       alcohol  sugar    pH  class
0         9.4    1.9  3.51    0.0
1         9.8    2.6  3.20    0.0
2         9.8    2.3  3.26    0.0
3         9.8    1.9  3.16    0.0
4         9.4    1.9  3.51    0.0
...       ...    ...   ...    ...
6492     11.2    1.6  3.27    1.0
6493      9.6    8.0  3.15    1.0
6494      9.4    1.2  2.99    1.0
6495     12.8    1.1  3.34    1.0
6496     11.8    0.8  3.26    1.0

[6497 rows x 4 columns]>


In [14]:
# wine 데이터 통계값 확인 (각 특성별 평균, 표준편차, 최소값, 최대값 등)
wine.describe()

Unnamed: 0,alcohol,sugar,pH,class
count,6497.0,6497.0,6497.0,6497.0
mean,10.491801,5.443235,3.218501,0.753886
std,1.192712,4.757804,0.160787,0.430779
min,8.0,0.6,2.72,0.0
25%,9.5,1.8,3.11,1.0
50%,10.3,3.0,3.21,1.0
75%,11.3,8.1,3.32,1.0
max,14.9,65.8,4.01,1.0


In [15]:
# 화이트 와인, 레드 와인 데이터 개수 확인
wine['class'].value_counts()

Unnamed: 0_level_0,count
class,Unnamed: 1_level_1
1.0,4898
0.0,1599


### 데이터셋 분류

In [16]:
data = wine[['alcohol', 'sugar', 'pH']].to_numpy()
target = wine['class'].to_numpy()  #타겟값으로 선언

In [17]:
from sklearn.model_selection import train_test_split

train_input, test_input, train_target, test_target = train_test_split(
    data, target, test_size=0.2, random_state=42) #훈련 데이터와 테스트 데이터가 8:2로 섞임

In [18]:
sub_input, val_input, sub_target, val_target = train_test_split(
    train_input, train_target, test_size=0.2, random_state=42)

In [19]:
print(sub_input.shape, val_input.shape) #데이터 갯수 확인

(4157, 3) (1040, 3)


In [20]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(random_state=42)
dt.fit(sub_input, sub_target)

print(dt.score(sub_input, sub_target)) #훈련데이터로 점수 구하기
print(dt.score(val_input, val_target)) #검증데이터로 점수 구하기

0.9971133028626413
0.864423076923077


## 교차 검증

In [21]:
from sklearn.model_selection import cross_validate

scores = cross_validate(dt, train_input, train_target)
print(scores) #dt라는 모델을 넣어 교차검증

{'fit_time': array([0.00861621, 0.00937963, 0.00820422, 0.00808001, 0.00763464]), 'score_time': array([0.00138283, 0.00098395, 0.00096369, 0.00105929, 0.00094533]), 'test_score': array([0.86923077, 0.84615385, 0.87680462, 0.84889317, 0.83541867])}


In [22]:
import numpy as np

print(np.mean(scores['test_score'])) #교차검증에서 나온 데이터로 평균을 구함

0.855300214703487


In [23]:
from sklearn.model_selection import StratifiedKFold

scores = cross_validate(dt, train_input, train_target, cv=StratifiedKFold())
print(np.mean(scores['test_score']))

0.855300214703487


In [24]:
splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_validate(dt, train_input, train_target, cv=splitter)
print(np.mean(scores['test_score']))

0.8574181117533719


## 하이퍼파라미터 튜닝

In [25]:
from sklearn.model_selection import GridSearchCV

params = {'min_impurity_decrease': [0.0001, 0.0002, 0.0003, 0.0004, 0.0005]} #하이퍼 파라미터값 정의

In [26]:
gs = GridSearchCV(DecisionTreeClassifier(random_state=42), params, n_jobs=-1)

In [27]:
gs.fit(train_input, train_target) #그리드서치 학습

In [28]:
dt = gs.best_estimator_
print(dt.score(train_input, train_target)) #최적의 모델을 반환

0.9615162593804117


In [29]:
print(gs.best_params_) #스코어값 출력

{'min_impurity_decrease': 0.0001}


In [30]:
print(gs.cv_results_['mean_test_score'])

[0.86819297 0.86453617 0.86492226 0.86780891 0.86761605]


In [31]:
best_index = np.argmax(gs.cv_results_['mean_test_score'])
print(gs.cv_results_['params'][best_index]) #베스트 인덱스 뽑기

{'min_impurity_decrease': 0.0001}


In [32]:
params = {'min_impurity_decrease': np.arange(0.0001, 0.001, 0.0001), #범위와 사용할 하이퍼파라미터 정의
          'max_depth': range(5, 20, 1),
          'min_samples_split': range(2, 100, 10)
          }#배열로 정의

In [33]:
gs = GridSearchCV(DecisionTreeClassifier(random_state=42), params, n_jobs=-1)
gs.fit(train_input, train_target)

In [34]:
print(gs.best_params_)

{'max_depth': 14, 'min_impurity_decrease': np.float64(0.0004), 'min_samples_split': 12}


In [35]:
print(np.max(gs.cv_results_['mean_test_score']))

0.8683865773302731


In [36]:
# 교차검증 수행 시간 프린트
gs.cv_results_['mean_fit_time']

array([0.01252704, 0.01546211, 0.01644869, ..., 0.0094471 , 0.00820103,
       0.01176538])

### 랜덤 서치

In [37]:
from scipy.stats import uniform, randint

In [38]:
# 균등 분포 샘플링
rgen = randint(0, 10)
rgen.rvs(10)

array([9, 0, 0, 0, 2, 7, 0, 7, 0, 1])

In [39]:
np.unique(rgen.rvs(1000), return_counts=True) # 빈도도 함께 출력

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([ 96, 106,  98,  90,  99,  97, 103, 109,  99, 103]))

In [40]:
ugen = uniform(0, 1)
ugen.rvs(10)

array([0.05712578, 0.2954468 , 0.32911575, 0.34971948, 0.84617387,
       0.44040336, 0.44744231, 0.53970081, 0.93111062, 0.71166188])

In [41]:
params = {'min_impurity_decrease': uniform(0.0001, 0.001), #범위에서만 뽑아 최적의 하이퍼 파라미터 찾기
          'max_depth': randint(20, 50),
          'min_samples_split': randint(2, 25),
          'min_samples_leaf': randint(1, 25),
          }

In [42]:
from sklearn.model_selection import RandomizedSearchCV

rs = RandomizedSearchCV(DecisionTreeClassifier(random_state=42), params,
                        n_iter=100, n_jobs=-1, random_state=42) #100번 샘플링
rs.fit(train_input, train_target)

In [43]:
print(rs.best_params_) #최적의 파라미터 값

{'max_depth': 39, 'min_impurity_decrease': np.float64(0.00034102546602601173), 'min_samples_leaf': 7, 'min_samples_split': 13}


In [44]:
print(np.max(rs.cv_results_['mean_test_score']))

0.8695428296438884


In [45]:
dt = rs.best_estimator_

print(dt.score(test_input, test_target)) #학습시간 구함

0.86


In [46]:
rs.cv_results_['mean_fit_time']

array([0.00684519, 0.00681667, 0.00837679, 0.00937438, 0.00667496,
       0.0081234 , 0.0066813 , 0.00675125, 0.00676699, 0.00689783,
       0.0067358 , 0.00656977, 0.00672798, 0.00710402, 0.00667157,
       0.00722919, 0.00668335, 0.0072021 , 0.00829468, 0.00706811,
       0.0122508 , 0.00679903, 0.00677562, 0.00894761, 0.00713205,
       0.00748096, 0.00713162, 0.00838256, 0.00704556, 0.00667138,
       0.00667982, 0.00706921, 0.00684505, 0.00784993, 0.00785284,
       0.00716038, 0.00690556, 0.0088172 , 0.00697103, 0.00688534,
       0.00788989, 0.00737281, 0.00675554, 0.0083149 , 0.00750141,
       0.00681362, 0.00655918, 0.00739503, 0.00744653, 0.00678382,
       0.00750318, 0.00754004, 0.0067431 , 0.00701528, 0.00659895,
       0.00710092, 0.00682039, 0.00763297, 0.00698032, 0.00784402,
       0.00807085, 0.0068397 , 0.00679822, 0.00794969, 0.00651693,
       0.00877934, 0.00661721, 0.00694666, 0.00878634, 0.00818224,
       0.00662951, 0.00825949, 0.00719085, 0.00663886, 0.00711

In [47]:
print(np.mean(rs.cv_results_['mean_fit_time'])) #학습시간 평균

0.007271088123321533


### 결정트리 분할 옵션 변경

In [48]:
rs2 = RandomizedSearchCV(DecisionTreeClassifier(splitter='random', random_state=42), params,
                        n_iter=100, n_jobs=-1, random_state=42) #랜덤하게 노드 분할
rs2.fit(train_input, train_target)

In [49]:
print(rs2.best_params_)
print(np.max(rs2.cv_results_['mean_test_score']))

dt = rs2.best_estimator_
print(dt.score(test_input, test_target)) #교차 검증의 최적의 파라미터 값

{'max_depth': 43, 'min_impurity_decrease': np.float64(0.00011407982271508446), 'min_samples_leaf': 19, 'min_samples_split': 18}
0.8458726956392981
0.786923076923077


In [50]:
rs2.cv_results_['mean_fit_time']

array([0.00451183, 0.00368581, 0.00381479, 0.00391374, 0.00333476,
       0.00364504, 0.00291138, 0.00285683, 0.00384512, 0.0038785 ,
       0.00321307, 0.00556698, 0.00315275, 0.00707517, 0.00297556,
       0.00319982, 0.00296831, 0.00434551, 0.0032578 , 0.00302048,
       0.00312238, 0.00305662, 0.0030777 , 0.00436759, 0.00284557,
       0.00474949, 0.00292034, 0.00310378, 0.00305543, 0.00305872,
       0.00331321, 0.00316634, 0.00290051, 0.00345898, 0.00343895,
       0.0031734 , 0.00510092, 0.00385361, 0.00315599, 0.00301514,
       0.00303841, 0.00326996, 0.00314398, 0.00377469, 0.00318952,
       0.00317836, 0.00311399, 0.00321164, 0.00352545, 0.00476389,
       0.00455456, 0.00307722, 0.00300293, 0.002878  , 0.00302181,
       0.00319872, 0.00312042, 0.00325651, 0.0031487 , 0.00304151,
       0.003333  , 0.00456481, 0.00302615, 0.00388536, 0.00306783,
       0.00292807, 0.00301085, 0.00322952, 0.0031023 , 0.00302219,
       0.00315943, 0.00310388, 0.00320601, 0.00298076, 0.00465

In [51]:
print(np.mean(rs2.cv_results_['mean_fit_time']))

0.003434837341308594


문제 2 : 위 코드가 기존 랜덤 서치 코드와 다른 점을 2가지 적어보세요.
답:splitter='random'으로 하여 무작위로 분할함,best_estimator_를 변수로 저장