<a href="https://colab.research.google.com/github/ljw-0108/machine-learning-practice/blob/main/250416_cross_validation_hyperparameter_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 교차 검증과 그리드 서치

## 검증 데이터셋

In [1]:
import pandas as pd

wine = pd.read_csv('https://bit.ly/wine-date')

### 문제 1 : wine 데이터 확인

In [2]:
# wine 처음 5개 행 데이터 확인
wine.head()

Unnamed: 0,alcohol,sugar,pH,class
0,9.4,1.9,3.51,0.0
1,9.8,2.6,3.2,0.0
2,9.8,2.3,3.26,0.0
3,9.8,1.9,3.16,0.0
4,9.4,1.9,3.51,0.0


In [3]:
# wine 전체 행의 개수 확인
print(wine.head)

<bound method NDFrame.head of       alcohol  sugar    pH  class
0         9.4    1.9  3.51    0.0
1         9.8    2.6  3.20    0.0
2         9.8    2.3  3.26    0.0
3         9.8    1.9  3.16    0.0
4         9.4    1.9  3.51    0.0
...       ...    ...   ...    ...
6492     11.2    1.6  3.27    1.0
6493      9.6    8.0  3.15    1.0
6494      9.4    1.2  2.99    1.0
6495     12.8    1.1  3.34    1.0
6496     11.8    0.8  3.26    1.0

[6497 rows x 4 columns]>


In [4]:
# wine 데이터 통계값 확인 (각 특성별 평균, 표준편차, 최소값, 최대값 등)
wine.describe()

Unnamed: 0,alcohol,sugar,pH,class
count,6497.0,6497.0,6497.0,6497.0
mean,10.491801,5.443235,3.218501,0.753886
std,1.192712,4.757804,0.160787,0.430779
min,8.0,0.6,2.72,0.0
25%,9.5,1.8,3.11,1.0
50%,10.3,3.0,3.21,1.0
75%,11.3,8.1,3.32,1.0
max,14.9,65.8,4.01,1.0


In [5]:
# 화이트 와인, 레드 와인 데이터 개수 확인
wine['class'].value_counts()

Unnamed: 0_level_0,count
class,Unnamed: 1_level_1
1.0,4898
0.0,1599


### 데이터셋 분류

In [6]:
data = wine[['alcohol', 'sugar', 'pH']].to_numpy()
target = wine['class'].to_numpy()  #타겟값으로 선언

In [7]:
from sklearn.model_selection import train_test_split

train_input, test_input, train_target, test_target = train_test_split(
    data, target, test_size=0.2, random_state=42) #훈련 데이터와 테스트 데이터가 8:2로 섞임

In [8]:
sub_input, val_input, sub_target, val_target = train_test_split(
    train_input, train_target, test_size=0.2, random_state=42)

In [9]:
print(sub_input.shape, val_input.shape) #데이터 갯수 확인

(4157, 3) (1040, 3)


In [10]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(random_state=42)
dt.fit(sub_input, sub_target)

print(dt.score(sub_input, sub_target)) #훈련데이터로 점수 구하기
print(dt.score(val_input, val_target)) #검증데이터로 점수 구하기

0.9971133028626413
0.864423076923077


## 교차 검증

In [11]:
from sklearn.model_selection import cross_validate

scores = cross_validate(dt, train_input, train_target)
print(scores) #dt라는 모델을 넣어 교차검증

{'fit_time': array([0.0410378 , 0.03913665, 0.04195142, 0.03195882, 0.04562283]), 'score_time': array([0.00452805, 0.01315331, 0.00958586, 0.00983858, 0.00318909]), 'test_score': array([0.86923077, 0.84615385, 0.87680462, 0.84889317, 0.83541867])}


In [12]:
import numpy as np

print(np.mean(scores['test_score'])) #교차검증에서 나온 데이터로 평균을 구함

0.855300214703487


In [13]:
from sklearn.model_selection import StratifiedKFold

scores = cross_validate(dt, train_input, train_target, cv=StratifiedKFold())
print(np.mean(scores['test_score']))

0.855300214703487


In [14]:
splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_validate(dt, train_input, train_target, cv=splitter)
print(np.mean(scores['test_score']))

0.8574181117533719


## 하이퍼파라미터 튜닝

In [15]:
from sklearn.model_selection import GridSearchCV

params = {'min_impurity_decrease': [0.0001, 0.0002, 0.0003, 0.0004, 0.0005]} #하이퍼 파라미터값 정의

In [16]:
gs = GridSearchCV(DecisionTreeClassifier(random_state=42), params, n_jobs=-1)

In [17]:
gs.fit(train_input, train_target) #그리드서치 학습

In [18]:
dt = gs.best_estimator_
print(dt.score(train_input, train_target)) #최적의 모델을 반환

0.9615162593804117


In [19]:
print(gs.best_params_) #스코어값 출력

{'min_impurity_decrease': 0.0001}


In [20]:
print(gs.cv_results_['mean_test_score'])

[0.86819297 0.86453617 0.86492226 0.86780891 0.86761605]


In [21]:
best_index = np.argmax(gs.cv_results_['mean_test_score'])
print(gs.cv_results_['params'][best_index]) #베스트 인덱스 뽑기

{'min_impurity_decrease': 0.0001}


In [22]:
params = {'min_impurity_decrease': np.arange(0.0001, 0.001, 0.0001), #범위와 사용할 하이퍼파라미터 정의
          'max_depth': range(5, 20, 1),
          'min_samples_split': range(2, 100, 10)
          }#배열로 정의

In [23]:
gs = GridSearchCV(DecisionTreeClassifier(random_state=42), params, n_jobs=-1)
gs.fit(train_input, train_target)

In [24]:
print(gs.best_params_)

{'max_depth': 14, 'min_impurity_decrease': np.float64(0.0004), 'min_samples_split': 12}


In [25]:
print(np.max(gs.cv_results_['mean_test_score']))

0.8683865773302731


In [26]:
# 교차검증 수행 시간 프린트
gs.cv_results_['mean_fit_time']

array([0.01898284, 0.03387661, 0.04196925, ..., 0.00702009, 0.00702171,
       0.00693207])

### 랜덤 서치

In [27]:
from scipy.stats import uniform, randint

In [28]:
# 균등 분포 샘플링
rgen = randint(0, 10)
rgen.rvs(10)

array([1, 5, 2, 4, 3, 4, 5, 1, 3, 4])

In [29]:
np.unique(rgen.rvs(1000), return_counts=True) # 빈도도 함께 출력

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([106,  98,  98,  93,  85,  97,  98, 109,  88, 128]))

In [30]:
ugen = uniform(0, 1)
ugen.rvs(10)

array([0.97735454, 0.5841985 , 0.05537276, 0.70022008, 0.59418485,
       0.96908276, 0.39967614, 0.9866863 , 0.63936469, 0.17763533])

In [31]:
params = {'min_impurity_decrease': uniform(0.0001, 0.001), #범위에서만 뽑아 최적의 하이퍼 파라미터 찾기
          'max_depth': randint(20, 50),
          'min_samples_split': randint(2, 25),
          'min_samples_leaf': randint(1, 25),
          }

In [32]:
from sklearn.model_selection import RandomizedSearchCV

rs = RandomizedSearchCV(DecisionTreeClassifier(random_state=42), params,
                        n_iter=100, n_jobs=-1, random_state=42) #100번 샘플링
rs.fit(train_input, train_target)

In [33]:
print(rs.best_params_) #최적의 파라미터 값

{'max_depth': 39, 'min_impurity_decrease': np.float64(0.00034102546602601173), 'min_samples_leaf': 7, 'min_samples_split': 13}


In [34]:
print(np.max(rs.cv_results_['mean_test_score']))

0.8695428296438884


In [35]:
dt = rs.best_estimator_

print(dt.score(test_input, test_target)) #학습시간 구함

0.86


In [36]:
rs.cv_results_['mean_fit_time']

array([0.00718288, 0.00725112, 0.00751333, 0.00808396, 0.00793772,
       0.01123257, 0.00792418, 0.00788679, 0.0080843 , 0.00738297,
       0.00707812, 0.00680175, 0.00776691, 0.00742445, 0.00694919,
       0.00805874, 0.00711012, 0.0072845 , 0.00836906, 0.00709119,
       0.00811796, 0.00700645, 0.00721598, 0.00748019, 0.00962243,
       0.00855379, 0.00724268, 0.00768933, 0.00762916, 0.00743833,
       0.00819149, 0.00735898, 0.00729146, 0.00798163, 0.00852151,
       0.00803413, 0.007443  , 0.01030612, 0.00707006, 0.00718575,
       0.00737543, 0.00830712, 0.00688167, 0.00817299, 0.00856266,
       0.00772033, 0.00732241, 0.00805006, 0.01342735, 0.00770764,
       0.00739217, 0.00807314, 0.00686908, 0.00668616, 0.00697932,
       0.00827785, 0.0070271 , 0.00780144, 0.00706711, 0.0114512 ,
       0.00813417, 0.00951285, 0.01091566, 0.0112196 , 0.01044469,
       0.00866566, 0.00924954, 0.01504622, 0.0119277 , 0.00882845,
       0.01558819, 0.01593366, 0.01311646, 0.01554151, 0.01605

In [37]:
print(np.mean(rs.cv_results_['mean_fit_time'])) #학습시간 평균

0.010180994033813477


### 결정트리 분할 옵션 변경

In [38]:
rs2 = RandomizedSearchCV(DecisionTreeClassifier(splitter='random', random_state=42), params,
                        n_iter=100, n_jobs=-1, random_state=42) #랜덤하게 노드 분할
rs2.fit(train_input, train_target)

In [39]:
print(rs2.best_params_)
print(np.max(rs2.cv_results_['mean_test_score']))

dt = rs2.best_estimator_
print(dt.score(test_input, test_target)) #교차 검증의 최적의 파라미터 값

{'max_depth': 43, 'min_impurity_decrease': np.float64(0.00011407982271508446), 'min_samples_leaf': 19, 'min_samples_split': 18}
0.8458726956392981
0.786923076923077


In [40]:
rs2.cv_results_['mean_fit_time']

array([0.00505524, 0.00658507, 0.00634198, 0.00687375, 0.00474558,
       0.0112874 , 0.00595412, 0.00912304, 0.00871053, 0.00702319,
       0.00560813, 0.00772281, 0.00647998, 0.007828  , 0.00705109,
       0.00712109, 0.00645905, 0.00576458, 0.00822906, 0.00973148,
       0.00701313, 0.00599117, 0.00577703, 0.00510793, 0.00784502,
       0.0081039 , 0.00781603, 0.00880747, 0.00458932, 0.00529628,
       0.00469942, 0.00577312, 0.00737796, 0.01265521, 0.00875897,
       0.00363111, 0.00544243, 0.0078383 , 0.00781555, 0.00778656,
       0.00803099, 0.01104426, 0.00703721, 0.00591445, 0.00716057,
       0.00627761, 0.00579448, 0.00885463, 0.00470543, 0.00402341,
       0.00416193, 0.00393472, 0.00359011, 0.00340047, 0.00427675,
       0.00373106, 0.00372305, 0.00386744, 0.00398111, 0.00426102,
       0.00391774, 0.003406  , 0.00351729, 0.00429487, 0.00342159,
       0.00366621, 0.0034245 , 0.00349998, 0.00356307, 0.00432563,
       0.00343108, 0.00337276, 0.00400229, 0.00340495, 0.00373

In [41]:
print(np.mean(rs2.cv_results_['mean_fit_time']))

0.0054708018302917485


문제 2 : 위 코드가 기존 랜덤 서치 코드와 다른 점을 2가지 적어보세요.
답:splitter='random'으로 하여 무작위로 분할함,best_estimator_를 변수로 저장