In [1]:
import warnings
warnings.filterwarnings(action='ignore')

In [2]:
# TODO: find the best-performing model by varying the decision tree's parameter values

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Read the wine table from the current working directory.
wine = pd.read_csv("wine.csv")

# Model inputs are three columns; the label is the 'class' column.
feature_columns = ['alcohol', 'sugar', 'pH']
data = wine[feature_columns]
target = wine['class']

# Hold out 20% of the rows as a test set; random_state pins the shuffle so the
# split is the same on every run.
train_input, test_input, train_target, test_target = train_test_split(
    data, target, test_size=0.2, random_state=42
)

In [3]:
from sklearn.model_selection import GridSearchCV

# First pass: search over a single hyperparameter with five candidate values.
params = {
    'min_impurity_decrease': [0.0001, 0.0002, 0.0003, 0.0004, 0.0005],
}

# n_jobs=-1 asks scikit-learn to use all available CPU cores for the search.
gs = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=params,
    n_jobs=-1,
)
gs.fit(train_input, train_target)

In [4]:
# Take the best estimator selected by the search and print its score on the
# training set.
dt = gs.best_estimator_
train_score = dt.score(train_input, train_target)
print(train_score)

0.9615162593804117


In [5]:
# Show the parameter combination the search selected as best.
best_params = gs.best_params_
print(best_params)

{'min_impurity_decrease': 0.0001}


In [6]:
# Show the mean cross-validation score recorded for each candidate, in the
# order the candidates were evaluated.
cv_mean_scores = gs.cv_results_['mean_test_score']
print(cv_mean_scores)

[0.86819297 0.86453617 0.86492226 0.86780891 0.86761605]


In [7]:
# Locate the best parameters by hand: find the candidate with the highest mean
# cross-validation score and print its parameter dict.
import numpy as np

mean_test_scores = gs.cv_results_['mean_test_score']
best_index = np.argmax(mean_test_scores)
candidate_params = gs.cv_results_['params']
print(candidate_params[best_index])

{'min_impurity_decrease': 0.0001}


In [8]:
# Explore a wider grid of hyperparameters.
# NOTE(review): np.arange with a float step is sensitive to rounding; here it
# yields the nine values 0.0001 .. 0.0009 (the stop value 0.001 is excluded).
params = dict(
    min_impurity_decrease=np.arange(0.0001, 0.001, 0.0001),
    max_depth=range(5, 20),               # 5, 6, ..., 19
    min_samples_split=range(2, 100, 10),  # 2, 12, ..., 92
)

In [9]:
# TODO: train with the expanded parameter grid
# Cross-validated grid search over every combination in `params`; n_jobs=-1
# asks scikit-learn to use all available CPU cores.
gs = GridSearchCV(DecisionTreeClassifier(random_state=42), params, n_jobs=-1)
gs.fit(train_input, train_target)

### 랜덤 서치

In [10]:
from scipy.stats import uniform, randint

In [11]:
# Sampling distributions for a range of hyperparameter combinations.
# NOTE(review): scipy's uniform(loc, scale) spans [loc, loc + scale], so the
# first entry samples from [0.0001, 0.0011] -- confirm that upper bound is intended.
# randint(low, high) excludes `high`, e.g. max_depth is drawn from 20..49.
params = dict(
    min_impurity_decrease=uniform(0.0001, 0.001),
    max_depth=randint(20, 50),
    min_samples_split=randint(2, 25),
    min_samples_leaf=randint(1, 25),
)

In [12]:
# TODO: find the parameters with a randomized search
from sklearn.model_selection import RandomizedSearchCV

# Evaluate 100 parameter settings drawn from the distributions in `params`;
# random_state makes the draws repeatable and n_jobs=-1 uses all CPU cores.
gs = RandomizedSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_distributions=params,
    n_iter=100,
    n_jobs=-1,
    random_state=42,
)
gs.fit(train_input, train_target)

In [13]:
# TODO: check performance on the test set
# Score the best estimator from the search on the held-out test split.
dt = gs.best_estimator_
test_score = dt.score(test_input, test_target)

print(test_score)

0.86
