## 의사결정 나무 실습


### 데이터 불러오기 및 분할하기


In [37]:
from sklearn import datasets
import numpy as np
data = datasets.load_breast_cancer()

x = data['data']
y = data['target']

print(x.shape, y.shape)

(569, 30) (569,)


In [38]:
from sklearn.model_selection import train_test_split

# 데이터 나누기 - 6:2:2 비율
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.25, random_state=1)

print(x_train.shape, x_val.shape, x_test.shape)

(341, 30) (114, 30) (114, 30)


### 모델 학습 및 평가하기

Scikit-learn의 [DecisionTreeClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html?highlight=decisiontree#sklearn.tree.DecisionTreeClassifier) 모델을 사용하면 수업 시간에 다루었던 의사결정 나무 모델을 만들어 볼 수 있다.

In [39]:
from sklearn.tree import DecisionTreeClassifier

In [40]:
# 의사결정 나무 모델 학습하기

tree = DecisionTreeClassifier()
tree.fit(x_train, y_train)


DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [41]:
# To-Do : Training accuracy 및 confusion matrix
from sklearn.metrics import accuracy_score, confusion_matrix

pred_train = tree.predict(x_train)
print("Train Accuracy : {}".format(accuracy_score(y_train, pred_train)))
confusion_matrix(y_train, pred_train)

Train Accuracy : 1.0


array([[130,   0],
       [  0, 211]])

In [42]:
# To-Do : Validation accuracy 및 confusion matrix
pred_val = tree.predict(x_val)
print("validation Accuracy : {}".format(accuracy_score(y_val, pred_val)))
confusion_matrix(y_val, pred_val)

validation Accuracy : 0.9035087719298246


array([[34,  6],
       [ 5, 69]])

In [43]:
# To-Do : 보다 좋은 의사결정 나무 모델 만들기

pruned_tree = DecisionTreeClassifier(max_depth=4, min_samples_leaf=4)
pruned_tree.fit(x_train, y_train)
pruned_tree.score(x_train, y_train)

0.9824046920821115

In [44]:
pred_prunned_train = pruned_tree.predict(x_train)
print("Train Accuracy : {}".format(accuracy_score(y_train, pred_prunned_train)))
confusion_matrix(y_train, pred_prunned_train)

Train Accuracy : 0.9824046920821115


array([[129,   1],
       [  5, 206]])

In [45]:
pred_prunned_valid = pruned_tree.predict(x_val)
print("validation Accuracy : {}".format(accuracy_score(y_val, pred_prunned_valid)))
confusion_matrix(y_val, pred_prunned_valid)

validation Accuracy : 0.9649122807017544


array([[38,  2],
       [ 2, 72]])