<a href="https://colab.research.google.com/github/msmsm104/TIL/blob/main/20220602/06_decision_tree_0602.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 의사결정나무 : Decision Tree

In [None]:
%config InlineBackend.figure_formats = {'png', 'retina'}

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the play-tennis CSV from the relative ./data directory and display the frame.
tennis_df = pd.read_csv('./data/play_tennis.csv')
tennis_df

Unnamed: 0,day,outlook,temp,humidity,wind,play
0,D1,Sunny,Hot,High,Weak,No
1,D2,Sunny,Hot,High,Strong,No
2,D3,Overcast,Hot,High,Weak,Yes
3,D4,Rain,Mild,High,Weak,Yes
4,D5,Rain,Cool,Normal,Weak,Yes
5,D6,Rain,Cool,Normal,Strong,No
6,D7,Overcast,Cool,Normal,Strong,Yes
7,D8,Sunny,Mild,High,Weak,No
8,D9,Sunny,Cool,Normal,Weak,Yes
9,D10,Rain,Mild,Normal,Weak,Yes


In [None]:
# Number of distinct classes in the outlook feature (3).
# dropna=False keeps the count equal to len(unique()), which also counts NaN.
tennis_df['outlook'].nunique(dropna=False)

3

In [None]:
# Last five labels after sorting: a single-class subset, i.e. the entropy-0 case.
tennis_df['play'].sort_values().tail(5)

8     Yes
9     Yes
10    Yes
11    Yes
12    Yes
Name: play, dtype: object

In [None]:
# The first ten sorted labels give a 50/50 mix of 'No' and 'Yes'
# (the maximum-entropy case for two classes).
entropy_3 = np.sort(tennis_df["play"])[:10]
# Shuffle in place with a seeded generator so the displayed order is the same
# on every Restart & Run All (the global np.random.shuffle was unseeded).
np.random.default_rng(seed=0).shuffle(entropy_3)
entropy_3  # case where the entropy is 1

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'No'],
      dtype=object)

---

 - 전체 데이터의 Entropy

In [None]:
# Distinct labels with their frequencies: 'Yes' accounts for 9 of the 14 rows.
np.unique(tennis_df['play'].to_numpy(), return_counts=True)

(array(['No', 'Yes'], dtype=object), array([5, 9]))

In [None]:
# Entropy of the whole dataset before any split.
# 9 of the 14 rows are labelled 'Yes'; the remainder is the 'No' share.
p_yes = 9 / 14
p_no = 1 - p_yes
before_entropy = -p_yes * np.log2(p_yes) - p_no * np.log2(p_no)
before_entropy

0.9402859586706311

In [None]:
## Split on outlook: entropy of each resulting branch


def binary_entropy(p):
    """Return the Shannon entropy (in bits) of a two-class distribution.

    Parameters
    ----------
    p : float
        Probability of one of the two classes; must lie in [0, 1].

    Returns
    -------
    float
        -p*log2(p) - (1-p)*log2(1-p), with the limiting value 0.0 at p == 0
        and p == 1, where evaluating the formula directly would yield NaN.

    Raises
    ------
    ValueError
        If p is outside [0, 1].
    """
    p = float(p)
    if not 0.0 <= p <= 1.0:
        raise ValueError(f"p must be within [0, 1], got {p}")
    # A pure node carries no uncertainty; return early to avoid 0 * log2(0).
    if p in (0.0, 1.0):
        return 0.0
    return -p * np.log2(p) - (1 - p) * np.log2(1 - p)


# Pure branch: every row has the same label, so its entropy is 0.
after_entropy_1 = binary_entropy(1.0)
# Mixed branches: class probabilities 0.4 and 0.6 respectively.
after_entropy_2 = binary_entropy(0.4)
after_entropy_3 = binary_entropy(0.6)
after_entropy_1, after_entropy_2, after_entropy_3

(0, 0.9709505944546686, 0.9709505944546686)

In [None]:
# Weighted mean of the branch entropies; the weights are the branch sizes (4, 5, 5).
branch_sizes = (4, 5, 5)
branch_entropies = (after_entropy_1, after_entropy_2, after_entropy_3)
after_entropy = (
    sum(size * ent for size, ent in zip(branch_sizes, branch_entropies))
    / sum(branch_sizes)
)
after_entropy

0.6935361388961919

In [None]:
# Information gain: entropy before the split minus the weighted entropy after it.
information_gain = before_entropy - after_entropy
information_gain

## Consequently, if the outlook feature is chosen as the root node for the split,
## the information gain is 0.2467.

0.24674981977443922

## Decision Tree 예시

In [None]:
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Load scikit-learn's bundled iris dataset.
iris = datasets.load_iris()

In [None]:
# Inspect what the loaded object exposes: its keys, the feature names and the class names.
iris.keys(), iris.feature_names, iris.target_names

(dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module']),
 ['sepal length (cm)',
  'sepal width (cm)',
  'petal length (cm)',
  'petal width (cm)'],
 array(['setosa', 'versicolor', 'virginica'], dtype='<U10'))

In [None]:
# Append the target vector as a final column next to the feature matrix.
datas = np.column_stack((iris.data, iris.target))
datas.shape

(150, 5)

In [None]:
# Column headers: the feature names followed by a 'label' column for the target.
columns = [*iris.feature_names, 'label']
columns

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)',
 'label']

In [None]:
# `datas` is a single float array, so the class ids arrive as 0.0 / 1.0 / 2.0.
# Cast the label column back to integers so the target (and the model's
# predictions) are proper class ids rather than floats.
df = pd.DataFrame(data=datas, columns=columns).astype({'label': int})
df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),label
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2.0
146,6.3,2.5,5.0,1.9,2.0
147,6.5,3.0,5.2,2.0,2.0
148,6.2,3.4,5.4,2.3,2.0


In [None]:
# Separate the frame into the feature matrix (every column except 'label')
# and the target vector (the 'label' column).
X = df.loc[:, df.columns != 'label']
y = df.loc[:, 'label']

In [None]:
# Fit a depth-2 decision tree; random_state is pinned so the fit is repeatable.
# fit() returns the estimator itself, so construction and fitting can be chained.
model = DecisionTreeClassifier(max_depth=2, random_state=1).fit(X, y)
model

DecisionTreeClassifier(max_depth=2, random_state=1)

In [None]:
from sklearn.metrics import confusion_matrix, f1_score

## Because the dataset is small, only performance on the training data is evaluated.
pred = model.predict(X)
# Rows are the true classes, columns are the predicted classes.
confusion_matrix(y_true=y, y_pred=pred)

array([[50,  0,  0],
       [ 0, 49,  1],
       [ 0,  5, 45]])

In [None]:
# Accuracy measured on the same X, y the model was fitted on (training accuracy).
model.score(X, y)

0.96

In [None]:
# Show the true labels next to the predictions for a visual comparison.
y, pred

(0      0.0
 1      0.0
 2      0.0
 3      0.0
 4      0.0
       ... 
 145    2.0
 146    2.0
 147    2.0
 148    2.0
 149    2.0
 Name: label, Length: 150, dtype: float64,
 array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2.,
        2., 2., 2., 2., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
        1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 2., 2., 2., 1., 1., 2.,
        2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.]))

In [None]:
# average=None yields one F1 score per class instead of a single averaged value.
f1_score(y, pred, average=None)

array([1.        , 0.94230769, 0.9375    ])