In [33]:
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn import tree
from sklearn.tree import export_text
from id3 import Id3Estimator
from id3 import export_text as id3export_text

# Iris Datasets

In [34]:
# Load the bundled Iris dataset.
# Feature columns: sepal length, sepal width, petal length, petal width.
# Target classes: setosa, versicolor, virginica (encoded as integers in y).
iris = datasets.load_iris()
X, y = iris.data, iris.target

# Show the class names first, then the raw integer labels.
for summary in (iris.target_names, y):
    print(summary)

['setosa' 'versicolor' 'virginica']
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]


## Decision Tree Classifier

In [35]:
# Fit a scikit-learn decision tree on the full Iris data and print its rules.
# random_state is pinned because scikit-learn randomly permutes the candidate
# features at every split; without a fixed seed, ties between equally good
# splits can be broken differently from run to run, so the printed tree
# would not be reproducible under Restart & Run All.
clf = tree.DecisionTreeClassifier(random_state=0)
clf = clf.fit(X, y)

# export_text renders the fitted tree as indented if/else rules.
r = export_text(clf, feature_names=iris.feature_names)
print(r)

|--- petal width (cm) <= 0.80
|   |--- class: 0
|--- petal width (cm) >  0.80
|   |--- petal width (cm) <= 1.75
|   |   |--- petal length (cm) <= 4.95
|   |   |   |--- petal width (cm) <= 1.65
|   |   |   |   |--- class: 1
|   |   |   |--- petal width (cm) >  1.65
|   |   |   |   |--- class: 2
|   |   |--- petal length (cm) >  4.95
|   |   |   |--- petal width (cm) <= 1.55
|   |   |   |   |--- class: 2
|   |   |   |--- petal width (cm) >  1.55
|   |   |   |   |--- petal length (cm) <= 5.45
|   |   |   |   |   |--- class: 1
|   |   |   |   |--- petal length (cm) >  5.45
|   |   |   |   |   |--- class: 2
|   |--- petal width (cm) >  1.75
|   |   |--- petal length (cm) <= 4.85
|   |   |   |--- sepal width (cm) <= 3.10
|   |   |   |   |--- class: 2
|   |   |   |--- sepal width (cm) >  3.10
|   |   |   |   |--- class: 1
|   |   |--- petal length (cm) >  4.85
|   |   |   |--- class: 2



## Decision Tree id3

In [36]:
# Train the ID3 estimator on the same Iris arrays, then dump its tree as text.
estimator = Id3Estimator().fit(X, y)

iris_id3_rules = id3export_text(
    estimator.tree_,
    feature_names=iris.feature_names,
)
print(iris_id3_rules)


petal length (cm) <=2.45: 0 (50) 
petal length (cm) >2.45
|   petal width (cm) <=1.75
|   |   sepal length (cm) <=7.10
|   |   |   sepal width (cm) <=2.85: 1 (27/4) 
|   |   |   sepal width (cm) >2.85: 1 (22) 
|   |   sepal length (cm) >7.10: 2 (1) 
|   petal width (cm) >1.75
|   |   sepal length (cm) <=5.95
|   |   |   sepal width (cm) <=3.10: 2 (6) 
|   |   |   sepal width (cm) >3.10: 1 (1) 
|   |   sepal length (cm) >5.95: 2 (39) 



# play-tennis Datasets

In [37]:
import pandas as pd
from sklearn import preprocessing

In [38]:
# Read the raw play-tennis table and show it before any encoding happens.
df = pd.read_csv('play-tennis.csv')
print(df)

# Replace every column (including `Day`) with its integer label encoding,
# keeping one fitted encoder per column, in column order.
le_list = []
for column_name in df.columns:
    encoder = preprocessing.LabelEncoder()
    df[column_name] = encoder.fit_transform(df[column_name])
    le_list.append(encoder)

# Drop the first column (`Day`); the last remaining header is the target.
headers = df.columns[1:].tolist()
feature_names = headers[:-1]

    Day   Outlook Temperature Humidity    Wind PlayTennis
0    D1     Sunny         Hot     High    Weak         No
1    D2     Sunny         Hot     High  Strong         No
2    D3  Overcast         Hot     High    Weak        Yes
3    D4      Rain        Mild     High    Weak        Yes
4    D5      Rain        Cool   Normal    Weak        Yes
5    D6      Rain        Cool   Normal  Strong         No
6    D7  Overcast        Cool   Normal  Strong        Yes
7    D8     Sunny        Mild     High    Weak         No
8    D9     Sunny        Cool   Normal    Weak        Yes
9   D10      Rain        Mild   Normal    Weak        Yes
10  D11     Sunny        Mild   Normal  Strong        Yes
11  D12  Overcast        Mild     High  Strong        Yes
12  D13  Overcast         Hot   Normal    Weak        Yes
13  D14      Rain        Mild     High  Strong         No


## Decision Tree Classifier

In [39]:
# Fit a scikit-learn decision tree on the label-encoded play-tennis table.
# random_state is pinned: scikit-learn randomly permutes the candidate
# features at every split, and on a table this small ties between equally
# good splits are likely, so an unseeded tree may print differently on
# each run.
clf_tennis = tree.DecisionTreeClassifier(random_state=0)
# Keep the fitted LabelEncoders next to the model: le_list[1:-1] are the
# feature columns (everything after `Day`, before the target) and
# le_list[-1] is the target column.
# NOTE(review): nothing in this notebook reads these attributes back; they
# appear to be bookkeeping only -- confirm before relying on them.
clf_tennis.X_encoders_ = le_list[1:-1]
clf_tennis.y_encoders_ = le_list[-1]
clf_tennis = clf_tennis.fit(df[feature_names], df[headers[-1]])

# Thresholds in the printed rules refer to the integer label codes.
r = export_text(clf_tennis, feature_names=feature_names)
print(r)

|--- Outlook <= 0.50
|   |--- class: 1
|--- Outlook >  0.50
|   |--- Humidity <= 0.50
|   |   |--- Outlook <= 1.50
|   |   |   |--- Wind <= 0.50
|   |   |   |   |--- class: 0
|   |   |   |--- Wind >  0.50
|   |   |   |   |--- class: 1
|   |   |--- Outlook >  1.50
|   |   |   |--- class: 0
|   |--- Humidity >  0.50
|   |   |--- Wind <= 0.50
|   |   |   |--- Temperature <= 1.00
|   |   |   |   |--- class: 0
|   |   |   |--- Temperature >  1.00
|   |   |   |   |--- class: 1
|   |   |--- Wind >  0.50
|   |   |   |--- class: 1



## Decision Tree id3

In [40]:
# ID3 on the label-encoded play-tennis table.
# The per-column encoders are attached to the estimator before fit is called.
estimator = Id3Estimator()
estimator.X_encoders_, estimator.y_encoders_ = le_list[1:-1], le_list[-1]

# Features are every header except the last one, which is the target.
tennis_features = df[feature_names]
tennis_target = df[headers[-1]]
estimator = estimator.fit(tennis_features, tennis_target)

tennis_id3_rules = id3export_text(estimator.tree_, feature_names=feature_names)
print(tennis_id3_rules)


Outlook <=0.50: 1 (4) 
Outlook >0.50
|   Humidity <=0.50
|   |   Temperature <=1.50: 0 (2) 
|   |   Temperature >1.50
|   |   |   Wind <=0.50: 0 (1) 
|   |   |   Wind >0.50: 0 (1/1) 
|   Humidity >0.50
|   |   Wind <=0.50
|   |   |   Temperature <=1.00: 0 (1) 
|   |   |   Temperature >1.00: 1 (1) 
|   |   Wind >0.50: 1 (3) 



# Tugas Persamaan dan Perbedaan

DecisionTreeClassifier dari scikit-learn (versi teroptimasi dari algoritma CART) vs ID3 di buku

|Perbedaan|DecisionTreeClassifier (CART)|ID3|
| ------------- |-------------|-----|
| Penentuan atribut terbaik | Gini impurity (default) atau information gain (`criterion="entropy"`) | Information gain tertinggi |
| Penanganan label dari cabang setiap nilai atribut | Apabila (depth >= max_depth) atau (n_node_samples < min_samples_split) atau (n_node_samples < 2 * min_samples_leaf) atau (weighted_n_node_samples < 2 * min_weight_leaf) atau (impurity <= min_impurity_split) atau (semua sampel termasuk dalam satu kelas) atau (split.improvement + EPSILON < min_impurity_decrease) maka node tersebut menjadi leaf node (target); jika tidak, dilakukan split dengan atribut tertentu | Semua example harus termasuk dalam satu kelas (100% murni) |
| Penentuan label jika examples kosong di cabang tersebut | Dipilih kelas target paling banyak | Dipilih kelas target paling banyak |
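| Penanganan atribut kontinu | Ditangani: atribut kontinu di-split dengan threshold (mis. `petal width (cm) <= 0.80`); yang belum didukung oleh implementasi DecisionTreeClassifier pada sklearn adalah atribut kategorikal, sehingga harus di-encode menjadi angka terlebih dahulu | Tidak ditangani |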
| Penanganan atribut dengan missing values | Mengabaikan missing value ketika mensplit | Mengabaikan missing value |
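| Pruning dan parameter confidence | Post-pruning dengan minimal cost-complexity pruning (parameter `ccp_alpha`; default `ccp_alpha=0.0`, yaitu tanpa pruning); tidak ada parameter confidence | Tidak ada pruning |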


|Perbedaan|decision-tree-id3|ID3|
| ------------- |-------------|-----|
| Penentuan atribut terbaik | highest Information gain (optional: gain ratio) | Highest Information gain |
| Penanganan label dari cabang setiap nilai atribut | Apabila (jumlah sampel < jumlah minimal sampel split (default = 1)) atau (depth tree >= defined max_depth) atau (entropy_decrease < defined minimum_entropy_decrease (default = 0)) maka node tersebut adalah leaf node (target), Jika tidak maka dilakukan split dengan attribute tertentu | all Example should belong to one class (100% pure) |
| Penentuan label jika examples kosong di cabang tersebut | Dipilih kelas target paling banyak | Dipilih kelas target paling banyak |
| Penanganan atribut kontinu | Ditangani, float dijadikan int sebelum splitting | Tidak ditangani |
| Penanganan atribut dengan missing values | Mengabaikan missing value ketika mensplit | Mengabaikan missing value |
| Pruning dan parameter confidence | Post-Pruned | Tidak ada pruning |
