In [50]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_graphviz

In [51]:
df = pd.read_csv("fertility.csv")

In [52]:
df.head()

Unnamed: 0,season,age,child diseases,accident or trauma,surgical intervention,high fevers in last year,alcohol consumption,smoking habit,hours spent sitting,diagnosis
0,-0.33,0.69,0,1,1,0,0.8,0,0.88,N
1,-0.33,0.94,1,0,1,0,0.8,1,0.31,O
2,-0.33,0.5,1,0,0,0,1.0,-1,0.5,N
3,-0.33,0.75,0,1,1,0,1.0,-1,0.38,N
4,-0.33,0.67,1,1,0,0,0.8,-1,0.5,O


In [53]:
df.tail()

Unnamed: 0,season,age,child diseases,accident or trauma,surgical intervention,high fevers in last year,alcohol consumption,smoking habit,hours spent sitting,diagnosis
95,-1.0,0.67,1,0,0,0,1.0,-1,0.5,N
96,-1.0,0.61,1,0,0,0,0.8,0,0.5,N
97,-1.0,0.67,1,1,1,0,1.0,-1,0.31,N
98,-1.0,0.64,1,0,1,0,1.0,0,0.19,N
99,-1.0,0.69,0,1,1,0,0.6,-1,0.19,N


In [54]:
print("* Diagnosis:", df["diagnosis"].unique(), sep="\n")

* Diagnosis:
['N' 'O']


In [55]:
# Mapping target string (diagnosis) menjadi target integer

def encode_target(df, target_column):
    df_mod = df.copy()
    targets = df_mod[target_column].unique()
    map_to_int = {name: n for n, name in enumerate(targets)}
    df_mod["target"] = df_mod[target_column].replace(map_to_int)

    return (df_mod, targets)

In [56]:
df2, targets = encode_target(df, "diagnosis")

In [57]:
df2.head()

Unnamed: 0,season,age,child diseases,accident or trauma,surgical intervention,high fevers in last year,alcohol consumption,smoking habit,hours spent sitting,diagnosis,target
0,-0.33,0.69,0,1,1,0,0.8,0,0.88,N,0
1,-0.33,0.94,1,0,1,0,0.8,1,0.31,O,1
2,-0.33,0.5,1,0,0,0,1.0,-1,0.5,N,0
3,-0.33,0.75,0,1,1,0,1.0,-1,0.38,N,0
4,-0.33,0.67,1,1,0,0,0.8,-1,0.5,O,1


In [87]:
atribut = list(df2.columns[:9])
print("* Atribut:", atribut, sep="\n")

* Atribut:
['season', 'age', 'child diseases', 'accident or trauma', 'surgical intervention', 'high fevers in last year', 'alcohol consumption', 'smoking habit', 'hours spent sitting']


In [88]:
y = df2["target"]
X = df2[atribut]

In [89]:
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size = 0.25)

In [90]:
dt = DecisionTreeClassifier(criterion="entropy")

In [91]:
dt.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [92]:
print('Akurasi training set: {:.3}'.format(dt.score(X_train, y_train)))
print('Akurasi test set: {:.3f}'.format(dt.score(X_test, y_test)))

Akurasi training set: 1.0
Akurasi test set: 0.840


In [93]:
# Visualisasi

export_graphviz(dt, out_file='fertility_tree.dot', class_names=['Normal', 'Altered'], feature_names=atribut, filled=True)

In [94]:
print(y_test)

73    0
47    0
70    1
43    0
10    0
12    0
63    0
67    0
55    0
93    1
11    0
88    0
42    0
40    0
13    0
94    0
2     0
9     0
84    1
54    0
27    1
83    0
19    1
45    0
29    1
Name: target, dtype: int64


In [95]:
prediksi = dt.predict(X_test)

In [112]:
for i in prediksi:
    print(i)

0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
1


In [97]:
def denormalize_age(normalized_age):
    return ((normalized_age*1000)-1)*(36-18) / 999 + 18

def denormalize_hour(normalized_hour):
    return ((normalized_hour*1000)-1)*(16-1) / 999 + 1