In [180]:
import warnings

import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz


In [181]:
# Toy "buys a computer" training set. Each record is
# [Age, Income, Student, Credit_rating, Buys_computer]; the last entry is the class label.
column_names = ["Age", "Income", "Student", "Credit_rating", "Buys_computer"]

data = [
    ["<=30", "High", "No", "Fair", "No"],
    ["<=30", "High", "No", "Excellent", "No"],
    ["31...40", "High", "No", "Fair", "Yes"],
    [">40", "Medium", "No", "Fair", "Yes"],
    [">40", "Low", "Yes", "Fair", "Yes"],
    [">40", "Low", "Yes", "Excellent", "No"],
    ["31...40", "Low", "Yes", "Excellent", "Yes"],
    ["<=30", "Medium", "No", "Fair", "No"],
    ["<=30", "Low", "Yes", "Fair", "Yes"],
    [">40", "Medium", "Yes", "Fair", "Yes"],
    ["<=30", "Medium", "Yes", "Excellent", "Yes"],
    ["31...40", "Medium", "No", "Excellent", "Yes"],
    ["31...40", "High", "Yes", "Fair", "Yes"],
    [">40", "Medium", "No", "Excellent", "No"],
    ["<=30", "Medium", "No", "Excellent", "No"],
    ["<=30", "Low", "No", "Fair", "No"],
    ["<=30", "Low", "No", "Excellent", "No"],
    ["31...40", "Low", "Yes", "Fair", "Yes"],
    [">40", "Medium", "Yes", "Excellent", "Yes"],
    ["31...40", "High", "No", "Excellent", "Yes"],
]

# Row labels start at 1 (not pandas' default 0) so they read like record numbers.
row_labels = range(1, len(data) + 1)

df = pd.DataFrame(data, index=row_labels, columns=column_names)
df

Unnamed: 0,Age,Income,Student,Credit_rating,Buys_computer
1,<=30,High,No,Fair,No
2,<=30,High,No,Excellent,No
3,31...40,High,No,Fair,Yes
4,>40,Medium,No,Fair,Yes
5,>40,Low,Yes,Fair,Yes
6,>40,Low,Yes,Excellent,No
7,31...40,Low,Yes,Excellent,Yes
8,<=30,Medium,No,Fair,No
9,<=30,Low,Yes,Fair,Yes
10,>40,Medium,Yes,Fair,Yes


In [182]:
# Class balance of the target column: number of labelled rows overall,
# then the number of rows carrying each of the two labels.
buys = df['Buys_computer']
n_total = buys.count()
n_yes = buys[buys == 'Yes'].count()
n_no = buys[buys == 'No'].count()

print("Buys_computer total: ", n_total)
print("Buys_computer yes:   ", n_yes)
print("Buys_computer no:    ", n_no)


Buys_computer total:  20
Buys_computer yes:    12
Buys_computer no:     8


## Better representation of the groups
Shown here for 'Credit_rating'.

In [183]:
# Count the records in every (feature value, class label) combination.
# The feature is named once and reused for both grouping and sorting; the
# original looked the sort key up by position (df.columns[3]), which only
# matched the grouping key as long as the column order never changed.
group_col = 'Credit_rating'
target_col = 'Buys_computer'

node = (
    df.groupby([group_col, target_col])[target_col]
    .count()
    # unstack/stack round trip: combinations that never occur get a count of 0
    # instead of being absent from the result
    .unstack(fill_value=0)
    .stack()
    .to_frame('count')
    # feature values ascending; within each value the class label descending
    .sort_values(by=[group_col, target_col], ascending=[True, False])
    .reset_index()
)

print("total count : ", node['count'].sum())
node

total count :  20


Unnamed: 0,Credit_rating,Buys_computer,count
0,Excellent,Yes,5
1,Excellent,No,5
2,Fair,Yes,7
3,Fair,No,3


## Select data and classes

In [184]:
# NOTE(review): this silences *every* warning for the rest of the session, which
# can hide real problems. It is kept so later cells stay as quiet as before;
# consider narrowing it to a specific warning category.
warnings.filterwarnings("ignore")

# Features: every column except the last. Taken as an explicit copy because the
# encoding cell assigns new values into `x`; writing into a bare slice of `df`
# is what triggers pandas' chained-assignment warning and leaves it ambiguous
# whether `df` itself is modified.
x = df.iloc[:, :-1].copy()

# Target: the last column (the class label).
y = df.iloc[:, -1]

## Data encoding for fit function
Encode the string values as integers, because the `fit` function does not accept strings, only numeric values.

In [185]:
# Replace each categorical feature column with integer codes.
# One encoder per column is kept in `encoders`, so every mapping stays available
# afterwards (e.g. encoders['Age'].classes_ or .inverse_transform(...)); the
# original reused a single encoder and only the last column's mapping survived.
#
# LabelEncoder numbers the labels in sorted order, so the codes are arbitrary
# identifiers rather than a meaningful ranking of the categories.
encoders = {}
for column in ('Age', 'Income', 'Student', 'Credit_rating'):
    le = LabelEncoder()
    x[column] = le.fit_transform(x[column].astype(str))
    encoders[column] = le
# As before, `le` ends up as the encoder fitted on 'Credit_rating'.

## Build the decision tree

In [186]:
# Entropy-based decision tree, limited to four levels.
# random_state is pinned so that "Restart & Run All" always yields the same
# tree: scikit-learn shuffles the candidate features at every split, so without
# a seed equally good splits may be resolved differently from run to run.
tree_clf = DecisionTreeClassifier(
    max_depth=4,
    criterion='entropy',
    random_state=0,
)
tree_clf.fit(x, y)


DecisionTreeClassifier(criterion='entropy', max_depth=4)

## Export the graph

In [187]:
# Render the fitted tree as Graphviz DOT source. With no output file given,
# export_graphviz returns the DOT text, which is printed so it can be copied
# into a Graphviz renderer.
dot_source = export_graphviz(
    tree_clf,
    feature_names=list(x.columns),
    # Passed as a plain list, the same way as feature_names.
    # NOTE(review): some scikit-learn releases reportedly reject a NumPy array
    # here - confirm against the installed version.
    class_names=list(tree_clf.classes_),
    rounded=True,
    filled=True,
)
print(dot_source)

digraph Tree {
node [shape=box, style="filled, rounded", color="black", fontname=helvetica] ;
edge [fontname=helvetica] ;
0 [label="Age <= 0.5\nentropy = 0.971\nsamples = 20\nvalue = [8, 12]\nclass = Yes", fillcolor="#bddef6"] ;
1 [label="entropy = 0.0\nsamples = 6\nvalue = [0, 6]\nclass = Yes", fillcolor="#399de5"] ;
0 -> 1 [labeldistance=2.5, labelangle=45, headlabel="True"] ;
2 [label="Student <= 0.5\nentropy = 0.985\nsamples = 14\nvalue = [8, 6]\nclass = No", fillcolor="#f8e0ce"] ;
0 -> 2 [labeldistance=2.5, labelangle=-45, headlabel="False"] ;
3 [label="Age <= 1.5\nentropy = 0.544\nsamples = 8\nvalue = [7, 1]\nclass = No", fillcolor="#e99355"] ;
2 -> 3 ;
4 [label="entropy = 0.0\nsamples = 6\nvalue = [6, 0]\nclass = No", fillcolor="#e58139"] ;
3 -> 4 ;
5 [label="Credit_rating <= 0.5\nentropy = 1.0\nsamples = 2\nvalue = [1, 1]\nclass = No", fillcolor="#ffffff"] ;
3 -> 5 ;
6 [label="entropy = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = No", fillcolor="#e58139"] ;
5 -> 6 ;
7 [label="ent

## The drawing of the graph is in the PDF document




