In [19]:
import pandas as pd
import numpy as np
from scipy.stats import entropy

In [20]:
# Sanity check: two equally frequent outcomes (counts 4 and 4) should carry
# exactly one bit of entropy when measured in base 2.
X = [4, 4]
entropy(X, base=2)

1.0

In [21]:
# Path to the lecture workbook, relative to this notebook; the 'Train' sheet
# is the table analysed by hand in the cells below.
EXAMPLE_1 = '../data/lec9/Example1.xlsx'
tr_df = pd.read_excel(EXAMPLE_1, sheet_name='Train')

In [22]:
# Preview up to the first 20 rows of the training sheet.
tr_df.head(20)

Unnamed: 0,Instance,Outlook,Temperature,Humidity,Windy,Play
0,1,sunny,hot,high,False,no
1,2,sunny,hot,high,True,no
2,3,overcast,hot,high,False,yes
3,4,rainy,mild,high,False,yes
4,5,rainy,cool,normal,False,yes
5,6,rainy,cool,normal,True,no
6,7,overcast,cool,normal,True,yes
7,8,sunny,mild,high,False,no
8,9,sunny,cool,normal,False,yes
9,10,rainy,mild,normal,False,yes


In [23]:
# Entropy (in bits) of the Play label over the whole training set. This is the
# baseline that the information gain of each candidate split is measured against.
all_ent = entropy(tr_df['Play'].value_counts(), base=2)
all_ent

0.940285958670631

In [24]:
# Partition the training rows by Outlook value, one subset per branch.
sunny, rainy, overcast = [
    tr_df.loc[tr_df['Outlook'] == outlook]
    for outlook in ('sunny', 'rainy', 'overcast')
]

# Entropy (in bits) of the Play label inside each branch.
sunny_ent, rainy_ent, overcast_ent = [
    entropy(branch['Play'].value_counts(), base=2)
    for branch in (sunny, rainy, overcast)
]

print('Sunny:', sunny_ent)
print('Rainy:', rainy_ent)
print('Overcast:', overcast_ent)

Sunny: 0.9709505944546688
Rainy: 0.9709505944546688
Overcast: 0.0


In [25]:
# Hand check of the weighted entropy after splitting on Outlook, using the
# sunny and rainy branch entropies rounded to 0.97 (the overcast branch has
# entropy 0.0 and contributes nothing).
# NOTE(review): 5, 5 and 14 are presumably the sunny, rainy and total row
# counts -- confirm against the data.
(5 * 0.97 + 5 * 0.97) / 14

0.6928571428571428

In [26]:
def get_ent(indx, all_ent, _df, target='Play'):
    """Return the weighted average entropy of `target` after splitting `_df` on `indx`.

    Each distinct value of `indx` defines one branch. The entropy (in bits) of
    the `target` labels is computed per branch, and the branches are averaged
    with weights proportional to their row counts.

    Parameters
    ----------
    indx : str
        Name of the column to split on.
    all_ent : float
        Entropy of the unsplit data. Not used in the computation; kept so
        existing calls keep working.
    _df : pandas.DataFrame
        Rows to evaluate; must contain the `indx` and `target` columns.
    target : str, default 'Play'
        Name of the class-label column.

    Returns
    -------
    float
        Weighted post-split entropy; 0.0 when there are no rows to evaluate.
    """
    if len(_df) == 0:
        return 0.0

    # Rows: values of `indx`; columns: whichever class labels actually occur.
    # No specific labels are assumed, so a subset that contains a single class
    # (a pure branch) is handled instead of failing on a missing column.
    counts = _df.groupby([indx, target]).size().unstack(fill_value=0)

    sizes = counts.sum(axis=1)
    # Empty branches would make entropy() divide by zero; they carry no weight anyway.
    populated = sizes > 0
    counts, sizes = counts[populated], sizes[populated]
    if sizes.empty:
        return 0.0

    # entropy() reduces along axis 0, so transpose to get one value per branch.
    branch_ent = entropy(counts.values.T, base=2)
    return (sizes * branch_ent).sum() / sizes.sum()

In [27]:
# Candidate split attributes: every column except the first and the last
# (presumably the row id and the Play label -- confirm against the sheet).
feat = tr_df.columns[1:-1]

# Weighted post-split entropy for each candidate, in the same order as `feat`.
_all_vars = [get_ent(attribute, all_ent, tr_df) for attribute in feat]

# The lowest remaining entropy corresponds to the highest information gain.
_min_ent = min(_all_vars)
_best_pos = _all_vars.index(_min_ent)

print('Attribute with highest information gain: {}, {}'.format(
    feat[_best_pos], all_ent - _min_ent))

Attribute with highest information gain: Outlook, 0.246749819774439


In [28]:
### Sunny branch
# Evaluate every candidate attribute on the sunny subset only. The baseline
# handed to get_ent is the entropy of this branch (sunny_ent), not the entropy
# of the whole training set.
_sunny_all_vars = [get_ent(x, sunny_ent, sunny) for x in feat]

# Lowest remaining entropy == highest information gain within the branch.
_sunny_min_ent = min(_sunny_all_vars)
print('Attribute with highest information gain: {}, {}'.format(
    feat[_sunny_all_vars.index(_sunny_min_ent)], (sunny_ent - _sunny_min_ent)))



Attribute with highest information gain: Humidity, 0.9709505944546688


In [29]:
### Rainy branch
# Evaluate every candidate attribute on the rainy subset only. The baseline
# handed to get_ent is the entropy of this branch (rainy_ent), not the entropy
# of the whole training set.
_rainy_all_vars = [get_ent(x, rainy_ent, rainy) for x in feat]

# Lowest remaining entropy == highest information gain within the branch.
_rainy_min_ent = min(_rainy_all_vars)
print('Attribute with highest information gain: {}, {}'.format(
    feat[_rainy_all_vars.index(_rainy_min_ent)], (rainy_ent - _rainy_min_ent)))



Attribute with highest information gain: Windy, 0.9709505944546688


In [40]:
# Implementation

import graphviz, pydot
from sklearn import tree

EXAMPLE_1 = '../data/lec9/Example1.xlsx'
new_df = pd.read_excel(EXAMPLE_1, sheet_name='Train')

# Play is a categorical target, so a classification tree with an impurity
# criterion ('entropy' or 'gini') applies. A continuous target would instead
# need a regression tree (tree.DecisionTreeRegressor), which splits on
# variance reduction.
# random_state is pinned because scikit-learn permutes the features at every
# split; without it, ties between equally good splits can produce a different
# tree on each run.
clf = tree.DecisionTreeClassifier(criterion='entropy', random_state=0)

In [31]:
# Integer-encode every column the classifier needs. lbl_d keeps, per column,
# the sorted original labels so that a code can be mapped back to its label
# (code i corresponds to lbl_d[column][i]).
atts = new_df.columns[1:-1]
lbl_d = {}
num_df = pd.DataFrame(index=new_df.index)

# Feature columns.
for _a in atts:
    _codes, _labels = pd.factorize(new_df[_a], sort=True)
    num_df[_a] = _codes
    lbl_d[_a] = _labels

# Target column.
_codes, _labels = pd.factorize(new_df['Play'], sort=True)
num_df['Play'] = _codes
lbl_d['Play'] = _labels

# Feature matrix and target vector for scikit-learn.
X = num_df.loc[:, atts]
y = num_df.loc[:, 'Play']

In [37]:
# Train the entropy-criterion tree on the integer-encoded features X and
# target y; clf_app is what the export cell visualises.
clf_app = clf.fit(X, y)

In [38]:
# Write the fitted tree as Graphviz DOT source to tree.dot.
# feature_names / class_names are passed as plain lists of str: that is the
# type scikit-learn documents for them, whereas atts and lbl_d['Play'] are
# pandas Index objects. class_names must follow ascending class order, which
# the sorted factorize labels already do.
# NOTE: with out_file set, export_graphviz returns None, so dot_data is None;
# the assignment is kept only so the name still exists for later cells.
dot_data = tree.export_graphviz(
    clf_app,
    out_file='tree.dot',
    feature_names=list(atts),
    class_names=list(lbl_d['Play']),
)

In [39]:
# Load the DOT file produced by the export cell. The one-element tuple
# unpacking requires the file to contain exactly one graph and fails otherwise.
(graph, ) = pydot.graph_from_dot_file('tree.dot')
# Render the graph to an image file next to the notebook.
graph.write_png('tree.png')