### Decision Tree on Weather Data

Decision Tree works for both categorical and contineous data.

In [1]:
from pprint import pprint
from math import log
import numpy as np

In [2]:
# outlook, Temperature, Humidity, Wind, Playd Football(yes/no)
training_data= [
    ['Sunny', 'Hot', 'High', 'Weak', 'No'],
    ['Sunny', 'Hot', 'High', 'Strong', 'No'],
    ['Overcast', 'Hot', 'High', 'Weak', 'Yes'],
    ['Rain', 'Mild', 'High', 'Weak', 'Yes'],
    ['Rain', 'Cool', 'Normal', 'Weak', 'Yes'],
    ['Rain', 'Cool', 'Normal', 'Strong', 'No'],
    ['Overcast', 'Cool', 'Normal', 'Strong', 'Yes'],
    ['Sunny', 'Mild', 'High', 'Weak', 'No'],
    ['Sunny', 'Cool', 'Normal', 'Weak', 'Yes'],
    ['Rain', 'Mild', 'Normal', 'Weak', 'Yes'],
    ['Sunny', 'Mild', 'Normal', 'Strong', 'Yes'],
    ['Overcast', 'Mild', 'High', 'Strong', 'Yes'],
    ['Overcast', 'Hot', 'Normal', 'Weak', 'Yes'],
    ['Rain', 'Mild', 'High', 'Strong', 'No']
]

pprint(training_data)

[['Sunny', 'Hot', 'High', 'Weak', 'No'],
 ['Sunny', 'Hot', 'High', 'Strong', 'No'],
 ['Overcast', 'Hot', 'High', 'Weak', 'Yes'],
 ['Rain', 'Mild', 'High', 'Weak', 'Yes'],
 ['Rain', 'Cool', 'Normal', 'Weak', 'Yes'],
 ['Rain', 'Cool', 'Normal', 'Strong', 'No'],
 ['Overcast', 'Cool', 'Normal', 'Strong', 'Yes'],
 ['Sunny', 'Mild', 'High', 'Weak', 'No'],
 ['Sunny', 'Cool', 'Normal', 'Weak', 'Yes'],
 ['Rain', 'Mild', 'Normal', 'Weak', 'Yes'],
 ['Sunny', 'Mild', 'Normal', 'Strong', 'Yes'],
 ['Overcast', 'Mild', 'High', 'Strong', 'Yes'],
 ['Overcast', 'Hot', 'Normal', 'Weak', 'Yes'],
 ['Rain', 'Mild', 'High', 'Strong', 'No']]


In [3]:
def count_class_freq(rows):
    #last column is the class
    classes={} #dictionary
    for row in rows:
        c=row[-1]
        if c not in classes:
            classes[c]=1
        else:
            classes[c]+=1
    return classes

In [4]:
count_class_freq(training_data)

{'No': 5, 'Yes': 9}

#### Impurity
1. How much data is mixed up.
2. for two class if one class present, impurity=0
3. if two class present, impurity=0.5


#### Information Gain
ig= current impurity - ( split into groups, weighted sum of impurity of the groups)

#### Gini index and Entropy
Decision tree algorithms use information gain to split a node. Gini index or entropy is the criterion for calculating information gain. 
Both gini and entropy are measures of impurity of a node. A node having multiple classes is impure whereas a node having only one class is pure.  Entropy in statistics is analogous to entropy in thermodynamics where it signifies disorder. If there are multiple classes in a node, there is disorder in that node. 
 
Information gain is the entropy of parent node minus sum of weighted entropies of child nodes. 
 Weight of a child node is number of samples in the node/total samples of all child nodes. Similarly information gain is calculated with gini score. 
 <img src='imgs/ginientropy.jpg'>
 #### Entropy vs gini
<img src='imgs/entropy_vs_gini.png' width=40%>

In [5]:
def gini(rows):
    classes=count_class_freq(rows)
    impurity = 1
    for c in classes:
        prob_of_c = classes[c] / float(len(rows))
        impurity -= prob_of_c**2
    return impurity

In [6]:
def entropy(rows):
    classes=count_class_freq(rows)
    impurity = 0
    for c in classes:
        prob_of_c = classes[c] / float(len(rows))
        impurity -= prob_of_c*  log(prob_of_c, 2)  #2 base log.
    return impurity

In [7]:
gini(training_data)

0.4591836734693877

In [8]:
entropy(training_data)

0.9402859586706309

In [9]:
npd=np.array(training_data)
vls=np.unique( npd[:,0] )
print(vls)

['Overcast' 'Rain' 'Sunny']


In [10]:
def split(rows, col):
    ds={}
    for row in rows:
        cv=row[col]
        if cv in ds:
            ds[cv].append(row)
        else:
            ds[cv]=[row]
    return ds

In [11]:
#Find Entropy of the 1 set data (main data) Parent Entropy
es=entropy(training_data)
print(es)

0.9402859586706309


In [1]:
# for groupname,rows in sps.items():
#     print(groupname)

In [13]:
def total_rows(groups):
    nr=0
    for groupname,rows in sps.items():
        nr+=len(rows)
    return nr

In [2]:
# print(total_rows(sps))

In [15]:
#Information gain for split. sum of weighted entropy.
def ig(groups, current_entropy): 
    groups_entropy=0
    for groupnam, rows in groups.items():
        group_entropy=entropy(rows)
#         print('group: ', groupnam, ' et=', group_entropy)
        weighted_entropy= ( len(rows)/total_rows(groups) )*group_entropy
        groups_entropy+=weighted_entropy
    return current_entropy-groups_entropy

In [16]:
#Split data based on column1 (Outlook)

sps=split(training_data, 0)
wtv=0  #weighted average.
for sp in sps:
    dt=sps[sp]
    et=entropy(dt)
    sw= (len(dt)/len(training_data) )*et  #weighted entropy. or probability*entropy
    wtv+=sw
    print('Group: ', sp, ' Total=', len(dt), ' et=', et)
    pprint(dt)
print('weighted entropy: ', wtv)

Group:  Sunny  Total= 5  et= 0.9709505944546686
[['Sunny', 'Hot', 'High', 'Weak', 'No'],
 ['Sunny', 'Hot', 'High', 'Strong', 'No'],
 ['Sunny', 'Mild', 'High', 'Weak', 'No'],
 ['Sunny', 'Cool', 'Normal', 'Weak', 'Yes'],
 ['Sunny', 'Mild', 'Normal', 'Strong', 'Yes']]
Group:  Overcast  Total= 4  et= 0.0
[['Overcast', 'Hot', 'High', 'Weak', 'Yes'],
 ['Overcast', 'Cool', 'Normal', 'Strong', 'Yes'],
 ['Overcast', 'Mild', 'High', 'Strong', 'Yes'],
 ['Overcast', 'Hot', 'Normal', 'Weak', 'Yes']]
Group:  Rain  Total= 5  et= 0.9709505944546686
[['Rain', 'Mild', 'High', 'Weak', 'Yes'],
 ['Rain', 'Cool', 'Normal', 'Weak', 'Yes'],
 ['Rain', 'Cool', 'Normal', 'Strong', 'No'],
 ['Rain', 'Mild', 'Normal', 'Weak', 'Yes'],
 ['Rain', 'Mild', 'High', 'Strong', 'No']]
weighted entropy:  0.6935361388961918


In [17]:
#Outlook, col=0
sps=split(training_data, 0)
ig(sps, es)

0.2467498197744391

In [18]:
#Temp, col=1
sps=split(training_data, 1)
ig(sps, es)

0.029222565658954647

In [16]:
#Humidity col=2
sps=split(training_data, 2)
ig(sps, es)

0.15183550136234136

In [17]:
#Wind col=3
sps=split(training_data, 3)
ig(sps, es)

0.04812703040826927

In [18]:
#Largest Entropy Gain is for column 0 (Outlook)
groups=split(training_data, 0)
for groupname, rows in groups.items():
    print('Group: ', groupname, ' entropy: ', entropy(rows))
    pprint(rows)

Group:  Sunny  entropy:  0.9709505944546686
[['Sunny', 'Hot', 'High', 'Weak', 'No'],
 ['Sunny', 'Hot', 'High', 'Strong', 'No'],
 ['Sunny', 'Mild', 'High', 'Weak', 'No'],
 ['Sunny', 'Cool', 'Normal', 'Weak', 'Yes'],
 ['Sunny', 'Mild', 'Normal', 'Strong', 'Yes']]
Group:  Overcast  entropy:  0.0
[['Overcast', 'Hot', 'High', 'Weak', 'Yes'],
 ['Overcast', 'Cool', 'Normal', 'Strong', 'Yes'],
 ['Overcast', 'Mild', 'High', 'Strong', 'Yes'],
 ['Overcast', 'Hot', 'Normal', 'Weak', 'Yes']]
Group:  Rain  entropy:  0.9709505944546686
[['Rain', 'Mild', 'High', 'Weak', 'Yes'],
 ['Rain', 'Cool', 'Normal', 'Weak', 'Yes'],
 ['Rain', 'Cool', 'Normal', 'Strong', 'No'],
 ['Rain', 'Mild', 'Normal', 'Weak', 'Yes'],
 ['Rain', 'Mild', 'High', 'Strong', 'No']]


#### A branch with entropy 0 is a leaf node.
#### A branch with entropy >0 needs further splitting

#### Fro branch Sunny

In [19]:
sunny=groups['Sunny']
rain=groups['Rain']

In [20]:
pprint(sunny)

[['Sunny', 'Hot', 'High', 'Weak', 'No'],
 ['Sunny', 'Hot', 'High', 'Strong', 'No'],
 ['Sunny', 'Mild', 'High', 'Weak', 'No'],
 ['Sunny', 'Cool', 'Normal', 'Weak', 'Yes'],
 ['Sunny', 'Mild', 'Normal', 'Strong', 'Yes']]


In [21]:
pprint(rain)

[['Rain', 'Mild', 'High', 'Weak', 'Yes'],
 ['Rain', 'Cool', 'Normal', 'Weak', 'Yes'],
 ['Rain', 'Cool', 'Normal', 'Strong', 'No'],
 ['Rain', 'Mild', 'Normal', 'Weak', 'Yes'],
 ['Rain', 'Mild', 'High', 'Strong', 'No']]


In [22]:
#Now for branch Sunny

pes=entropy(sunny)
print('Sunny Parent Entropy: ', pes)

Sunny Parent Entropy:  0.9709505944546686


In [34]:
#Now calculate IG for the remaining features/columns
sunny_groups=split(sunny, 1)
for gn,rows in sunny_groups.items():
    print(gn)
    pprint(rows)

Hot
[['Sunny', 'Hot', 'High', 'Weak', 'No'],
 ['Sunny', 'Hot', 'High', 'Strong', 'No']]
Mild
[['Sunny', 'Mild', 'High', 'Weak', 'No'],
 ['Sunny', 'Mild', 'Normal', 'Strong', 'Yes']]
Cool
[['Sunny', 'Cool', 'Normal', 'Weak', 'Yes']]


In [35]:
sps=split(sunny, 1)
wtv=0  #weighted average.
for sp in sps:
    dt=sps[sp]
    et=entropy(dt)
    sw= (len(dt)/len(sunny) )*et  #weighted entropy. or probability*entropy
    wtv+=sw
    print('Group: ', sp, ' Total=', len(dt), ' et=', et)
    pprint(dt)
print('weighted entropy: ', wtv)

Group:  Hot  Total= 2  et= 0.0
[['Sunny', 'Hot', 'High', 'Weak', 'No'],
 ['Sunny', 'Hot', 'High', 'Strong', 'No']]
Group:  Mild  Total= 2  et= 1.0
[['Sunny', 'Mild', 'High', 'Weak', 'No'],
 ['Sunny', 'Mild', 'Normal', 'Strong', 'Yes']]
Group:  Cool  Total= 1  et= 0.0
[['Sunny', 'Cool', 'Normal', 'Weak', 'Yes']]
weighted entropy:  0.4


In [112]:
pes-wtv

0.5709505944546686

In [23]:
#Temp, col1
sunny_groups=split(sunny, 1)
igs=ig(sunny_groups, pes)
print('Temp. igs=',igs)

#Humidity, col2
sunny_groups=split(sunny, 2)
igs=ig(sunny_groups, pes)
print('Humidity. igs=',igs)

#Windy, col3
sunny_groups=split(sunny, 3)
igs=ig(sunny_groups, pes)
print('Wind. igs=',igs)

Temp. igs= 0.8280934515975258
Humidity. igs= 0.9709505944546686
Wind. igs= 0.6313157728715637


### From this we get Humidity has max information gain

In [24]:
#Largest Entropy Gain is for column 2 (Humidity)
groups=split(sunny, 2)
for groupname, rows in groups.items():
    print('Group: ', groupname, ' entropy: ', entropy(rows))
    pprint(rows)

Group:  High  entropy:  0.0
[['Sunny', 'Hot', 'High', 'Weak', 'No'],
 ['Sunny', 'Hot', 'High', 'Strong', 'No'],
 ['Sunny', 'Mild', 'High', 'Weak', 'No']]
Group:  Normal  entropy:  0.0
[['Sunny', 'Cool', 'Normal', 'Weak', 'Yes'],
 ['Sunny', 'Mild', 'Normal', 'Strong', 'Yes']]


#### Here both are Leaf nodes. 

#### Now for branch Rain

In [25]:
pes=entropy(rain)
print('Rain Parent Entropy: ', pes)

Rain Parent Entropy:  0.9709505944546686


In [26]:
#Temp, col1
rain_groups=split(rain, 1)
igs=ig(rain_groups, pes)
print('Temp. igs=',igs)

#Humidity, col2
rain_groups=split(rain, 2)
igs=ig(rain_groups, pes)
print('Humidity. igs=',igs)

#Wind, col3
rain_groups=split(rain, 3)
igs=ig(rain_groups, pes)
print('Wind. igs=',igs)

Temp. igs= 0.6313157728715637
Humidity. igs= 0.6313157728715637
Wind. igs= 0.9709505944546686


#### From this we get Wind has maximum information gain

In [27]:
#Largest Entropy Gain is for column 3 (Wind)
groups=split(rain, 3)
for groupname, rows in groups.items():
    print('Group: ', groupname, ' entropy: ', entropy(rows))
    pprint(rows)

Group:  Weak  entropy:  0.0
[['Rain', 'Mild', 'High', 'Weak', 'Yes'],
 ['Rain', 'Cool', 'Normal', 'Weak', 'Yes'],
 ['Rain', 'Mild', 'Normal', 'Weak', 'Yes']]
Group:  Strong  entropy:  0.0
[['Rain', 'Cool', 'Normal', 'Strong', 'No'],
 ['Rain', 'Mild', 'High', 'Strong', 'No']]


#### Here both are leaf nodes.