In [4]:
import numpy as np
import pandas as pd

## How to determine the root of the tree?
* Conditional Probability
* Entropy
* Informational gain 

## Topics Covered:
* Calculating entropy
* Determining Root of Decision through Mutual Information/Information Gain

### Entropy is the measure of uncertainty

In [1]:

def entropy(probabilities):
    total = np.array([-i*np.log2(i) for i in probabilities]).sum()
    return total


p = [.5, .5]
entropy(p)

1.0

In [2]:
def entropy(arr):
    total = []
    for num in arr:
        total.append(num*np.log2(num))
        
    return (-1 * (sum(total)))

In [12]:
df = pd.read_csv('../data/tennis.txt', delimiter="\t", header=None, names=['weather', 'temp', 'humidity', 'wind', 'play'])

In [13]:
df.head()

Unnamed: 0,weather,temp,humidity,wind,play
1,Sunny,Hot,High,Weak,No
2,Sunny,Hot,High,Strong,No
3,Overcast,Hot,High,Weak,Yes
4,Rain,Mild,High,Weak,Yes
5,Rain,Cool,Normal,Weak,Yes


In [14]:
print(entropy([5/14, 9/14]))

0.9402859586706311


In [37]:
wind_weak_prob = len(df[df["wind"] == "Weak"]) / len(df[df["play"] == "Yes"])
wind_weak_prob

0.8888888888888888

In [38]:
wind_strong_prob = len(df[df["wind"] == "Strong"]) / len(df[df["play"] == "Yes"])
wind_strong_prob

0.6666666666666666

In [39]:
print(entropy([2/8, 6/8]))

0.8112781244591328


In [40]:
print(entropy([.5, .5]))

1.0


In [41]:
prob_of_wind_weak = len(df[df["wind"] == "Weak"])/len(df["wind"])
prob_of_wind_weak

0.5714285714285714

In [42]:
8/14

0.5714285714285714

## Mutual Information or Information Gain

In [44]:
information_gain = entropy([2/8, 6/8]) - 0.81 * 8/14 + 1.0 * 6/14
information_gain

0.7769924101734185

## The feature with the highest information gain is OUR ROOT

In [52]:
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier

In [53]:
data = pd.read_csv('../data/tennis.txt', delimiter="\t", header=None, names=['weather', 'temp', 'humidity', 'wind', 'play'])
data_encoded = data.apply(preprocessing.LabelEncoder().fit_transform)
print(data_encoded)

    weather  temp  humidity  wind  play
1         2     1         0     1     0
2         2     1         0     0     0
3         0     1         0     1     1
4         1     2         0     1     1
5         1     0         1     1     1
6         1     0         1     0     0
7         0     0         1     0     1
8         2     2         0     1     0
9         2     0         1     1     1
10        1     2         1     1     1
11        2     2         1     0     1
12        0     2         0     0     1
13        0     1         1     1     1
14        1     2         0     0     0


In [57]:
clf = DecisionTreeClassifier(criterion='entropy', max_depth=3)

In [60]:
clf.fit(data_encoded[['weather', 'temp', 'humidity', 'wind']], data_encoded['play'])

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')