In [71]:
import math
import numpy as np

# Shannon entropy (in bits) of a fair coin: heads and tails are equally likely.
p_heads, p_tails = 0.5, 0.5
e_coin_1 = -(p_heads * math.log(p_heads, 2) + p_tails * math.log(p_tails, 2))
print(e_coin_1)

1.0


In [72]:
# Entropy (in bits) of a biased coin with a 90% / 10% split.
p_likely, p_unlikely = 0.9, 0.1
e_coin_2 = -(p_likely * math.log(p_likely, 2) + p_unlikely * math.log(p_unlikely, 2))
print(e_coin_2)

0.4689955935892812


In [73]:
def entropy_bin(prob_a, prob_b):
    """Return the Shannon entropy, in bits, of a two-outcome distribution.

    Args:
        prob_a: Probability of the first outcome.
        prob_b: Probability of the second outcome.

    Returns:
        -(prob_a * log2(prob_a) + prob_b * log2(prob_b)), where a term with
        probability 0 contributes 0 (the limit of p * log2(p) as p -> 0).

    Raises:
        ValueError: If a probability is negative (raised by math.log).
    """
    total = 0.0
    for prob in (prob_a, prob_b):
        # math.log(0, 2) raises ValueError, so a certain/impossible outcome
        # must be skipped explicitly; negative values still reach math.log
        # and fail loudly.
        if prob != 0:
            total += prob * math.log(prob, 2)
    return -total

In [74]:
# Same biased coin as above, this time through the reusable helper.
entropy_bin(prob_a=0.9, prob_b=0.1)

0.4689955935892812

In [75]:
def entropy(p):
    """Return the Shannon entropy, in bits, of a discrete distribution.

    Args:
        p: Iterable of probabilities (one entry per outcome).

    Returns:
        -sum(p_i * log2(p_i)) as a NumPy float. Entries equal to 0 contribute
        0 (the limit of p * log2(p) as p -> 0) instead of producing NaN.
    """
    probs = np.array(list(p), dtype=float)
    # np.log2(0) is -inf and 0 * -inf is NaN, so drop zero-probability
    # outcomes before taking the logarithm.
    nonzero = probs[probs != 0]
    H = -(nonzero * np.log2(nonzero)).sum()
    return H

In [76]:
# A fair coin written as a probability vector.
p = [0.5, 0.5]
entropy(p)

1.0

In [77]:
# Two-class distribution with probabilities 5/14 and 9/14.
tennis = [5 / 14, 9 / 14]
entropy(tennis)

0.9402859586706311

In [78]:
import pandas as pd

# The file is tab-separated and has no header row, so column names are supplied here.
tennis_columns = ['outlook', 'temperature', 'humidity', 'wind', 'play']
data = pd.read_csv(
    "../datasets/tennis.txt",
    sep="\t",
    header=None,
    names=tennis_columns,
)
print(data)

     outlook temperature humidity    wind play
1      Sunny         Hot     High    Weak   No
2      Sunny         Hot     High  Strong   No
3   Overcast         Hot     High    Weak  Yes
4       Rain        Mild     High    Weak  Yes
5       Rain        Cool   Normal    Weak  Yes
6       Rain        Cool   Normal  Strong   No
7   Overcast        Cool   Normal  Strong  Yes
8      Sunny        Mild     High    Weak   No
9      Sunny        Cool   Normal    Weak  Yes
10      Rain        Mild   Normal    Weak  Yes
11     Sunny        Mild   Normal  Strong  Yes
12  Overcast        Mild     High  Strong  Yes
13  Overcast         Hot   Normal    Weak  Yes
14      Rain        Mild     High  Strong   No


In [79]:
# How often each decision occurs over the whole data set.
data.loc[:, 'play'].value_counts()

Yes    9
No     5
Name: play, dtype: int64

In [80]:
# Decision counts restricted to the rows where the wind is weak.
data.loc[data['wind'] == 'Weak', 'play'].value_counts()

Yes    6
No     2
Name: play, dtype: int64

In [81]:
# Entropy of the decision given weak wind (6 "Yes" and 2 "No" out of 8 rows).
entropy([6 / 8, 2 / 8])

0.8112781244591328

In [82]:
# Entropy of the decision over all rows (9 "Yes" and 5 "No" out of 14).
entropy([9 / 14, 5 / 14])

0.9402859586706311

In [83]:
# Decision counts restricted to the rows where the wind is strong.
data.loc[data['wind'] == 'Strong', 'play'].value_counts()

No     3
Yes    3
Name: play, dtype: int64

In [84]:
# Number of rows per wind value; these counts give the split weights used below.
data.loc[:, 'wind'].value_counts()

Weak      8
Strong    6
Name: wind, dtype: int64

In [85]:
# Information gain of splitting on wind:
#   H(play) - P(wind=Weak) * H(play | Weak) - P(wind=Strong) * H(play | Strong)
h_play = entropy([9 / 14, 5 / 14])
h_play_weak = entropy([6 / 8, 2 / 8])
h_play_strong = 1  # 3 "Yes" vs 3 "No" is an even split, i.e. exactly one bit

h_play - (8 / 14) * h_play_weak - (6 / 14) * h_play_strong

0.048127030408269544

In [99]:
# The root of the decision tree will be the column where the most information is gained

def _label_entropy(labels):
    """Return the entropy, in bits, of the empirical distribution of a Series."""
    probs = labels.value_counts(normalize=True).to_numpy()
    # Guard against zero frequencies (e.g. unused categories): 0 * log2(0) -> 0.
    probs = probs[probs > 0]
    return -(probs * np.log2(probs)).sum()


def info_gain(df, feature, decision):
    """Return the information gain obtained by splitting `df` on `feature`.

    The gain is the entropy of the `decision` column minus the weighted
    entropy of `decision` within each distinct value of `feature`:

        H(decision) - sum_v P(feature = v) * H(decision | feature = v)

    Args:
        df: pandas DataFrame holding both columns.
        feature: Name of the column to split on.
        decision: Name of the target column; any number of classes is supported.

    Returns:
        The information gain in bits. Rows whose `feature` value is missing
        are ignored; an input without usable rows yields 0.0.
    """
    # Use the same set of rows for the overall entropy and for the split
    # weights, so that the probabilities are consistent with each other.
    rows = df[df[feature].notna()]
    total = len(rows)
    if total == 0:
        return 0.0

    gain = _label_entropy(rows[decision])
    for value, count in rows[feature].value_counts().items():
        subset_labels = rows.loc[rows[feature] == value, decision]
        gain -= (count / total) * _label_entropy(subset_labels)
    return gain

In [100]:
# The index of a value_counts() result holds the distinct values themselves.
wind = data.loc[:, 'wind'].value_counts()
wind.index[0]

'Weak'

In [101]:
# Should reproduce the value computed by hand for the wind column.
info_gain(df=data, feature='wind', decision='play')

0.048127030408269544

In [102]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import preprocessing
from sklearn.tree import export_graphviz
import pydotplus

# Encode every column as integer codes so scikit-learn can consume them.
# The same LabelEncoder instance is re-fitted on each column by `apply`.
data_encoded = data.apply(preprocessing.LabelEncoder().fit_transform)
print(data_encoded)

# Single source of truth for the feature order: the columns used for fitting
# and the labels shown in the exported graph must line up one-to-one.
feature_columns = ['outlook', 'temperature', 'humidity', 'wind']
feature_labels = ['Outlook', 'Temp.', 'Humidity', 'Wind']

# random_state pins how ties between equally good splits are broken, so the
# fitted tree (and the exported image) is the same on every run.
clf = DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=0)
clf.fit(data_encoded[feature_columns], data_encoded['play'])

dot_data = export_graphviz(clf, out_file=None, feature_names=feature_labels)

# Draw graph
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_png('tennis_tree.png')

    outlook  temperature  humidity  wind  play
1         2            1         0     1     0
2         2            1         0     0     0
3         0            1         0     1     1
4         1            2         0     1     1
5         1            0         1     1     1
6         1            0         1     0     0
7         0            0         1     0     1
8         2            2         0     1     0
9         2            0         1     1     1
10        1            2         1     1     1
11        2            2         1     0     1
12        0            2         0     0     1
13        0            1         1     1     1
14        1            2         0     0     0


True