In [14]:
import pandas as pd
import math
import numpy as np

In [15]:
# Load the examples into a DataFrame; the path is relative to the notebook's working directory.
df = pd.read_csv('./datasets/dataset.csv')

In [16]:
# Preview the first rows to check the columns that were read.
df.head()

Unnamed: 0,outlook,temp,humidity,wind,play
0,sunny,hot,high,weak,no
1,sunny,hot,high,strong,no
2,overcast,hot,high,weak,yes
3,rain,mild,high,weak,yes
4,rain,cool,normal,weak,yes


In [17]:
# total number of examples, counted on the class column 'play'
total = df.play.count()

In [18]:
# display the total number of examples
total

14

In [19]:
# positive examples: the rows whose 'play' label is 'yes'
p = df.loc[df['play'] == 'yes', 'play'].count()

In [20]:
# the number of negative examples: every counted example that is not a positive one
n = total - p

In [21]:
# display the number of negative examples
n

5

In [22]:
def entropy(p, n):
    """Entropy (in bits) of a set holding p positive and n negative examples.

    params: p, n: the number of positive and negative examples in the set
    returns: 0 when either count is 0 (the set is pure), otherwise
             -(p/(p+n))*log2(p/(p+n)) - (n/(p+n))*log2(n/(p+n))
    """
    if p != 0 and n != 0:
        pos_share = p / (p + n)
        neg_share = n / (p + n)
        pos_term = pos_share * math.log2(pos_share)
        neg_term = neg_share * math.log2(neg_share)
        return -pos_term - neg_term
    # a pure set carries no uncertainty; this also keeps log2(0) from being evaluated
    return 0

In [23]:
# entropy of the whole dataset, i.e. before splitting on any attribute
e = entropy(p,n)
e

0.9402859586706311

In [24]:
# NOTE: this cell calls avgInfoEntropy and Node, which this notebook defines in
# later cells; on a fresh kernel those definitions have to be run first.

def attributeGains(data, target='play', positive='yes'):
    """Compute the information gain of splitting `data` on each attribute.

    Everything is derived from `data` itself, so the same routine works for the
    full dataset and for any subset produced by an earlier split.

    params: data, the DataFrame of examples to evaluate
            target: name of the class column, skipped as a split candidate
            positive: value of the class column that marks a positive example
    returns: (gains, avgEntropies), two dicts keyed by attribute name holding
             the information gain and the average entropy after the split
    """
    # class counts and entropy of `data` before any split
    total_data = data[target].count()
    p_data = data[data[target] == positive][target].count()
    n_data = total_data - p_data
    entropy_before = entropy(p_data, n_data)

    gains = dict()
    avgEntropies = dict()

    for attribute in data:
        if attribute == target:
            continue

        entropies = dict()
        for val in data[attribute].unique():  # every value the attribute takes
            # the examples for which attribute == val
            splitted = data[data[attribute] == val]
            total_splitted = splitted[target].count()
            p_splitted = splitted[splitted[target] == positive][target].count()
            n_splitted = total_splitted - p_splitted
            entropies[val] = {'ent': entropy(p_splitted, n_splitted), 'p': p_splitted, 'n': n_splitted}

        avgEntropy = avgInfoEntropy(entropies, p_data, n_data)
        gains[attribute] = entropy_before - avgEntropy
        avgEntropies[attribute] = avgEntropy

    return gains, avgEntropies


gains, avgEntropies = attributeGains(df)

print('Information gain:')
print(gains)

# the attribute with the largest information gain becomes the root of the tree
maxAttr = max(gains, key=gains.get)
print(f'Selected: \n {maxAttr}')

root = Node()
root.label = maxAttr
root.gain = gains[maxAttr]
root.entropyBefore = e
root.entropyAfter = avgEntropies[maxAttr]

root.info()

Information gain:
{'outlook': 0.24674981977443933, 'temp': 0.02922256565895487, 'humidity': 0.15183550136234159, 'wind': 0.04812703040826949}
Selected: 
 outlook
 p: None
 n: None
 e: None
 children: None
 label: outlook
 gain: 0.24674981977443933
 entropyBefore: 0.9402859586706311
 entropyAfter: 0.6935361388961918



In [25]:
def avgInfoEntropy(entropies, p, n):
    """Average the subset entropies obtained by splitting on one attribute.

    Each subset's entropy is weighted by the fraction of the pre-split examples
    that fall into that subset.

    params: entropies, a dict mapping each attribute value to a dict with the
            keys 'ent' (entropy of the subset), 'p' and 'n' (its positive and
            negative counts)
            p, n: the positive and negative counts before the split
    returns: the weighted sum of the subset entropies (0 for an empty dict)
    """
    weighted_total = 0
    for subset_stats in entropies.values():
        subset_entropy = subset_stats['ent']
        subset_size = subset_stats['p'] + subset_stats['n']
        # weight = share of the pre-split examples that ended up in this subset
        weighted_total += subset_size / (p + n) * subset_entropy

    return weighted_total

In [26]:
class Node:
    """A node of the decision tree built in this notebook.

    Every attribute starts as None and is filled in by the code that builds
    the tree.
    """

    def __init__(self):
        # p, n, e: presumably the positive count, negative count and entropy of
        # the examples reaching this node -- TODO confirm, no visible cell sets them
        self.p = None
        self.n = None
        self.e = None
        # child nodes; stays None (not an empty list) until the first addChild call
        self.children = None
        # name of the attribute this node splits on
        self.label = None
        # information gain of that split
        self.gain = None
        # entropy before the split, and average entropy after it
        self.entropyBefore = None
        self.entropyAfter = None

    def addChild(self, child):
        """Append `child` to this node's children, creating the list on first use."""
        # identity test: `== None` would dispatch to the __eq__ of whatever is stored
        if self.children is None:
            self.children = [child]
        else:
            self.children.append(child)

    def info(self):
        """Print every attribute as a ' name: value' line, followed by a blank line."""
        fields = ('p', 'n', 'e', 'children', 'label', 'gain', 'entropyBefore', 'entropyAfter')
        print(''.join(f' {name}: {getattr(self, name)}\n' for name in fields))


In [32]:
# Split the dataset at the root: one subset per distinct value of the root's attribute.
maxAttrVals = df[root.label].unique()

datasets = {}

for val in maxAttrVals:
    # keep the rows where the root attribute equals val, then remove that
    # column, which is constant within the subset
    split_dataset = df.loc[df[root.label] == val].drop(columns=[root.label])
    datasets[val] = split_dataset
    print(split_dataset)
    # TODO: repeat all the steps above on each subset -- needs a refactor first
    

    temp humidity    wind play
0    hot     high    weak   no
1    hot     high  strong   no
7   mild     high    weak   no
8   cool   normal    weak  yes
10  mild   normal  strong  yes
    temp humidity    wind play
2    hot     high    weak  yes
6   cool   normal  strong  yes
11  mild     high  strong  yes
12   hot   normal    weak  yes
    temp humidity    wind play
3   mild     high    weak  yes
4   cool   normal    weak  yes
5   cool   normal  strong   no
9   mild   normal    weak  yes
13  mild     high  strong   no
