In [1]:
import pandas as pd
import numpy as np
import math

SyntaxError: invalid syntax (__init__.py, line 67)

In [152]:
class Node:
    """One node of an ID3 decision tree over a two-class target column.

    A node owns the slice of the dataset that reaches it (``df``). After
    ``initializeNode()`` it knows its class counts and entropy; after
    ``evaluate()`` an internal node is labelled with the attribute it splits
    on and holds one child per distinct value of that attribute. A leaf node
    is labelled with the class it predicts.
    """

    def __init__(self, dataset, targetCol, positiveLabel='yes'):
        """Create an uninitialized node.

        params: dataset: pandas DataFrame holding the examples for this node
                targetCol: name of the column to predict
                positiveLabel: value of targetCol counted as the positive
                    class; every other non-missing value counts as negative
        """
        self.p = None  # number of positive examples
        self.n = None  # number of negative examples
        self.e = None  # entropy of this node's dataset
        self.total = None  # number of non-missing target values
        self.children = []
        self.label = None  # split attribute (internal node) or predicted class (leaf)
        self.gain = None  # information gain of the chosen split
        self.entropyBefore = None  # entropy before splitting (same value as e)
        self.entropyAfter = None  # weighted average entropy after the chosen split
        self.df = dataset
        self.targetCol = targetCol
        self.positiveLabel = positiveLabel
        self.currentDf = None # this property used for recursively create the tree
        self.isLeafNode = False
        self.maxAttrVal = None  # parent's attribute value that leads to this node

    def initializeNode(self):
        """
            This method assumes that the df and the targetCol properties have been filled in correctly
            It will initialize all the entropy related values of the dataset corresponding to the node

            A node whose entropy is 0 is marked as a leaf and labelled with the
            first non-missing target value (None when there is no such value).
        """
        target = self.df[self.targetCol]
        self.total = target.count()  # count() ignores missing values
        self.p = (target == self.positiveLabel).sum()
        self.n = self.total - self.p
        self.e = self._entropy()
        self.entropyBefore = self.e

        if self.entropyBefore == 0:
            self.isLeafNode = True
            # NOTE(review): debug-style dump of the leaf's dataset. Kept because the
            # notebook's recorded output is produced by it; consider replacing it
            # with a dedicated tree-printing method.
            print(self.df)
            known = target.dropna()
            # An empty (or all-missing) target has no class to predict.
            self.label = known.iloc[0] if len(known) > 0 else None
        self.currentDf = self.df

    def addChild(self, child):
        """Append `child` to this node's children, creating the list if it is None."""
        if self.children is None:
            self.children = [child]
        else:
            self.children.append(child)

    def info(self):
        """Print the node's counts, entropy values, children, label and gain."""
        print(f' p: {self.p}\n n: {self.n}\n e: {self.e}\n children: {self.children}\n label: {self.label}\n gain: {self.gain}\n entropyBefore: {self.entropyBefore}\n entropyAfter: {self.entropyAfter}\n')

    def _entropy(self):
        """Entropy of this node, computed from its own p and n counts."""
        return self.staticEntropy(self.p, self.n)

    def staticEntropy(self, p, n):
        """Static entropy evaluation function

        Returns the binary entropy (in bits) of a set with p positive and n
        negative examples; 0 when either count is 0.
        """
        if p == 0 or n == 0:
            return 0

        total = p + n
        frac1 = p / total
        frac2 = n / total

        return -frac1 * math.log2(frac1) - frac2 * math.log2(frac2)

    def avgInfoEntropy(self, entropies, p, n):
        """Calculate the average info entropy
        params: entropies, the dict of entropies and p, n vals after split on an attribute
                p,n: initial p and n vals before the split
        returns: the entropies of the partitions weighted by partition size,
                 or 0 when p + n is 0
        """
        total = p + n
        if total == 0:
            return 0

        infoEntropy = 0
        for entropyInfo in entropies.values():
            frac = (entropyInfo['p'] + entropyInfo['n']) / total
            infoEntropy += frac * entropyInfo['ent']

        return infoEntropy

    def _markAsMajorityLeaf(self):
        """Turn this node into a leaf predicting its most frequent target value."""
        self.isLeafNode = True
        modes = self.df[self.targetCol].mode()
        # On a tie the first value returned by Series.mode() is used.
        self.label = modes.iloc[0] if len(modes) > 0 else None

    def evaluate(self):
        """This method will split the dataset at this node using the ID3 algorithm

        The attribute with the highest information gain becomes this node's
        label, and one initialized child is created per distinct value of
        that attribute. A pure node (entropy 0) is left untouched. A non-pure
        node with no attribute left to split on becomes a leaf labelled with
        its majority class. Calling the method again rebuilds the children
        instead of duplicating them.
        """
        if self.entropyBefore is None:
            # evaluate() needs p, n and e; compute them if the caller has not yet.
            self.initializeNode()

        if self.entropyBefore == 0:
            return # no need to calculate further

        df = self.df
        target = df[self.targetCol]
        attributes = [col for col in df.columns if col != self.targetCol]

        if not attributes:
            # Conflicting examples: nothing left to split on, so predict the majority.
            self._markAsMajorityLeaf()
            return

        gains = dict()
        avgEntropies = dict()

        for attribute in attributes:
            attributeCol = df[attribute]

            entropies = dict()
            for val in attributeCol.unique():
                subset = target[attributeCol == val]  # targets of the rows where attr = val
                p_splitted = (subset == self.positiveLabel).sum()
                n_splitted = subset.count() - p_splitted
                entropies[val] = {'ent': self.staticEntropy(p_splitted, n_splitted), 'p': p_splitted, 'n': n_splitted}

            avgEntropy = self.avgInfoEntropy(entropies, self.p, self.n)
            gains[attribute] = self.e - avgEntropy
            avgEntropies[attribute] = avgEntropy

        # On equal gains max() keeps the attribute that comes first in column order.
        maxAttr = max(gains, key=gains.get)

        self.label = maxAttr
        self.gain = gains[maxAttr]
        self.entropyAfter = avgEntropies[maxAttr]

        # Start from a clean list so re-evaluating does not append duplicates.
        self.children = []

        for val in df[self.label].unique():
            split_dataset = df[df[self.label] == val].drop(self.label, axis=1) # axis=1 required to drop column

            node = Node(dataset=split_dataset, targetCol=self.targetCol, positiveLabel=self.positiveLabel)
            node.initializeNode()
            node.maxAttrVal = val # the value that split this node: eg Outlook=sunny
            self.addChild(node)

    def areAllChildrenLeaves(self):
        """Return True when every child is a leaf (also True when there are no children)."""
        return all(child.isLeafNode for child in (self.children or []))

    def staticRecursiveEvaluate(self, node):
        """Recursively evaluates the given node and all of its children until all nodes are fathomed"""
        node.evaluate()

        if node.areAllChildrenLeaves():
            return

        for child in node.children:
            if not child.isLeafNode:
                self.staticRecursiveEvaluate(child)

In [153]:
# Load the examples from a CSV file; the path is relative to the current working directory.
df = pd.read_csv('./datasets/dataset.csv')

In [154]:
# Preview the first rows to check the columns that were loaded.
df.head()

Unnamed: 0,outlook,temp,humidity,wind,play
0,sunny,hot,high,weak,no
1,sunny,hot,high,strong,no
2,overcast,hot,high,weak,yes
3,rain,mild,high,weak,yes
4,rain,cool,normal,weak,yes


In [155]:
# Root of the decision tree: it holds the whole dataset, with 'play' as the column to predict.
root = Node(dataset=df, targetCol='play')

In [156]:
# Compute the root's class counts (p, n) and entropy from its dataset.
root.initializeNode()

In [157]:
# Print the root's statistics; no split has been made yet, so children, label and gain are still unset.
root.info()

 p: 9
 n: 5
 e: 0.9402859586706311
 children: []
 label: None
 gain: None
 entropyBefore: 0.9402859586706311
 entropyAfter: None



In [158]:
# Grow the tree: split the root, then recursively split every child that is not yet a leaf.
root.staticRecursiveEvaluate(node=root)

    temp humidity    wind play
2    hot     high    weak  yes
6   cool   normal  strong  yes
11  mild     high  strong  yes
12   hot   normal    weak  yes
   temp    wind play
0   hot    weak   no
1   hot  strong   no
7  mild    weak   no
    temp    wind play
8   cool    weak  yes
10  mild  strong  yes
   temp humidity play
3  mild     high  yes
4  cool   normal  yes
9  mild   normal  yes
    temp humidity play
5   cool   normal   no
13  mild     high   no
