In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
eps = np.finfo(float).eps
from numpy import log2 as log

In [2]:
# Load the training examples from a CSV file addressed by a relative path,
# so the file must be reachable from the notebook's working directory.
data = pd.read_csv("DecisionTree.csv")
# Bare last expression: the notebook renders the loaded frame as the cell output.
data

Unnamed: 0,Outlook,Temperature,Humidity,Wind,Play
0,Sunny,Hot,High,Weak,No
1,Sunny,Hot,High,Strong,No
2,Overcast,Hot,High,Weak,Yes
3,Rain,Mild,High,Weak,Yes
4,Rain,Cool,Normal,Weak,Yes
5,Rain,Cool,Normal,Strong,No
6,Overcast,Cool,Normal,Strong,Yes
7,Sunny,Mild,High,Weak,No
8,Sunny,Cool,Normal,Weak,Yes
9,Rain,Mild,Normal,Weak,Yes


In [3]:
def find_entropy(df):
    """Return the Shannon entropy, in bits, of the class column of ``df``.

    The class column is taken to be the LAST column of ``df``, so the
    function works for any target-variable name.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose last column holds the class labels.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the distinct class labels, where ``p`` is
        the share of rows carrying that label.  Returns the integer ``0`` when
        ``df`` has no rows (nothing to sum).
    """
    Class = df.keys()[-1]   # last column is the target variable
    labels = df[Class]
    # Count every label once up front instead of re-counting inside the loop.
    counts = labels.value_counts()
    total = len(labels)

    entropy = 0
    # Iterate in order of first appearance so the summation order is stable.
    for value in labels.unique():
        fraction = counts[value] / total
        entropy += -fraction * np.log2(fraction)
    return entropy


In [4]:
def find_entropy_attribute(df, attribute):
    """Return the weighted class entropy of ``df`` after splitting on ``attribute``.

    For every distinct value of ``attribute`` the entropy (in bits) of the
    class column restricted to the matching rows is computed, and the results
    are averaged with weights equal to each value's share of the rows.  The
    class column is taken to be the LAST column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose last column holds the class labels.
    attribute : hashable
        Name of the column to split on.

    Returns
    -------
    float
        Non-negative weighted entropy.  Returns the integer ``0`` when ``df``
        has no rows.
    """
    Class = df.keys()[-1]
    # Machine epsilon keeps 0/0 and log2(0) finite for empty label/value pairs.
    tiny = np.finfo(float).eps
    target_variables = df[Class].unique()
    variables = df[attribute].unique()

    weighted_entropy = 0
    for variable in variables:
        # One full-length mask per branch; combining masks with `&` avoids
        # applying a full-length boolean Series to an already filtered Series.
        in_branch = df[attribute] == variable
        den = int(in_branch.sum())

        entropy = 0
        for target_variable in target_variables:
            num = int((in_branch & (df[Class] == target_variable)).sum())
            fraction = num / (den + tiny)
            entropy += -fraction * np.log2(fraction + tiny)

        # Weight the branch entropy by the share of rows that fall into it.
        weighted_entropy += (den / len(df)) * entropy

    # The epsilon terms can push a pure branch a hair below zero; report magnitude.
    return abs(weighted_entropy)

In [5]:
def find_winner(df):
    """Return the name of the feature column with the highest information gain.

    Every column except the last is treated as a candidate feature; the last
    column is the class label.  Information gain is the entropy of the class
    column minus the weighted entropy after splitting on the feature.  On a
    tie the first such column (in column order) wins, because ``np.argmax``
    returns the first maximum.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose last column holds the class labels.

    Returns
    -------
    Column label of the winning feature.

    Raises
    ------
    ValueError
        If ``df`` has no feature columns to choose from.
    """
    features = df.keys()[:-1]
    if len(features) == 0:
        raise ValueError("find_winner needs at least one feature column besides the class column")

    # The class entropy does not depend on the candidate, so compute it once.
    base_entropy = find_entropy(df)
    IG = [base_entropy - find_entropy_attribute(df, key) for key in features]
    return features[np.argmax(IG)]

In [6]:
def get_subtable(df, node, value):
    """Return the rows of ``df`` whose ``node`` column equals ``value``.

    The result is a new frame with a fresh 0..n-1 index; ``df`` itself is
    left untouched.
    """
    matches = df[node] == value
    selected = df.loc[matches]
    return selected.reset_index(drop=True)

In [7]:
def buildTree(df, tree=None, verbose=True):
    """Recursively build an ID3-style decision tree as nested dictionaries.

    The last column of ``df`` is the class label; every other column is a
    candidate feature.  The result has the shape
    ``{feature: {feature_value: label_or_subtree, ...}}``.

    Parameters
    ----------
    df : pandas.DataFrame
        Training rows; last column is the class label.
    tree : dict, optional
        Existing dictionary to add the split to.  A new one is created when
        omitted (recursive calls always start from a new dictionary).
    verbose : bool, default True
        When true, print every subtable examined, as a trace of the splits.

    Returns
    -------
    dict
        The (possibly newly created) tree dictionary.
    """
    Class = df.keys()[-1]

    # Get attribute with maximum information gain
    node = find_winner(df)
    attValue = np.unique(df[node])

    # Create the dictionary on first use and make sure the node has a branch
    # table even when the caller supplied their own dictionary.
    if tree is None:
        tree = {}
    tree.setdefault(node, {})

    for value in attValue:
        subtable = get_subtable(df, node, value)
        if verbose:
            print(subtable)
        # Use the detected class column rather than a hard-coded name so the
        # function works for any data set.
        clValue, counts = np.unique(subtable[Class], return_counts=True)

        if len(counts) == 1:  # pure subset -> leaf
            tree[node][value] = clValue[0]
        elif len(subtable) == len(df):
            # The best split did not separate any rows (identical features,
            # conflicting labels).  Recursing would receive the same table
            # forever, so settle for the majority label.
            tree[node][value] = clValue[np.argmax(counts)]
        else:
            tree[node][value] = buildTree(subtable, verbose=verbose)  # recurse on impure subset

    return tree
  

In [8]:
# Build the decision tree from the full data set; buildTree also prints each
# subtable it examines, which is the output shown below this cell.
tree = buildTree(data)

    Outlook Temperature Humidity    Wind Play
0  Overcast         Hot     High    Weak  Yes
1  Overcast        Cool   Normal  Strong  Yes
2  Overcast        Mild     High  Strong  Yes
3  Overcast         Hot   Normal    Weak  Yes
  Outlook Temperature Humidity    Wind Play
0    Rain        Mild     High    Weak  Yes
1    Rain        Cool   Normal    Weak  Yes
2    Rain        Cool   Normal  Strong   No
3    Rain        Mild   Normal    Weak  Yes
4    Rain        Mild     High  Strong   No
  Outlook Temperature Humidity    Wind Play
0    Rain        Cool   Normal  Strong   No
1    Rain        Mild     High  Strong   No
  Outlook Temperature Humidity  Wind Play
0    Rain        Mild     High  Weak  Yes
1    Rain        Cool   Normal  Weak  Yes
2    Rain        Mild   Normal  Weak  Yes
  Outlook Temperature Humidity    Wind Play
0   Sunny         Hot     High    Weak   No
1   Sunny         Hot     High  Strong   No
2   Sunny        Mild     High    Weak   No
3   Sunny        Cool   Normal

In [9]:
# Pretty-print the nested tree dictionary built above.
import pprint
pprint.pprint(tree)

{'Outlook': {'Overcast': 'Yes',
             'Rain': {'Wind': {'Strong': 'No', 'Weak': 'Yes'}},
             'Sunny': {'Humidity': {'High': 'No', 'Normal': 'Yes'}}}}


In [10]:
def predict(test, tree, default=None, verbose=True):
    """Classify one example by walking a nested-dict decision tree.

    Parameters
    ----------
    test : mapping
        The example to classify; must support ``test[attribute]`` for every
        attribute the walk visits (a missing key raises ``KeyError``).
    tree : dict
        Tree of the form ``{attribute: {value: label_or_subtree, ...}}``.
    default : object, optional
        Returned when the example carries a value that has no branch at some
        level of the tree.
    verbose : bool, default True
        When true, print the attribute tested at each level together with the
        available branches and the example's value, as a trace of the walk.

    Returns
    -------
    The label stored at the leaf that was reached, or ``default``.
    """
    attribute = next(iter(tree))   # each (sub)tree has a single root attribute
    if verbose:
        print(attribute)

    branches = tree[attribute]
    observed = test[attribute]
    if observed not in branches:
        return default

    if verbose:
        print(branches.keys())
        print(observed)

    result = branches[observed]
    if isinstance(result, dict):
        # Pass `default` and `verbose` down so an unseen value deeper in the
        # tree still yields the caller's default instead of None.
        return predict(test, result, default, verbose)
    return result


In [11]:

# A single example to classify: one value for each of the four feature names.
test = {'Outlook':'Sunny', 'Temperature':'Mild', 'Humidity':'Normal', 'Wind':'Strong'}
# Walk the tree for this example; `default` is left at None.
ans = predict(test, tree)
print(ans)

Outlook
dict_keys(['Overcast', 'Rain', 'Sunny'])
Sunny
Humidity
dict_keys(['High', 'Normal'])
Normal
Yes
