In [1]:
import pprint

import numpy as np
import pandas as pd
from numpy import log2 as log
# Machine epsilon for double precision; added to denominators and log
# arguments further down so that 0/0 and log2(0) never occur.
eps = np.finfo(np.float64).eps

In [2]:
# Toy training table: one tuple per observation, in column order
# (Taste, Temperature, Texture, Eat). `Eat` is the target.
food_columns = ['Taste', 'Temperature', 'Texture', 'Eat']
food_rows = [
    ('Salty', 'Hot',  'Soft', 'No'),
    ('Spicy', 'Hot',  'Soft', 'No'),
    ('Spicy', 'Hot',  'Hard', 'Yes'),
    ('Spicy', 'Cold', 'Hard', 'No'),
    ('Spicy', 'Hot',  'Hard', 'Yes'),
    ('Sweet', 'Cold', 'Soft', 'Yes'),
    ('Salty', 'Cold', 'Soft', 'No'),
    ('Sweet', 'Hot',  'Soft', 'Yes'),
    ('Spicy', 'Cold', 'Soft', 'Yes'),
    ('Salty', 'Hot',  'Hard', 'Yes'),
]
# Column-oriented view of the same rows, keyed by column name
data = {name: list(column) for name, column in zip(food_columns, zip(*food_rows))}
df = pd.DataFrame(data, columns=food_columns)
df

Unnamed: 0,Taste,Temperature,Texture,Eat
0,Salty,Hot,Soft,No
1,Spicy,Hot,Soft,No
2,Spicy,Hot,Hard,Yes
3,Spicy,Cold,Hard,No
4,Spicy,Hot,Hard,Yes
5,Sweet,Cold,Soft,Yes
6,Salty,Cold,Soft,No
7,Sweet,Hot,Soft,Yes
8,Spicy,Cold,Soft,Yes
9,Salty,Hot,Hard,Yes


In [3]:
# Entropy (in bits) of the target column `Eat` over the whole table.
target_vals = df.Eat.unique()
values = df.Eat.unique()
# Count each label once instead of recounting on every iteration
eat_counts = df.Eat.value_counts()
info = 0.0
for value in values:
    prob = eat_counts[value] / len(df)
    info += -prob * np.log2(prob)
info

0.9709505944546686

In [4]:
# Candidate attributes (every column except the target `Eat`)
cols = ['Taste','Temperature','Texture']
# Show the distinct values each candidate attribute takes
for each_col in cols:
    classes = df[each_col].unique()
    print(classes)

['Salty' 'Spicy' 'Sweet']
['Hot' 'Cold']
['Soft' 'Hard']


In [5]:
# First level: conditional entropy and information gain of every candidate
# attribute, measured on the full table against the target `Eat`.
entropy_attributes = {}
gain_each = {}
for each_col in cols:
    entropy_att = 0
    classes = df[each_col].unique()

    for each_class in classes:
        # Rows that carry this attribute value
        in_class = df[each_col] == each_class
        denominator = len(df[each_col][in_class])
        entropy_each_class = 0

        for target_val in target_vals:
            # Rows with this attribute value AND this target label
            numerator = len(df[each_col][in_class & (df.Eat == target_val)])
            # eps keeps the division and the log defined when a count is zero
            prob = numerator / (denominator + eps)
            entropy_each_class += abs(-prob * np.log2(prob + eps))

        # Weight the per-value entropy by the share of rows with that value
        prob_outer = denominator / len(df)
        entropy_att += abs(-prob_outer * entropy_each_class)

    entropy_attributes[each_col] = entropy_att
    gain_each[each_col] = info - entropy_att
print(entropy_attributes)
print(gain_each)

{'Taste': 0.7609640474436808, 'Temperature': 0.950977500432693, 'Texture': 0.9245112497836524}
{'Taste': 0.20998654701098773, 'Temperature': 0.019973094021975557, 'Texture': 0.04643934467101618}


In [6]:
def find_entropy(df):
    """Return the entropy, in bits, of the last column of ``df``.

    The last column is treated as the class label. Each distinct label
    contributes ``-p * log2(p)``, where ``p`` is its share of the rows;
    labels are visited in order of first appearance.
    """
    Class = df.keys()[-1]
    labels = df[Class]
    label_counts = labels.value_counts()
    total = len(labels)
    fractions = (label_counts[label] / total for label in labels.unique())
    return sum(-fraction * np.log2(fraction) for fraction in fractions)

In [7]:
def find_entropy_attribute(df, attribute):
    """Return the conditional entropy (bits) of the class given ``attribute``.

    The last column of ``df`` is treated as the class label. For every
    distinct value of ``attribute`` the entropy of the class labels among
    the matching rows is computed and weighted by the share of rows that
    carry that value; the weighted sum is returned.

    Zero-probability labels are skipped (``0 * log2(0)`` is taken as 0)
    rather than padded with machine epsilon, so a perfectly pure split
    yields exactly ``0.0`` and the function needs no module-level helpers.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose last column holds the class label.
    attribute : hashable
        Name of the column to condition on.

    Returns
    -------
    float
        Weighted entropy, ``>= 0``. ``0.0`` for an empty table.
    """
    Class = df.keys()[-1]
    total = len(df)
    if total == 0:
        return 0.0

    weighted_entropy = 0.0
    # sort=False: group order does not matter for the sum, so skip the sort
    for _, group_labels in df.groupby(attribute, sort=False)[Class]:
        group_size = len(group_labels)
        probs = group_labels.value_counts().to_numpy() / group_size
        # Drop empty categories so log2 is never evaluated at 0
        probs = probs[probs > 0]
        group_entropy = -np.sum(probs * np.log2(probs))
        weighted_entropy += (group_size / total) * group_entropy
    return float(weighted_entropy)

In [8]:
def find_winner(df):
    """Return the name of the attribute with the highest information gain.

    Every column except the last (the class label) is a candidate. The
    gain of a candidate is the entropy of the class column minus the
    conditional entropy given that candidate. On ties the first candidate
    in column order wins (``np.argmax`` returns the first maximum).
    """
    attributes = df.keys()[:-1]
    # The class entropy does not depend on the candidate: compute it once
    base_entropy = find_entropy(df)
    IG = [base_entropy - find_entropy_attribute(df, key) for key in attributes]
    return attributes[np.argmax(IG)]

In [9]:
def get_subtable(df, node, value):
    """Return the rows of ``df`` where column ``node`` equals ``value``.

    The result keeps every column and is re-indexed from 0.
    """
    matches = df[node] == value
    selected = df.loc[matches]
    return selected.reset_index(drop=True)

In [10]:
def buildTree(df, tree=None):
    """Recursively build an ID3-style decision tree as nested dicts.

    The last column of ``df`` is the class label; every other column is a
    candidate attribute. The attribute chosen by ``find_winner`` becomes
    the node, and each of its values becomes a branch holding either a
    class label (leaf) or a nested tree of the same shape.

    Parameters
    ----------
    df : pandas.DataFrame
        Training rows; must contain at least one attribute column.
    tree : dict, optional
        Existing dict to add the node to. A new dict is created when
        omitted.

    Returns
    -------
    dict
        ``{attribute: {value: label_or_subtree, ...}}``.
    """
    Class = df.keys()[-1]
    # Attribute with the maximum information gain
    node = find_winner(df)
    if tree is None:
        tree = {}
    # setdefault also covers a caller-supplied dict that lacks this node
    branches = tree.setdefault(node, {})

    for value in np.unique(df[node]):
        # Drop the attribute just used: it is constant in the subtable, and
        # removing it guarantees every recursive call has fewer columns, so
        # the recursion always terminates.
        subtable = get_subtable(df, node, value).drop(columns=[node])
        clValue, counts = np.unique(subtable[Class], return_counts=True)

        if len(counts) == 1:
            # Pure subset: the single remaining label is the leaf
            branches[value] = clValue[0]
        elif subtable.shape[1] == 1:
            # No attributes left but labels still conflict: majority vote
            branches[value] = clValue[np.argmax(counts)]
        else:
            branches[value] = buildTree(subtable)

    return tree

In [11]:
# Build the decision tree from the full training table and pretty-print it
tree = buildTree(df)
pprint.pprint(tree)

{'Taste': {'Salty': {'Texture': {'Hard': 'Yes', 'Soft': 'No'}},
           'Spicy': {'Temperature': {'Cold': {'Texture': {'Hard': 'No',
                                                          'Soft': 'Yes'}},
                                     'Hot': {'Texture': {'Hard': 'Yes',
                                                         'Soft': 'No'}}}},
           'Sweet': 'Yes'}}


In [12]:
# Prediction function
def predict(inst, tree):
    """Classify one instance by walking a nested-dict decision tree.

    ``tree`` has the shape produced by ``buildTree``:
    ``{attribute: {value: label_or_subtree}}``. Starting at the root, the
    instance's value for the node attribute selects the branch to follow
    until a non-dict leaf is reached.

    Parameters
    ----------
    inst : mapping or pandas.Series
        Attribute name -> value for the instance to classify.
    tree : dict or leaf label
        Decision tree. A bare leaf label is returned unchanged.

    Returns
    -------
    The class label stored in the leaf that was reached.

    Raises
    ------
    KeyError
        If ``inst`` lacks a node attribute, or its value has no branch.
    ValueError
        If an empty dict is encountered where a node is expected.
    """
    while isinstance(tree, dict):
        if not tree:
            raise ValueError("cannot predict from an empty tree")
        # Each node dict is keyed by the attribute it splits on
        node = next(iter(tree))
        tree = tree[node][inst[node]]
    return tree

In [13]:
# One unseen instance to classify: attribute name -> value
test_data = pd.Series(
    ['Salty', 'Cold', 'Hard'],
    index=['Taste', 'Temperature', 'Texture'],
)
test_data

Taste          Salty
Temperature     Cold
Texture         Hard
dtype: object

In [14]:
# Classify the test instance with the tree built above
pred = predict(inst=test_data, tree=tree)
pred

'Yes'