In [55]:
import numpy as np
import pandas as pd
from numpy import log2 as log

In [56]:
# Machine epsilon for float64: the gap between 1.0 and the next representable
# float. Kept as a tiny additive constant for guarding divisions and log2 calls
# against zero.
eps = np.finfo(float).eps

In [57]:
# Toy training data keyed by column name: 14 records, each with a numeric 'Id',
# four categorical attributes and a Yes/No 'Buys' label.
dataset = {
'Id':[1,2,3,4,5,6,7,8,9,10,11,12,13,14],
'Age':['<21','<21','21-35','>35','>35','>35','21-35','<21','<21','>35','<21','21-35','21-35','>35'],
'Income':['High','High','High','Medium','Low','Low','Low','Medium','Low','Medium','Medium','Medium','High','Medium'],
'Gender':['Male','Male','Male','Male','Female','Female','Female','Male','Female','Female','Female','Male','Female','Male'],
'MaritalStatus':['Single','Married','Single','Single','Single','Married','Married','Single','Married','Single','Married','Married','Single','Married'],
'Buys':['No','No','Yes','Yes','Yes','No','Yes','No','Yes','Yes','Yes','Yes','Yes','No']}

In [58]:
# Fix the column order explicitly so that the class label ('Buys') is the last
# column; the entropy helpers read the label from df.keys()[-1].
df = pd.DataFrame(dataset,columns=['Id','Age','Income','Gender','MaritalStatus','Buys'])

In [37]:
from sklearn.preprocessing import LabelEncoder

# Demonstration cell: integer-encode every categorical attribute and print the
# (original, encoded) pairs per column. `x` is not referenced by the later cells
# shown in this notebook section.
#
# The attribute columns are derived from `df` (dropping 'Id' if it is still
# present, then the trailing class column). The previous version read a frame
# named `data` that is not defined before this cell, so a fresh
# Restart & Run All failed with NameError.
lb = LabelEncoder()
features = df.drop(columns='Id', errors='ignore').iloc[:, :-1]
# LabelEncoder is re-fit on each column in turn; after the apply, `lb` holds the
# classes of the last column only.
x = features.apply(lb.fit_transform)
print("age encodd value :",list(zip(features.iloc[:,0], x.iloc[:,0])))
print("\nIncome encodd value :",list(zip(features.iloc[:,1], x.iloc[:,1])))
print("\nGender encodd value :",list(zip(features.iloc[:,2], x.iloc[:,2])))
print("\nmarital status encodd value :",list(zip(features.iloc[:,3], x.iloc[:,3])))

age encodd value : [('<21', 1), ('<21', 1), ('21-35', 0), ('>35', 2), ('>35', 2), ('>35', 2), ('21-35', 0), ('<21', 1), ('<21', 1), ('>35', 2), ('<21', 1), ('21-35', 0), ('21-35', 0), ('>35', 2)]

Income encodd value : [('High', 0), ('High', 0), ('High', 0), ('Medium', 2), ('Low', 1), ('Low', 1), ('Low', 1), ('Medium', 2), ('Low', 1), ('Medium', 2), ('Medium', 2), ('Medium', 2), ('High', 0), ('Medium', 2)]

Gender encodd value : [('Male', 1), ('Male', 1), ('Male', 1), ('Male', 1), ('Female', 0), ('Female', 0), ('Female', 0), ('Male', 1), ('Female', 0), ('Female', 0), ('Female', 0), ('Male', 1), ('Female', 0), ('Male', 1)]

marital status encodd value : [('Single', 1), ('Married', 0), ('Single', 1), ('Single', 1), ('Single', 1), ('Married', 0), ('Married', 0), ('Single', 1), ('Married', 0), ('Single', 1), ('Married', 0), ('Married', 0), ('Single', 1), ('Married', 0)]


In [59]:
df

Unnamed: 0,Id,Age,Income,Gender,MaritalStatus,Buys
0,1,<21,High,Male,Single,No
1,2,<21,High,Male,Married,No
2,3,21-35,High,Male,Single,Yes
3,4,>35,Medium,Male,Single,Yes
4,5,>35,Low,Female,Single,Yes
5,6,>35,Low,Female,Married,No
6,7,21-35,Low,Female,Married,Yes
7,8,<21,Medium,Male,Single,No
8,9,<21,Low,Female,Married,Yes
9,10,>35,Medium,Female,Single,Yes


In [60]:
# Drop the row identifier: 'Id' is unique per row, so it is useless as a split
# attribute. errors='ignore' keeps this cell idempotent -- re-running it after
# 'Id' has already been removed no longer raises KeyError.
df = df.drop(columns='Id', errors='ignore')
df

Unnamed: 0,Age,Income,Gender,MaritalStatus,Buys
0,<21,High,Male,Single,No
1,<21,High,Male,Married,No
2,21-35,High,Male,Single,Yes
3,>35,Medium,Male,Single,Yes
4,>35,Low,Female,Single,Yes
5,>35,Low,Female,Married,No
6,21-35,Low,Female,Married,Yes
7,<21,Medium,Male,Single,No
8,<21,Low,Female,Married,Yes
9,>35,Medium,Female,Single,Yes


In [61]:
def find_entropy(df):
    """Return the Shannon entropy (base 2) of the class label of ``df``.

    The class label is taken to be the last column of ``df``. Each distinct
    label contributes ``-p * log2(p)``, where ``p`` is its relative frequency.
    A frame with no rows yields ``0``.
    """
    label_column = df.keys()[-1]
    labels = df[label_column]
    n_rows = len(labels)
    label_counts = labels.value_counts()

    total = 0
    # Walk the labels in order of first appearance, subtracting p*log2(p) each time.
    for label in labels.unique():
        p = label_counts[label] / n_rows
        total -= p * np.log2(p)
    return total

In [62]:
def find_entropy_attribute(df,attribute):
    """Return the class entropy that remains after splitting ``df`` on ``attribute``.

    This is the weighted average, over the distinct values of ``attribute``, of
    the base-2 entropy of the class label (last column of ``df``) within the
    rows carrying that value. Weights are the share of rows in each group
    relative to ``len(df)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to evaluate; the last column is the class label.
    attribute : str
        Name of the column to split on.

    Returns
    -------
    float
        Non-negative conditional entropy; exactly ``0.0`` when every group is
        pure, and ``0.0`` for a frame without rows.
    """
    Class = df.keys()[-1]
    total_rows = len(df)
    if total_rows == 0:
        return 0.0

    weighted_entropy = 0.0
    # groupby selects each value's rows directly, so no boolean mask built on the
    # full frame has to be re-aligned against an already filtered Series.
    for _, class_labels in df.groupby(attribute, sort=False)[Class]:
        probs = class_labels.value_counts(normalize=True)
        # Zero-probability classes contribute nothing (lim p->0 of p*log2(p) is 0);
        # dropping them avoids log2(0) without an epsilon fudge that skews results.
        probs = probs[probs > 0]
        group_entropy = -(probs * np.log2(probs)).sum()
        weighted_entropy += (len(class_labels) / total_rows) * group_entropy
    return float(weighted_entropy)

In [63]:
def find_winner(df):
    """Return the name of the attribute with the highest information gain.

    Every column except the last (the class label) is a candidate. The gain of
    an attribute is ``find_entropy(df) - find_entropy_attribute(df, attribute)``.
    Ties go to the earliest column, because ``np.argmax`` returns the first
    maximum.
    """
    attributes = df.keys()[:-1]
    # The entropy of the whole frame does not depend on the candidate attribute,
    # so compute it once instead of once per column.
    base_entropy = find_entropy(df)
    gains = [base_entropy - find_entropy_attribute(df, attribute)
             for attribute in attributes]
    return attributes[np.argmax(gains)]

In [64]:
def get_subtable(df, node,value):
    """Return the rows of ``df`` whose ``node`` column equals ``value``.

    The result is a new frame with a fresh 0..n-1 index; the old index is
    discarded.
    """
    matches = df[node].eq(value)
    selected = df.loc[matches]
    return selected.reset_index(drop=True)

In [65]:
def buildTree(df,tree=None): 
    """Recursively build an ID3-style decision tree as nested dicts.

    The class label is read from the last column of ``df``. The attribute chosen
    by ``find_winner`` becomes the node; each of its values maps either to a
    class label (leaf) or to a sub-tree built from the rows with that value.

    Parameters
    ----------
    df : pandas.DataFrame
        Training rows; the last column is the class label.
    tree : dict, optional
        Existing dict to add this node to. A new dict is created when omitted.

    Returns
    -------
    dict
        ``{attribute: {value: label_or_subtree, ...}}``
    """
    Class = df.keys()[-1]
    node = find_winner(df)
    attValue = np.unique(df[node])

    if tree is None:
        tree = {}
    # setdefault (rather than assigning only when tree is None) so that a
    # caller-supplied dict that lacks this node does not raise KeyError below.
    tree.setdefault(node, {})

    for value in attValue:
        subtable = get_subtable(df, node, value)
        # Use the detected class column instead of a hard-coded 'Buys'.
        clValue, counts = np.unique(subtable[Class], return_counts=True)

        if len(counts) == 1:
            # Pure subset: store the single class label as a leaf.
            tree[node][value] = clValue[0]
        elif len(subtable) == len(df):
            # The split removed no rows (the chosen attribute has a single value)
            # yet the classes are still mixed. Recursing would repeat the same
            # call forever, so fall back to the majority class.
            tree[node][value] = clValue[np.argmax(counts)]
        else:
            tree[node][value] = buildTree(subtable)

    return tree

In [66]:
# Build the decision tree from the training frame; the result is a nested dict.
tree = buildTree(df)

In [67]:
tree

{'Age': {'21-35': 'Yes',
  '<21': {'Gender': {'Female': 'Yes', 'Male': 'No'}},
  '>35': {'MaritalStatus': {'Married': 'No', 'Single': 'Yes'}}}}

In [68]:
def predict(inst,tree):
    """Classify ``inst`` by walking the nested-dict decision ``tree``.

    Parameters
    ----------
    inst : mapping
        Anything indexable by attribute name (a dict or a pandas Series).
    tree : dict or leaf label
        ``{attribute: {value: label_or_subtree}}``. A bare leaf label is
        returned unchanged.

    Returns
    -------
    The class label stored at the leaf that was reached.

    Raises
    ------
    KeyError
        If ``inst`` lacks the attribute tested at a node, or holds a value for
        which the node has no branch.
    ValueError
        If an empty dict is encountered where a node was expected.
    """
    current = tree
    # Descend until a non-dict (leaf label) is reached. Each node dict is keyed
    # by the single attribute it tests.
    while isinstance(current, dict):
        if not current:
            raise ValueError("cannot predict from an empty tree")
        attribute = next(iter(current))
        current = current[attribute][inst[attribute]]
    return current

In [69]:
# Take the row at position 6 of the training frame as a sample instance
# (a pandas Series indexed by column name).
inst = df.iloc[6]

In [70]:
inst

Age                21-35
Income               Low
Gender            Female
MaritalStatus    Married
Buys                 Yes
Name: 6, dtype: object

In [71]:
# Classify the sample training row with the tree built above.
Prediction = predict(inst,tree)

In [72]:
Prediction

'Yes'

In [73]:
# Hand-written instance to classify: attribute name -> value, with no 'Buys' label.
data = {'Age':'<21','Income':'Low','Gender':'Female','MaritalStatus':'Married'}

In [74]:
data

{'Age': '<21', 'Income': 'Low', 'Gender': 'Female', 'MaritalStatus': 'Married'}

In [75]:
# predict only indexes the instance by attribute name, so a plain dict works here.
NewPrediction = predict(data,tree)

In [76]:
NewPrediction

'Yes'