In [None]:
import numpy as np
import pandas as pd
eps = np.finfo(float).eps
from numpy import log2 as log

In [None]:
# Load the job-offer dataset into `df`, the frame every later cell works on.
# NOTE(review): the path is absolute; confirm the file exists at this location in the target environment.
df = pd.read_csv("/content/data_jobs.csv")
# Bare last expression: the notebook renders the whole frame as this cell's output.
df

Unnamed: 0,Age,Salary,Professional,Computer_Skill,Offer_Job
0,Young,High,No,Fair,No
1,Young,High,No,Excellent,No
2,Middle,High,No,Fair,Yes
3,Old,Medium,No,Fair,Yes
4,Old,Low,Yes,Fair,Yes
5,Old,Low,Yes,Excellent,No
6,Middle,Low,Yes,Excellent,Yes
7,Young,Medium,No,Fair,No
8,Young,Low,Yes,Fair,Yes
9,Old,Medium,Yes,Fair,Yes


In [None]:
def id3_model(df):
  """Compute the information gain of every attribute with respect to ``Offer_Job``.

  The entropy of the ``Offer_Job`` column is computed first; the conditional
  entropy of each attribute (via ``ent``) is then subtracted from it (via
  ``gain``).

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain an ``Offer_Job`` column; every other column is treated as
      an attribute.

  Returns
  -------
  dict
      Maps attribute name -> information gain. The mapping is returned so the
      caller (or the notebook cell) can actually see the result.
  """
  target = "Offer_Job"
  labels = df[target]
  n_rows = len(labels)
  class_counts = labels.value_counts()  # counted once instead of once per class

  # Entropy of the class column: -sum(p * log2(p)) over the observed classes.
  entropy_node = 0
  for value in labels.unique():  # unique objects, e.g. 'Yes', 'No'
    f = class_counts[value]/n_rows
    entropy_node += -f*np.log2(f)

  # Select attributes by name rather than by position, so the class column is
  # excluded wherever it sits in the frame.
  attributes = [k for k in df.keys() if k != target]
  a_entropy = {k:ent(df,k) for k in attributes}

  Info_gain = {k:gain(entropy_node,a_entropy[k]) for k in a_entropy}
  return Info_gain

In [None]:
def ent(df,attribute,target="Offer_Job"):
    """Return the conditional entropy of ``target`` given ``attribute``.

    Computes ``sum_v P(attribute = v) * H(target | attribute = v)`` in bits,
    where the sum runs over the distinct values of ``attribute``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing both ``attribute`` and ``target`` columns.
    attribute : str
        Name of the column to condition on.
    target : str, optional
        Name of the class column. Defaults to ``"Offer_Job"``.

    Returns
    -------
    float
        Weighted entropy, >= 0. An empty frame yields ``0.0``.
    """
    total_rows = len(df)
    weighted_entropy = 0.0
    # groupby splits the rows positionally, so the result does not depend on
    # the frame's index being unique or default. sort=False keeps the values
    # in order of first appearance.
    for _, labels in df.groupby(attribute, sort=False)[target]:
        probs = labels.value_counts().to_numpy() / len(labels)
        # Only observed classes contribute; 0 * log2(0) is taken as 0, so no
        # epsilon fudge is needed and a pure subset scores exactly zero.
        probs = probs[probs > 0]
        subset_entropy = -(probs * np.log2(probs)).sum()
        weighted_entropy += (len(labels) / total_rows) * subset_entropy

    return float(weighted_entropy)

In [None]:
def gain(e_dataset,e_attr):
    """Return the information gain: dataset entropy minus attribute entropy."""
    information_gain = e_dataset - e_attr
    return information_gain

In [None]:
def find_entropy(df):
    """Return the entropy (in bits) of the class column, i.e. the last column of ``df``."""
    labels = df[df.keys()[-1]]
    n_rows = len(labels)
    counts = labels.value_counts()
    total = 0
    # Walk the classes in order of first appearance and accumulate -p*log2(p).
    for label in labels.unique():
        p = counts[label] / n_rows
        total -= p * np.log2(p)
    return total
  
  
def entropy_attribute(df,attribute):
  """Return the conditional entropy of the class column given ``attribute``.

  The class column is the last column of ``df``. The result is
  ``sum_v P(attribute = v) * H(class | attribute = v)`` in bits.

  Parameters
  ----------
  df : pandas.DataFrame
      Data whose last column holds the class labels.
  attribute : str
      Name of the column to condition on.

  Returns
  -------
  float
      Weighted entropy, >= 0. An empty frame yields ``0.0``.
  """
  Class = df.keys()[-1]
  total_rows = len(df)
  weighted_entropy = 0.0
  # groupby splits the rows positionally, so the result does not depend on the
  # frame's index being unique or default. sort=False keeps the values in
  # order of first appearance.
  for _, labels in df.groupby(attribute, sort=False)[Class]:
      probs = labels.value_counts().to_numpy() / len(labels)
      # Only observed classes contribute; 0 * log2(0) is taken as 0, so no
      # epsilon fudge is needed and equal gains compare as exact ties.
      probs = probs[probs > 0]
      subset_entropy = -(probs * np.log2(probs)).sum()
      weighted_entropy += (len(labels) / total_rows) * subset_entropy
  return float(weighted_entropy)


def find_winner(df):
    """Return the name of the attribute with the highest information gain.

    Every column except the last is treated as an attribute; the last column
    is the class. Ties go to the attribute that appears first, because
    ``np.argmax`` returns the first maximum.

    Raises
    ------
    ValueError
        If ``df`` has no attribute columns (``np.argmax`` of an empty sequence).
    """
    attributes = df.keys()[:-1]
    # The dataset entropy is the same for every attribute, so compute it once.
    dataset_entropy = find_entropy(df)
    Info_gain = [dataset_entropy - entropy_attribute(df, key) for key in attributes]
    return attributes[np.argmax(Info_gain)]
  
  
def subtree(df, node,value):
  """Return the rows of ``df`` where column ``node`` equals ``value``, re-indexed from 0."""
  matches = df[node] == value
  selected = df[matches]
  return selected.reset_index(drop=True)


def buildDT(df,tree=None):
    """Recursively build an ID3 decision tree as nested dictionaries.

    The last column of ``df`` is the class; every other column is a candidate
    attribute. The result has the shape
    ``{attribute: {value: subtree_or_class_label, ...}}``.

    Parameters
    ----------
    df : pandas.DataFrame
        Training rows, class labels in the last column.
    tree : dict, optional
        Existing dictionary to add the split into. A new one is created when
        omitted.

    Returns
    -------
    dict
        The (possibly caller-supplied) tree dictionary.

    Raises
    ------
    ValueError
        If ``df`` has no attribute column besides the class column.
    """
    Class = df.keys()[-1]
    if len(df.keys()) < 2:
        raise ValueError("buildDT needs at least one attribute column besides the class column")

    node = find_winner(df)
    attValue = np.unique(df[node])
    if tree is None:
        tree = {}
    # setdefault also covers a caller-supplied tree that has no entry for node yet.
    tree.setdefault(node, {})

    for value in attValue:

        subtable = subtree(df,node,value)
        # Use the detected class column, consistent with find_winner, rather
        # than a hard-coded column name.
        clValue,counts = np.unique(subtable[Class],return_counts=True)

        if len(counts)==1:  # pure subset -> leaf
            tree[node][value] = clValue[0]
            continue

        # The split attribute is constant inside this subset, so drop it.
        # Each level then has one attribute fewer and the recursion must end.
        remaining = subtable.drop(columns=[node])
        if len(remaining.keys()) < 2:
            # No attribute left to separate contradictory rows: fall back to
            # the majority class (ties go to the first label in sorted order).
            tree[node][value] = clValue[np.argmax(counts)]
        else:
            tree[node][value] = buildDT(remaining)  # recurse on the reduced subset

    return tree

In [None]:
# Run the information-gain computation on the full dataset.
id3_model(df)

In [None]:
# Build the decision tree from the full dataset; later cells read it from `t`.
t = buildDT(df)

In [None]:
import pprint 
# Pretty-print the nested-dict tree held in `t`.
pprint.pprint(t)

{'Age': {'Middle': 'Yes',
         'Old': {'Computer_Skill': {'Excellent': 'No', 'Fair': 'Yes'}},
         'Young': {'Professional ': {'No': 'No', 'Yes': 'Yes'}}}}


In [None]:
import pydot
def makeGraph(parent_name, child_name):
    """Add an edge between ``parent_name`` and ``child_name`` to the module-level ``graph``."""
    graph.add_edge(pydot.Edge(parent_name, child_name))

def visit(node, parent=None):
    """Walk a nested-dict decision tree and add its nodes and edges to ``graph``.

    Each graph node gets an id built from its path in the tree, and its
    human-readable text is attached as the ``label`` attribute. Keying nodes
    by label alone would merge every occurrence of the same text (e.g. all
    'Yes' leaves) into one node and turn a branch such as 'No' -> 'No' into a
    self-loop.

    Parameters
    ----------
    node : dict
        Mapping of key -> sub-dict or leaf value, as produced by ``buildDT``.
    parent : str, optional
        Graph id of the node the keys of ``node`` hang from; ``None`` for the
        root level, in which case no incoming edge is drawn.
    """
    for k, v in node.items():
        # Path-qualified id keeps equal labels in different branches distinct.
        key_id = str(k) if parent is None else f"{parent}/{k}"
        graph.add_node(pydot.Node(key_id, label=str(k)))
        if parent is not None:
            makeGraph(parent, key_id)

        if isinstance(v, dict):
            visit(v, key_id)
        else:
            # Leaf: a class label hanging from this branch value.
            leaf_id = f"{key_id}/={v}"
            graph.add_node(pydot.Node(leaf_id, label=str(v)))
            makeGraph(key_id, leaf_id)

# Start a fresh pydot graph; makeGraph() adds its edges to this module-level name.
graph = pydot.Dot(graph_type='graph')
# Walk the tree in `t`, populating `graph`.
visit(t)
# Write the rendered graph to a PNG file (relative path).
graph.write_png('id3_decision_tree.png')

In [None]:
def predict(tree, instance):
    """Classify ``instance`` by walking ``tree`` from the root to a leaf.

    ``tree`` is either a leaf value (returned as-is) or a dict whose first key
    names the attribute to test and whose value maps attribute values to
    sub-trees. Returns ``None`` when the instance carries a value the tree has
    no branch for.
    """
    current = tree
    while isinstance(current, dict):
        attribute = next(iter(current))
        branches = current[attribute]
        observed = instance[attribute]
        if observed not in branches:
            return None
        current = branches[observed]
    return current

In [None]:
def evaluate(tree, test_data_m, label):
    """Return the fraction of rows in ``test_data_m`` that ``tree`` classifies correctly.

    Each row is passed to ``predict`` and the result is compared with the
    row's value in column ``label``. An unmatched prediction (``None``)
    counts as a miss.
    """
    hits = 0
    total = 0
    for pos in range(len(test_data_m)):
        total += 1
        predicted = predict(tree, test_data_m.iloc[pos])
        if predicted == test_data_m[label].iloc[pos]:
            hits += 1
    return hits / total  # accuracy

In [None]:
# Draw a reproducible 25% sample of `df` (fixed random_state).
# NOTE(review): `test` comes from the same frame `t` was built from, so the
# accuracy below is measured on rows the tree was trained on, not held-out data.
test = df.sample(frac=0.25, random_state=25)
evaluate(t, test, 'Offer_Job')

1.0

In [None]:
# Display the sampled rows held in `test`.
test

Unnamed: 0,Age,Salary,Professional,Computer_Skill,Offer_Job
0,Young,High,No,Fair,No
9,Old,Medium,Yes,Fair,Yes
3,Old,Medium,No,Fair,Yes
5,Old,Low,Yes,Excellent,No


In [None]:
# Repeat the information-gain computation on the sampled rows; the call's result is not captured.
id3_model(test)
# Build a separate tree from the sampled rows only.
test_tree = buildDT(test)

In [None]:
# Pretty-print the tree built from the sampled rows.
pprint.pprint(test_tree)

{'Salary': {'High': 'No', 'Low': 'No', 'Medium ': 'Yes'}}


In [None]:
# Rebind the module-level `graph` to a new, empty pydot graph before drawing the second tree.
graph = pydot.Dot(graph_type='graph')
# Walk `test_tree`, populating the new `graph`.
visit(test_tree)
# Write the rendered graph to its own PNG file (relative path).
graph.write_png('id3_test_decision_tree.png')