# ID3 Algorithm

A program that demonstrates how the decision-tree-based ID3 algorithm works.

In [4]:
import math
import pandas as pd
from pandas import DataFrame

# Load the Play Tennis examples from disk and echo the whole table.
df_tennis = pd.read_csv('data.csv')
print('Dataset:', df_tennis, sep='\n')

def entropy(probs):
    """Return the Shannon entropy, in bits, of a probability distribution.

    Args:
        probs: Iterable of probabilities, one per class.

    Returns:
        The sum of ``-p * log2(p)`` over ``probs``. Entries equal to zero
        contribute nothing, and an empty iterable yields 0.

    Raises:
        ValueError: If any probability is negative (raised by ``math.log``).
    """
    # Skip p == 0: math.log(0, 2) raises ValueError, yet p * log(p) tends to
    # 0 as p -> 0, so by convention such terms add nothing to the entropy.
    return sum(-prob * math.log(prob, 2) for prob in probs if prob != 0)

def entropy_of_list(a_list):
    """Return the entropy (in bits) of the class labels in ``a_list``.

    The labels are tallied, the tally is printed together with
    ``a_list.name`` (so the argument must expose a ``name`` attribute, as a
    pandas Series does), and the relative frequencies are passed to
    ``entropy``.

    Args:
        a_list: Iterable of class labels with a ``name`` attribute.

    Returns:
        The entropy of the label distribution.
    """
    from collections import Counter

    class_counts = Counter(label for label in a_list)
    print('\nYES and No classes: ', a_list.name, class_counts)

    # Convert once to float so every division below is a true division.
    total = float(len(a_list))
    class_probs = []
    for count in class_counts.values():
        class_probs.append(count / total)
    return entropy(class_probs)

# Entropy of the whole dataset's 'Target' column. entropy_of_list also prints
# the per-class counts as a side effect before returning.
total_entropy = entropy_of_list(df_tennis['Target'])
print('Entropy of given Play Tennis dataset: ', total_entropy, '\n')

def information_gain(df, split_attribute_name, target_attribute_name, trace=0):
    """Return the information gain from splitting ``df`` on one attribute.

    The gain is the entropy of the target column over all rows minus the
    weighted average of the target entropy within each group of
    ``split_attribute_name``, each group weighted by its share of the rows.

    Progress is printed along the way: the attribute name, every group key
    with its rows, and the class counts emitted by ``entropy_of_list``.

    Args:
        df: DataFrame holding the examples.
        split_attribute_name: Column to group by.
        target_attribute_name: Column holding the class labels.
        trace: Accepted for compatibility; not used by this function.

    Returns:
        Entropy before the split minus the weighted entropy after it.
    """
    print('Information Gain calculation of ', split_attribute_name)

    partitions = df.groupby(split_attribute_name)
    for value, subset in partitions:
        print(value)
        print(subset)

    total_rows = float(len(df.index))
    # For each group compute two figures on the target column: its entropy
    # and the fraction of all rows that fall into the group.
    aggregations = {
        target_attribute_name: [entropy_of_list, lambda x: len(x)/total_rows],
    }
    per_value = partitions.agg(aggregations)[target_attribute_name]
    per_value.columns = ['Entropy', 'PropObservations']

    weighted = per_value['Entropy'] * per_value['PropObservations']
    entropy_after_split = sum(weighted)
    # Computed after the per-group pass so the printed output keeps its order.
    entropy_before_split = entropy_of_list(df[target_attribute_name])
    return entropy_before_split - entropy_after_split

# Report the information gain of every candidate attribute against 'Target'.
# NOTE(review): the 'Wind ' column name carries a trailing space — presumably
# it matches the header in data.csv; verify before "fixing" it.
for gain_label, column_name in (
    ('Info-Gain for Outlook is: ', 'Outlook'),
    ('Info-Gain for Humidity is: ', 'Humidity'),
    ('Info-Gain for Wind is: ', 'Wind '),
    ('Info-Gain for Temperature is: ', 'Temperature'),
):
    gain = information_gain(df_tennis, column_name, 'Target')
    print(gain_label + str(gain), '\n')
    print('')

Dataset:
     Outlook Temperature Humidity   Wind  Target
0      sunny         hot     high    weak     no
1      sunny         hot     high  strong     no
2   overcast         hot     high    weak    yes
3       rain        mild   normal    weak    yes
4       rain        cool   normal    weak     no
5       rain        cool   normal  strong    yes
6   overcast        cool   normal  strong     no
7      sunny        mild     high  strong    yes
8      sunny        cool   normal    weak     no
9       rain        mild   normal    weak    yes
10     sunny        mild   normal    weak    yes
11     sunny        mild   normal  strong    yes
12  overcast        mild     high  strong    yes
13  overcast         hot   normal    weak    yes
14      rain        mild     high  strong     no

YES and No classes:  Target Counter({'yes': 9, 'no': 6})
Entropy of given Play Tennis dataset:  0.9709505944546686 

Information Gain calculation of  Outlook
overcast
     Outlook Temperature Humidity   Win