In [1]:
from sklearn import datasets
import pandas as pd
import math  
from collections import Counter

In [2]:
iris = datasets.load_iris()

In [3]:
df = pd.DataFrame(iris.data)
df.columns = ["sl", "sw", 'pl', 'pw']

In [4]:
#Function to find label for a value
#if MIN_Value <=val < (m + Mean_Value) / 2 then it is assigned label a
#if (m + Mean_Value) <=val < Mean_Value then it is assigned label b
#if (Mean_Value) <=val < (Mean_Value + MAX_Value)/2 then it is assigned label c
#if (Mean_Value + MAX_Value)/2 <=val <= MAX_Value  then it is assigned label d

def label(val, *boundaries):
    if (val < boundaries[0]):
        return 'a'
    elif (val < boundaries[1]):
        return 'b'
    elif (val < boundaries[2]):
        return 'c'
    else:
        return 'd'

#Function to convert a continuous data into labelled data
#There are 4 lables  - a, b, c, d
def toLabel(df, old_feature_name):
    second = df[old_feature_name].mean()
    minimum = df[old_feature_name].min()
    first = (minimum + second)/2
    maximum = df[old_feature_name].max()
    third = (maximum + second)/2
    return df[old_feature_name].apply(label, args= (first, second, third))

In [5]:
#Convert all columns to labelled data
df['sl_labeled'] = toLabel(df, 'sl')
df['sw_labeled'] = toLabel(df, 'sw')
df['pl_labeled'] = toLabel(df, 'pl')
df['pw_labeled'] = toLabel(df, 'pw')


In [6]:
df.drop(['sl', 'sw', 'pl', 'pw'], axis = 1, inplace = True)

In [7]:
df.columns

Index(['sl_labeled', 'sw_labeled', 'pl_labeled', 'pw_labeled'], dtype='object')

In [8]:
set(df['sl_labeled'])

{'a', 'b', 'c', 'd'}

In [9]:
def entropy(y):
    pro = 0
    for i in set(y['class']): # iterating over the set values 0,1,2 in y
        a1 = (y['class']==i)
#         calculating probablity
        pro += (len(y[a1])/len(y))*(math.log(len(y[a1])/len(y),2))
    if pro==0:
        return pro
    else:
        return (-1)*pro

In [11]:
def build_tree(df, y, unused_features,level):
    #base case
    # 1.  pure node has reached(y has same value)   
    if  len(set(y['class']))==1:
    
        print("reached leaf node")
        print("Entropy",entropy(y))
        print("\n")
#   2. unused is empty   
    elif len(unused_features) is 0:
        print("reached leaf node")
        print("\n")
        
# otherwise find the best feature to split 
    else:
        best_feature = ""
        
#         initially maximum gain is 0
        max_gain_ratio=0 
    
    
#         for each unused feature we calc gain 
        for f in unused_features:
        
#     possible values in any one feature
            possible_values=set(df[f])
    
#     initially maximum entropy is 0
            max_entropy=0
            split=0
#         possible values are a,b,c,d
            for val in possible_values:
                a = (df[f]==val)
                max_entropy =len(y[a])/(len(df[f]))
                split+=(len(df[a])/len(df[f]))*(math.log(len(df[a])/(len(df[f])),2))
            split*=(-1)  
            
    
#     information gain =(entropy of parent node)-(entropy of current node)
        
        gain_info=entropy(y)-max_entropy
        
        gain_ratio=gain_info/split
        
#        selecting the feature with maximum gain ratio
        if max_gain_ratio<gain_ratio:
                max_gain_ratio=gain_ratio
                best_feature=f
         
        
        
        
        print("Level:",level)
        print("Current Entropy:",entropy(y))
        print("Best Feature: ", best_feature)
        print("With gain ratio:",max_gain_ratio)
        print("Counts of:-")
        print(y['class'].value_counts())
        
        print("")
    
# increasing level for next split     
        level+=1
        
#     removing the best feature from unused_features    
#     iterating over all the possible values in best_feature
#     recursive calling of build_tree
        
        unused_features.remove(best_feature)
        for j in set(df[best_feature]):
            att=df[best_feature]==j
            
            build_tree(df[att],y[att],unused_features,level)

In [12]:
y = pd.DataFrame(iris.target)
# naming the y column as class 
y.columns = ["class"]
unused_features = set(df.columns)
# initially level is 0
level=0
build_tree(df,y,unused_features,level)

Level: 0
Current Entropy: 1.584962500721156
Best Feature:  pw_labeled
With gain ratio: 0.6713244339170225
Counts of:-
2    50
1    50
0    50
Name: class, dtype: int64

reached leaf node
Entropy 0.0


reached leaf node
Entropy 0.0


reached leaf node
Entropy 0.0


Level: 1
Current Entropy: 0.863120568566631
Best Feature:  sl_labeled
With gain ratio: 0.1766801418792336
Counts of:-
1    40
2    16
Name: class, dtype: int64

Level: 2
Current Entropy: 0.9709505944546686
Best Feature:  sw_labeled
With gain ratio: 0.3820488875265707
Counts of:-
2    3
1    2
Name: class, dtype: int64

reached leaf node
Entropy 0.0


Level: 3
Current Entropy: 0.9182958340544896
Best Feature:  pl_labeled
With gain ratio: 0.274017542121281
Counts of:-
1    2
2    1
Name: class, dtype: int64

reached leaf node
Entropy 0.0


reached leaf node
Entropy 0.0


reached leaf node
Entropy 0.0


reached leaf node
Entropy 0.0


reached leaf node


