In [1]:
import pandas as pd
import numpy as np
%matplotlib inline

# Decision Tree
Determine threshold values for sepal.length, sepal.width, petal.length and petal.width that make it more likely for the data point to be of a specific iris variety. Useful Link: https://www.youtube.com/watch?v=_L39rN6gz7Y
Steps:
    1. determine which feature is most "meaningful"; this one should be evaluated at the root --> Gini impurity (other methods: Entropy, Information Gain)

## Gini Impurity for numerical data
 
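formula:
$$G_{\text{leaf}} = 1 - P(\text{Yes})^2 - P(\text{No})^2$$

The Gini impurity of a split is the average of the impurities of its two leaves, weighted by the share of rows that fall into each leaf.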

In [None]:
import os

In [3]:
def calc_gini_impurity(df, feature, variety) :
    """Find the threshold on ``feature`` that best separates ``variety`` from the rest.

    Every midpoint between two consecutive distinct values of ``df[feature]``
    is tried as a threshold. Rows with a value below the threshold form the
    left branch, rows with a value greater than or equal to it form the right
    branch. Each split is scored with the Gini impurity of both branches for
    the yes/no question "is this row of the given variety?", weighted by the
    share of all rows of ``df`` that fall into each branch.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the column ``feature`` and a ``'variety'`` column.
        The rows do not need to be sorted.
    feature : str
        Name of the numeric column to split on.
    variety : object
        Value of the ``'variety'`` column that counts as "yes".

    Returns
    -------
    tuple
        ``(min_gini, threshold, direction)``: the lowest weighted Gini
        impurity found, the threshold that produced it, and ``'left'`` or
        ``'right'`` depending on which branch holds more rows of ``variety``.
        When no threshold splits the rows into two non-empty branches the
        initial values ``(1, 0, 'none')`` are returned.
    """
    min_gini = 1
    threshold = 0
    direction = 'none'

    n_rows = df.shape[0]
    values = df[feature]

    # step 1: candidate thresholds are the midpoints between neighbouring
    # distinct values. Sorting here (instead of relying on the row order of
    # df) guarantees that every possible partition is evaluated exactly once.
    distinct = sorted(values.dropna().unique())
    if len(distinct) < 2 :
        return(min_gini, threshold, direction)

    # loop-invariant: which rows answer "yes" for the requested variety
    is_variety = df['variety'] == variety

    for lower, upper in zip(distinct, distinct[1:]) :
        avg = (lower + upper) / 2

        # step 2: separate rows by threshold
        in_left = values < avg
        in_right = values >= avg
        n_left = in_left.sum()
        n_right = in_right.sum()

        # floating point rounding can move a midpoint onto one of its neighbours
        if n_left == 0 or n_right == 0 : continue

        # step 3: separate branches by variety (yes/no)
        l_vc_yes = (in_left & is_variety).sum()
        r_vc_yes = (in_right & is_variety).sum()
        l_vc_no = n_left - l_vc_yes
        r_vc_no = n_right - r_vc_yes

        # step 4: Gini impurity per branch, then the weighted total
        gini_left = 1 - (l_vc_yes/n_left)**2 - (l_vc_no/n_left)**2
        gini_right = 1 - (r_vc_yes/n_right)**2 - (r_vc_no/n_right)**2
        total_gini = (n_left/n_rows) * gini_left + (n_right/n_rows) * gini_right

        # strict '<' keeps the first (lowest) threshold when several tie
        if total_gini < min_gini :
            min_gini = total_gini
            threshold = avg
            # direction is decided by raw row counts, not by proportions
            if l_vc_yes > r_vc_yes : direction = 'left'
            else : direction = 'right'

    return(min_gini, threshold, direction)

In [None]:
def print_gini_values(df) :
    """Print the best Gini split of every feature for every variety.

    For each distinct value in ``df['variety']`` a header line with that
    value is printed, followed by one line per feature column showing the
    tuple returned by ``calc_gini_impurity(df, feature, variety)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'variety'`` column; every other column is treated
        as a feature.

    Returns
    -------
    None
    """
    # select features by name so the result does not depend on column order
    features = [column for column in df.columns if column != 'variety']

    for variety in df['variety'].unique() :
        print(f'\n{variety}:')
        for feature in features :
            print(f'Gini for {feature}: {calc_gini_impurity(df, feature, variety)}')