In [1]:
import pandas as pd
import numpy as np

# Understanding Decision Trees

In [2]:
# Load the tab-separated play-golf dataset and re-encode the target column in
# one chain: 'Yes' becomes the string '1', every other value the string '0'.
playing_golf = (
    pd.read_csv("../../datasets/playing_golf2.csv", sep='\t')
    .assign(
        Decision=lambda frame: np.where(frame['Decision'] == 'Yes', 1, 0).astype(str)
    )
)
playing_golf

Unnamed: 0,Day,Outlook,Temp,Humidity,Wind,Decision
0,1,Sunny,Hot,High,Weak,0
1,2,Sunny,Hot,High,Strong,0
2,3,Overcast,Hot,High,Weak,1
3,4,Rain,Mild,High,Weak,1
4,5,Rain,Cool,Normal,Weak,1
5,6,Rain,Cool,Normal,Strong,0
6,7,Overcast,Cool,Normal,Strong,1
7,8,Sunny,Mild,High,Weak,0
8,9,Sunny,Cool,Normal,Weak,1
9,10,Rain,Mild,Normal,Weak,1


## CART - Classification and regression trees

* Top Down
* Gini

![Gini impurity](images/gini.png)

* Categorical and Numerical inputs


### Steps
1. Calculate the Gini impurity for each value of an attribute
2. Calculate the weighted sum of those Gini impurities to get the Gini index of that attribute
3. Pick the attribute with the lowest Gini index value
4. Repeat steps 1, 2 and 3 on each resulting subset until a generalized tree has been created

In [3]:
# Feature matrix and target taken from the play-golf frame.
X = playing_golf[['Outlook', 'Temp', 'Humidity', 'Wind']]
y = playing_golf['Decision']
question_dict = []

total_rows = len(X)

for feature in X.columns:
    column = X[feature]
    gini_index = 0
    for category in column.unique():
        # Rows whose value for this feature equals the current category.
        in_category = column == category
        category_size = int(in_category.sum())

        # Gini impurity of one category: start at 1 and subtract the squared
        # share of each class among the rows in that category, e.g.
        #   Gini(Outlook=Sunny)    = 1 - (2/5)^2 - (3/5)^2 = 1 - 0.16 - 0.36 = 0.48
        #   Gini(Outlook=Overcast) = 1 - (4/4)^2 - (0/4)^2 = 0
        #   Gini(Outlook=Rain)     = 1 - (3/5)^2 - (2/5)^2 = 1 - 0.36 - 0.16 = 0.48
        category_impurity = 1
        for label in y.unique():
            class_in_category = int((in_category & (y == label)).sum())
            category_impurity -= (class_in_category / category_size) ** 2

        print("Gini({}={}) = {}".format(feature, category, category_impurity))
        # Weight each category's impurity by its share of all rows and add it up.
        gini_index += category_size / total_rows * category_impurity
    print("Gini Index {} = {}\n".format(feature, gini_index))


Gini(Outlook=Sunny) = 0.48
Gini(Outlook=Overcast) = 0.0
Gini(Outlook=Rain) = 0.48
Gini Index Outlook = 0.34285714285714286

Gini(Temp=Hot) = 0.5
Gini(Temp=Mild) = 0.4444444444444444
Gini(Temp=Cool) = 0.375
Gini Index Temp = 0.44047619047619047

Gini(Humidity=High) = 0.48979591836734704
Gini(Humidity=Normal) = 0.24489795918367352
Gini Index Humidity = 0.3673469387755103

Gini(Wind=Weak) = 0.375
Gini(Wind=Strong) = 0.5
Gini Index Wind = 0.42857142857142855



In [4]:
def count_classes(label_serie):
    """Count how often each class label occurs.

    Parameters
    ----------
    label_serie : pandas.Series
        Class labels, one entry per sample.

    Returns
    -------
    dict
        Mapping of label -> number of occurrences, as produced by
        ``Series.value_counts()``.
    """
    return label_serie.value_counts().to_dict()

def gini_inpurity(x, y):
    """Calculate the Gini impurity of a set of labels.

    The impurity is ``1 - sum(p_label ** 2)`` over the distinct labels in ``y``.

    Parameters
    ----------
    x : sized collection
        The rows the labels belong to. Kept for interface compatibility; the
        computation itself only depends on ``y``.
    y : pandas.Series
        Class labels of those rows.

    Returns
    -------
    float or int
        The Gini impurity; ``1`` (the untouched start value) when ``y``
        contains no countable labels.
    """
    counts = count_classes(y)
    # Normalise by the number of labels actually counted rather than len(x):
    # if the two differ, dividing by len(x) yields "probabilities" that do not
    # sum to 1 and the impurity can even become negative.
    n_labels = sum(counts.values())
    impurity = 1
    for lbl in counts:
        prob_of_lbl = counts[lbl] / float(n_labels)
        impurity -= prob_of_lbl**2
    return impurity

In [5]:
# Weighted Gini index of every feature, computed with the gini_inpurity helper.
for feature in X.columns:
    gini_index = 0
    for c_feature in X[feature].unique():
        # One boolean mask selects the matching rows of both X and y. Unlike
        # y[<positional integers>], this does not rely on the index being a
        # default 0..n-1 range.
        in_category = X[feature] == c_feature
        split_feature_x = X[in_category]
        split_y = y[in_category]
        # Compute the impurity once and reuse it for the print and the sum.
        impurity = gini_inpurity(split_feature_x, split_y)
        print(impurity)
        gini_index +=  len(split_feature_x)/len(X)*impurity
    print("Gini Index {} = {}\n".format(feature, gini_index))

0.48
0.0
0.48
Gini Index Outlook = 0.34285714285714286

0.5
0.4444444444444445
0.375
Gini Index Temp = 0.44047619047619047

0.48979591836734704
0.24489795918367355
Gini Index Humidity = 0.3673469387755103

0.375
0.5
Gini Index Wind = 0.42857142857142855

