In [3]:
import pandas as pd

In [4]:
# Load the golf dataset; later cells read its 'Outlook' and 'Play' columns.
# NOTE(review): another cell reads from 'Class_Data/' (capital D) — confirm the
# directory's casing, since the two spellings differ on case-sensitive filesystems.
golf = pd.read_csv('Class_data/golf.csv')

In [22]:
golf['Outlook'].unique()

array(['Sunny', 'Overcast', 'Rain'], dtype=object)

## Information Gain

In [6]:
import numpy as np

In [21]:
# Baseline entropy of the 'Play' label before partitioning on any attribute
count = len(golf)
yes_count = len(golf[golf['Play'] == 'yes'])
no_count = len(golf[golf['Play'] == 'no'])

# Proportion of each class over the whole dataset
p_yes = yes_count / count
p_no = no_count / count

entropy_no_partitions = -p_yes * np.log2(p_yes) - p_no * np.log2(p_no)

entropy_no_partitions

np.float64(0.9402859586706311)

In [None]:
# How much information do we gain from splitting on Outlook at the root node?
# Start by counting the yes/no rows inside each Outlook category.
def _outlook_play_count(outlook, play):
    """Number of rows in `golf` with the given Outlook value and Play label."""
    matches = (golf['Outlook'] == outlook) & (golf['Play'] == play)
    return golf[matches].shape[0]


sunny_yes = _outlook_play_count('Sunny', 'yes')
sunny_no = _outlook_play_count('Sunny', 'no')

overcast_yes = _outlook_play_count('Overcast', 'yes')
overcast_no = _outlook_play_count('Overcast', 'no')

rain_yes = _outlook_play_count('Rain', 'yes')
rain_no = _outlook_play_count('Rain', 'no')

In [38]:
def calculate_entropy(yes_labels, no_labels):
    """Binary (base-2) entropy of a group described by its two class counts.

    Parameters
    ----------
    yes_labels : number of rows in the group carrying the positive label.
    no_labels : number of rows in the group carrying the negative label.

    Returns
    -------
    np.float64
        -p_yes*log2(p_yes) - p_no*log2(p_no), where each p is the count
        divided by the sum of both counts. Returns 0 when either count is 0.
    """
    # A group with only one class (or no rows) has zero entropy; returning
    # early also avoids evaluating log2(0).
    if no_labels == 0 or yes_labels == 0:
        return np.float64(0)

    total = yes_labels + no_labels
    p_yes = yes_labels / total
    p_no = no_labels / total
    return -p_yes * np.log2(p_yes) - p_no * np.log2(p_no)

In [39]:
# Entropy of the Play label inside each Outlook category
sunny_entrop, overcast_entrop, rain_entrop = (
    calculate_entropy(yes, no)
    for yes, no in (
        (sunny_yes, sunny_no),
        (overcast_yes, overcast_no),
        (rain_yes, rain_no),
    )
)

In [35]:
sunny_entrop

np.float64(0.9709505944546686)

In [40]:
overcast_entrop

np.float64(0.0)

In [37]:
rain_entrop

np.float64(0.9709505944546686)

Information Gain from Outlook

In [45]:
# Entropy after splitting on Outlook: the per-category entropies averaged,
# each weighted by the fraction of rows that falls into that category
sunny_weight = (sunny_yes + sunny_no) / count
overcast_weight = (overcast_yes + overcast_no) / count
rain_weight = (rain_yes + rain_no) / count

entropy_outlook = (
    sunny_weight * sunny_entrop
    + overcast_weight * overcast_entrop
    + rain_weight * rain_entrop
)

# Information gain = entropy before the split minus entropy after it
information_gain_outlook = entropy_no_partitions - entropy_outlook
information_gain_outlook

np.float64(0.24674981977443933)

**Split Info**: Used to adjust for the bias of Information Gain towards attributes with many distinct values.<br>
It is calculated with:
$$
\text{SplitInfo}(S, A) = - \sum_{i=1}^{k} \frac{|S_i|}{|S|} \log_2 \left(\frac{|S_i|}{|S|}\right)
$$

In [46]:
# Fraction of all rows that falls into each Outlook category
sunny_frac = (sunny_yes + sunny_no) / count
overcast_frac = (overcast_yes + overcast_no) / count
rain_frac = (rain_yes + rain_no) / count

# SplitInfo is the entropy of the partition sizes themselves
split_info = (
    - sunny_frac * np.log2(sunny_frac)
    - overcast_frac * np.log2(overcast_frac)
    - rain_frac * np.log2(rain_frac)
)

split_info

np.float64(1.5774062828523454)

## Gain Ratio
$$ \text{GainRatio}(S, A) = \frac{\text{InformationGain}(S, A)}{\text{SplitInfo}(S, A)} $$

In [48]:
# Gain ratio: the information gain from Outlook divided by Outlook's split info
gain_ratio_outlook = information_gain_outlook / split_info
gain_ratio_outlook

np.float64(0.15642756242117528)

A low gain ratio tells us that the attribute yields little information gain relative to its split info, i.e. relative to how finely it fragments the data into subsets

## Gini Index: Measure of Impurity for Split

In [92]:
def calculate_gini(
        data: pd.DataFrame, 
        categories: list[str], 
        attribute: str, 
        label_name:str, 
        yes_label: str, 
        no_label:str
    ) -> float:
    """Weighted Gini index of splitting `data` on `attribute`.

    For every value in `categories`, the rows where `data[attribute]` equals
    that value form one partition. Each partition contributes
    ``(1 - p_yes**2 - p_no**2) * (partition_size / len(data))``, where
    ``p_yes`` / ``p_no`` are the fractions of the partition whose
    `label_name` value equals `yes_label` / `no_label`.

    Parameters
    ----------
    data : frame holding both the attribute column and the label column.
    categories : attribute values to evaluate; any iterable of values works.
        Values that match no rows are skipped.
    attribute : name of the column to split on.
    label_name : name of the label column.
    yes_label, no_label : the two label values being counted.

    Returns
    -------
    float
        The summed, size-weighted impurity. Always a ``float``: ``0.0`` when
        `categories` is empty or none of its values occur in `data`.

    Notes
    -----
    Rows whose label is neither `yes_label` nor `no_label` still count
    towards the partition size, so they raise the reported impurity.
    Label matching is exact (case-sensitive).
    """
    total_count = len(data)
    # Start from a float so the return type does not depend on whether any
    # category actually matched rows.
    gini = 0.0

    for category in categories:
        cat_labels = data.loc[data[attribute] == category, label_name]
        cat_count = len(cat_labels)

        # Nothing to weigh for an absent category; also avoids dividing by zero.
        if cat_count == 0:
            continue

        p_yes = int((cat_labels == yes_label).sum()) / cat_count
        p_no = int((cat_labels == no_label).sum()) / cat_count

        gini += (1 - p_yes**2 - p_no**2) * (cat_count / total_count)

    return gini

In [93]:
# Gini index of a split on Outlook, evaluated over every category in the data
outlook_cats = golf['Outlook'].unique().tolist()
outlook_gini = calculate_gini(golf, outlook_cats, "Outlook", "Play", "yes", "no")

print(outlook_gini)

0.34285714285714286


In [94]:
# The remaining attributes are evaluated on the version of the data whose
# values have already been placed into buckets.
# NOTE(review): the CSV in an earlier cell is read from 'Class_data/'
# (lowercase d) — confirm which directory casing is correct.
golf_categories = pd.read_excel('Class_Data/golf_categories.xlsx')

# Every column except the 'Play' label is a candidate feature
features = golf_categories.columns.tolist()
features.remove('Play')

labels = golf_categories['Play'].unique().tolist()

display(features, labels)

['Temperature', 'Humidity', 'Outlook', 'Wind']

['No', 'Yes']

In [97]:
# Gini index of a root-node split on each candidate feature.
# The categories are passed as a plain list to match calculate_gini's signature.
gini_indexes = {
    feature: calculate_gini(
        data=golf_categories,
        categories=golf_categories[feature].unique().tolist(),
        attribute=feature,
        label_name='Play',
        yes_label='Yes',
        no_label='No'
    )
    for feature in features
}

for feature, gini_index in gini_indexes.items():
    print(f"{feature}: {gini_index}")

Temperature: 0.44047619047619047
Humidity: 0.4285714285714285
Outlook: 0.34285714285714286
Wind: 0.42857142857142855


Our tree splits the root node on the attribute with the lowest Gini index.<br><br>
In this case, it will use Outlook for the first split