In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


### Read data

In [2]:
df = pd.read_csv('data.csv')
df

Unnamed: 0,sun,rain,wind,game
0,1,1,0,yes
1,1,1,0,yes
2,0,1,1,no
3,0,1,1,no
4,1,0,0,yes
5,1,0,0,yes
6,1,0,0,yes
7,1,0,1,yes
8,1,0,1,yes
9,0,1,0,no


# First Tree

### Calculate weights

In [3]:
df['weights'] = 1/len(df)
df

Unnamed: 0,sun,rain,wind,game,weights
0,1,1,0,yes,0.071429
1,1,1,0,yes,0.071429
2,0,1,1,no,0.071429
3,0,1,1,no,0.071429
4,1,0,0,yes,0.071429
5,1,0,0,yes,0.071429
6,1,0,0,yes,0.071429
7,1,0,1,yes,0.071429
8,1,0,1,yes,0.071429
9,0,1,0,no,0.071429


### Calculate parent Gini impurity

<img src="pics/gini_parent.png" style="width: 30%;"/>

In [4]:
def gini_parent(df, weights_col_name):
    # Defining the weights
    w_yes = np.sum(df[weights_col_name].values[np.where(df["game"] == "yes")]) # values convert pandas into np array
    w_no = np.sum(df[weights_col_name].values[np.where(df["game"] == "no")])
    w_total = df[weights_col_name].sum()

    # Probabilities
    p_yes = w_yes/w_total
    p_no = w_no/w_total

    # Calculate parent Gini
    parent_gini = (1-(p_yes**2+p_no**2)).round(3)
    return parent_gini

In [5]:
parent_gini = gini_parent(df, weights_col_name='weights')
parent_gini

np.float64(0.459)

### Gini child and Gini reduction

<img src="pics/gini_sum.png" style="width: 55%;"/>

<img src="pics/gini_child.png" style="width: 40%;"/>

<img src="pics/gini_red.png" style="width: 40%;"/>

In [6]:
def gini_reduction(midpoint, df, child_col_names, weight_col_name, target_col_name, parent_impurity):
    results = []

    for child_col in child_col_names:
        df_filtered = df[[child_col, weight_col_name, target_col_name]].copy()

        # Split into left and right based on the midpoint
        df_left = df_filtered[df_filtered[child_col] <= midpoint].copy()
        df_right = df_filtered[df_filtered[child_col] >= midpoint].copy()

        # Gini weights
        w_left_a = np.sum(df_left[weight_col_name].values[np.where(df_left[target_col_name] == "yes")]) 
        w_left_b = np.sum(df_left[weight_col_name].values[np.where(df_left[target_col_name] == "no")]) 
        w_left_total = df_left[weight_col_name].sum()

        w_right_a = np.sum(df_right[weight_col_name].values[np.where(df_right[target_col_name] == "yes")]) 
        w_right_b = np.sum(df_right[weight_col_name].values[np.where(df_right[target_col_name] == "no")]) 
        w_right_total = df_right[weight_col_name].sum()

        # Probabilities
        p_left_a = w_left_a / w_left_total if w_left_total != 0 else 0
        p_left_b = w_left_b / w_left_total if w_left_total != 0 else 0

        p_right_a = w_right_a / w_right_total if w_right_total != 0 else 0
        p_right_b = w_right_b / w_right_total if w_right_total != 0 else 0

        # Gini impurities
        gini_left = 1 - (p_left_a**2 + p_left_b**2)
        gini_right = 1 - (p_right_a**2 + p_right_b**2)

        # Weighted Gini impurities
        n_left = len(df_left)
        n_right = len(df_right)
        n_total = n_left + n_right

        gini_left_w = gini_left * n_left / n_total if n_total != 0 else 0
        gini_right_w = gini_right * n_right / n_total if n_total != 0 else 0

        # Gini impurity reduction
        gini_reduction_value = parent_impurity - (gini_left_w + gini_right_w)

        results.append({'child_col_name': child_col, 'gini_reduction': gini_reduction_value})

    return pd.DataFrame(results).sort_values(by='gini_reduction',ascending=False)



In [7]:
reduction = gini_reduction(midpoint=0.5, 
                            df=df,
                            child_col_names=['sun', 'wind', 'rain'], 
                            weight_col_name='weights',
                            target_col_name='game',
                            parent_impurity=parent_gini
                            )
reduction

Unnamed: 0,child_col_name,gini_reduction
0,sun,0.254918
2,rain,0.254918
1,wind,0.000667


### Predict with the tree using the best split point

In [8]:
df['pred-1'] = np.where(df['rain'] > 0.5, 'no', 'yes')
df

Unnamed: 0,sun,rain,wind,game,weights,pred-1
0,1,1,0,yes,0.071429,no
1,1,1,0,yes,0.071429,no
2,0,1,1,no,0.071429,no
3,0,1,1,no,0.071429,no
4,1,0,0,yes,0.071429,yes
5,1,0,0,yes,0.071429,yes
6,1,0,0,yes,0.071429,yes
7,1,0,1,yes,0.071429,yes
8,1,0,1,yes,0.071429,yes
9,0,1,0,no,0.071429,no


### Calculate error of the tree

In [9]:
# Calculate error
error = df['weights'].where(df['game'] != df['pred-1']).sum()
error

np.float64(0.14285714285714285)

### Calculate importance of the tree

<img src="pics/importance.png" style="width: 30%;"/>

In [10]:
lr=1

imp_1 = lr*0.5*np.log(error/(1-error))
imp_1

np.float64(-0.8958797346140275)

### Calculate new weights for wrong guesses

<img src="pics/new_weights.png" style="width: 30%;"/>

In [11]:
# Calculate new weights
df['weights_new_1'] = np.where(df['game'] != df['pred-1'], 
                                  df['weights'] * np.exp(imp_1),
                                  df['weights']
                                  )
df

Unnamed: 0,sun,rain,wind,game,weights,pred-1,weights_new_1
0,1,1,0,yes,0.071429,no,0.029161
1,1,1,0,yes,0.071429,no,0.029161
2,0,1,1,no,0.071429,no,0.071429
3,0,1,1,no,0.071429,no,0.071429
4,1,0,0,yes,0.071429,yes,0.071429
5,1,0,0,yes,0.071429,yes,0.071429
6,1,0,0,yes,0.071429,yes,0.071429
7,1,0,1,yes,0.071429,yes,0.071429
8,1,0,1,yes,0.071429,yes,0.071429
9,0,1,0,no,0.071429,no,0.071429


### Calculate normalized weights

<img src="pics/w_norm.png" style="width: 30%;"/>

In [12]:
df['weights_norm_1'] = df['weights_new_1']/df['weights_new_1'].sum()
df

Unnamed: 0,sun,rain,wind,game,weights,pred-1,weights_new_1,weights_norm_1
0,1,1,0,yes,0.071429,no,0.029161,0.031853
1,1,1,0,yes,0.071429,no,0.029161,0.031853
2,0,1,1,no,0.071429,no,0.071429,0.078024
3,0,1,1,no,0.071429,no,0.071429,0.078024
4,1,0,0,yes,0.071429,yes,0.071429,0.078024
5,1,0,0,yes,0.071429,yes,0.071429,0.078024
6,1,0,0,yes,0.071429,yes,0.071429,0.078024
7,1,0,1,yes,0.071429,yes,0.071429,0.078024
8,1,0,1,yes,0.071429,yes,0.071429,0.078024
9,0,1,0,no,0.071429,no,0.071429,0.078024


# Second Tree

In [13]:
parent_gini = gini_parent(df, weights_col_name='weights_norm_1')
parent_gini

np.float64(0.476)

In [14]:
reduction = gini_reduction(midpoint=0.5, 
                            df=df,
                            child_col_names=['sun', 'wind', 'rain'], 
                            weight_col_name='weights_norm_1',
                            target_col_name='game',
                            parent_impurity=parent_gini
                            )
reduction

Unnamed: 0,child_col_name,gini_reduction
2,rain,0.355329
0,sun,0.271918
1,wind,0.003909


In [15]:
df['pred-2'] = np.where(df['rain'] > 0.5, 'no', 'yes')
df

Unnamed: 0,sun,rain,wind,game,weights,pred-1,weights_new_1,weights_norm_1,pred-2
0,1,1,0,yes,0.071429,no,0.029161,0.031853,no
1,1,1,0,yes,0.071429,no,0.029161,0.031853,no
2,0,1,1,no,0.071429,no,0.071429,0.078024,no
3,0,1,1,no,0.071429,no,0.071429,0.078024,no
4,1,0,0,yes,0.071429,yes,0.071429,0.078024,yes
5,1,0,0,yes,0.071429,yes,0.071429,0.078024,yes
6,1,0,0,yes,0.071429,yes,0.071429,0.078024,yes
7,1,0,1,yes,0.071429,yes,0.071429,0.078024,yes
8,1,0,1,yes,0.071429,yes,0.071429,0.078024,yes
9,0,1,0,no,0.071429,no,0.071429,0.078024,no


In [16]:
# Calculate error
error = df['weights_norm_1'].where(df['game'] != df['pred-2']).sum()
error

np.float64(0.06370669049627475)

In [17]:
lr=1

imp_2 = lr*0.5*np.log(error/(1-error))
imp_2

np.float64(-1.343819601921041)

In [18]:
df['weights_new_2'] = np.where(df['game'] != df['pred-2'], 
                                  df['weights_norm_1'] * np.exp(imp_2),
                                  df['weights_norm_1']
                                  )


df['weights_norm_2'] = df['weights_new_2']/df['weights_new_2'].sum()
df

Unnamed: 0,sun,rain,wind,game,weights,pred-1,weights_new_1,weights_norm_1,pred-2,weights_new_2,weights_norm_2
0,1,1,0,yes,0.071429,no,0.029161,0.031853,no,0.008309,0.008719
1,1,1,0,yes,0.071429,no,0.029161,0.031853,no,0.008309,0.008719
2,0,1,1,no,0.071429,no,0.071429,0.078024,no,0.078024,0.08188
3,0,1,1,no,0.071429,no,0.071429,0.078024,no,0.078024,0.08188
4,1,0,0,yes,0.071429,yes,0.071429,0.078024,yes,0.078024,0.08188
5,1,0,0,yes,0.071429,yes,0.071429,0.078024,yes,0.078024,0.08188
6,1,0,0,yes,0.071429,yes,0.071429,0.078024,yes,0.078024,0.08188
7,1,0,1,yes,0.071429,yes,0.071429,0.078024,yes,0.078024,0.08188
8,1,0,1,yes,0.071429,yes,0.071429,0.078024,yes,0.078024,0.08188
9,0,1,0,no,0.071429,no,0.071429,0.078024,no,0.078024,0.08188


# Predict on unseen data

In [19]:
print(imp_1)
print(imp_2)

-0.8958797346140275
-1.343819601921041


In [None]:
# Is going the game take place if rain=0?
# Unseen data sample 
feature_name = 'rain'
feature_value = 0

X_unseen = {feature_name: feature_value}  

In [21]:
# Given split points and importances
split_1 = 0.5  # Split point for Tree 1 (feature1)
split_2 = 0.5  # Split point for Tree 2 (feature2)

alpha_1 = imp_1  # Importance of Tree 1
alpha_2 = imp_2  # Importance of Tree 2


# Tree 1 Prediction
h1 = 1 if X_unseen[feature_name] > split_1 else -1  

# Tree 2 Prediction
h2 = 1 if X_unseen[feature_name] > split_2 else -1  

# Compute weighted sum of predictions
F_x = alpha_1 * h1 + alpha_2 * h2

# Final classification
y_pred = np.sign(F_x)

print(f"Tree 1 Prediction: {h1}, Tree 2 Prediction: {h2}")
print(f"Weighted Sum: {F_x}")
print(f"Final Prediction: {y_pred}")


Tree 1 Prediction: -1, Tree 2 Prediction: -1
Weighted Sum: 2.2396993365350686
Final Prediction: 1.0


<img src="pics/final.png" style="width: 60%;"/>