In [2]:
import math

def calculate_gini(positive, negative):
    """Return the Gini impurity of a node holding two classes.

    Parameters
    ----------
    positive : number
        Count of positive-class samples in the node.
    negative : number
        Count of negative-class samples in the node.

    Returns
    -------
    number
        ``1 - (p_pos**2 + p_neg**2)`` where each ``p`` is the class share of
        the node, or ``0`` when the node holds no samples at all.
    """
    sample_count = positive + negative
    if sample_count == 0:
        # An empty node has no class shares to compute; report it as 0.
        return 0

    class_shares = (positive / sample_count, negative / sample_count)
    # Impurity is one minus the sum of squared class shares.
    return 1 - sum(share**2 for share in class_shares)


# --- Part a: impurity of the parent node before any split -------------------
original_positive = 220
original_negative = 80
gini_before_split = calculate_gini(original_positive, original_negative)

print(f"a. Gini Index before splitting: {gini_before_split:.4f}")

# --- Part b: impurity of each child and the size-weighted average -----------
# NOTE(review): the child counts below add up to 190 positive / 20 negative
# (210 samples), while the parent above has 220 positive / 80 negative
# (300 samples). Confirm the intended counts against the problem statement;
# the weights below are deliberately based on the children's own total.
left_positive = 90
left_negative = 10
total_left = left_positive + left_negative
gini_left = calculate_gini(left_positive, left_negative)
print(f"   Gini for Left subset: {gini_left:.4f}")

right_positive = 100
right_negative = 10
total_right = right_positive + right_negative
gini_right = calculate_gini(right_positive, right_negative)
print(f"   Gini for Right subset: {gini_right:.4f}")

total_samples_in_split = total_left + total_right

# Each child's impurity is weighted by the fraction of split samples it holds.
weighted_gini_after_split = (total_left / total_samples_in_split) * gini_left + \
                            (total_right / total_samples_in_split) * gini_right

print(f"b. Weighted Gini Index after splitting: {weighted_gini_after_split:.4f}")

# Check the tie case first and with a tolerance: both values are floats, so an
# exact equality test would treat rounding noise as a real change in purity.
if math.isclose(weighted_gini_after_split, gini_before_split, rel_tol=1e-9, abs_tol=1e-12):
    print(f"   The split does not change purity, as {weighted_gini_after_split:.4f} == {gini_before_split:.4f}.")
elif weighted_gini_after_split < gini_before_split:
    print(f"   The split improves purity, as {weighted_gini_after_split:.4f} < {gini_before_split:.4f}.")
else:
    print(f"   The split does NOT improve purity, as {weighted_gini_after_split:.4f} > {gini_before_split:.4f}.")



a. Gini Index before splitting: 0.3911
   Gini for Left subset: 0.1800
   Gini for Right subset: 0.1653
b. Weighted Gini Index after splitting: 0.1723
   The split improves purity, as 0.1723 < 0.3911.


In [4]:
import pandas as pd
import numpy as np

def calculate_sse(y_values):
    """Return the sum of squared errors (SSE) of ``y_values`` around their mean.

    Parameters
    ----------
    y_values : array-like of numbers
        Target values of the samples in one node (list, tuple, NumPy array
        or pandas Series).

    Returns
    -------
    float
        ``sum((y - mean(y)) ** 2)`` over all values; ``0.0`` when
        ``y_values`` is empty.
    """
    # Convert once so the arithmetic below runs vectorised instead of
    # building a Python list of per-element squares.
    values = np.asarray(y_values, dtype=float)
    if values.size == 0:
        # An empty node contributes no error; returning early also avoids
        # taking the mean of an empty array.
        return 0.0

    deviations = values - values.mean()
    return np.sum(deviations**2)


# Toy regression dataset. X1 and X2 are the candidate split features and Y is
# the target; T is not used below (presumably a row id -- confirm).
data = {
    'T': [1, 2, 3, 4, 5, 6, 7, 8],
    'X1': [1, 2, 3, 4, 5, 6, 7, 8],
    'X2': [5, 6, 8, 10, 12, 15, 18, 20],
    'Y': [10, 12, 15, 18, 21, 25, 28, 30]
}
df = pd.DataFrame(data)


# Baseline: error of predicting the overall mean for every sample.
initial_sse_root = calculate_sse(df['Y'].tolist())
print(f"Initial SSE for the root node (all data): {initial_sse_root:.2f}\n")

# Running record of the best split seen so far.
best_sse = float('inf')
best_split_variable = None
best_split_point = None
best_left_node = None
best_right_node = None

for col in ['X1', 'X2']:
    print(f"Evaluating splits for variable: {col}")
    unique_values = sorted(df[col].unique())

    # Candidate thresholds are the midpoints between consecutive distinct values.
    split_points = [(lower + upper) / 2 for lower, upper in zip(unique_values, unique_values[1:])]

    for split_point in split_points:
        left_node_df = df[df[col] <= split_point]
        right_node_df = df[df[col] > split_point]

        sse_left = calculate_sse(left_node_df['Y'].tolist())
        sse_right = calculate_sse(right_node_df['Y'].tolist())

        total_sse_current_split = sse_left + sse_right

        # Strict "<" keeps the first split encountered when totals tie.
        if total_sse_current_split < best_sse:
            best_sse = total_sse_current_split
            best_split_variable = col
            best_split_point = split_point
            best_left_node = left_node_df
            best_right_node = right_node_df


def _report_child_node(label, node_df):
    """Print the rows, Y values, mean prediction and SSE of one child node.

    ``label`` is the side name inserted into the headings ("Left" or "Right").
    Returns the node's mean Y, or 0 when the node is empty.
    """
    print(f"\n   {label} Child Node:")
    print(f"     Data Points (X1, X2, Y):\n{node_df[['X1', 'X2', 'Y']]}")
    print(f"     Y values: {node_df['Y'].tolist()}")
    mean_y = np.mean(node_df['Y']) if not node_df.empty else 0
    print(f"     Mean Y (prediction for this node): {mean_y:.2f}")
    print(f"     SSE for {label} Node: {calculate_sse(node_df['Y'].tolist()):.2f}")
    return mean_y


if best_split_variable is None:
    # No candidate was accepted, so there are no child nodes to describe.
    print("No valid split found among the candidate features.")
else:
    print(f"a. The best splitting point is: {best_split_variable} <= {best_split_point}")
    print(f"   Minimum Total SSE for this split: {best_sse:.2f}")

    print("\nb. Constructing the first split of the regression tree:")
    print(f"   Splitting Rule: {best_split_variable} <= {best_split_point}")

    mean_y_left = _report_child_node("Left", best_left_node)
    mean_y_right = _report_child_node("Right", best_right_node)



Initial SSE for the root node (all data): 382.88

Evaluating splits for variable: X1
Evaluating splits for variable: X2
a. The best splitting point is: X1 <= 4.5
   Minimum Total SSE for this split: 82.75

b. Constructing the first split of the regression tree:
   Splitting Rule: X1 <= 4.5

   Left Child Node:
     Data Points (X1, X2, Y):
   X1  X2   Y
0   1   5  10
1   2   6  12
2   3   8  15
3   4  10  18
     Y values: [10, 12, 15, 18]
     Mean Y (prediction for this node): 13.75
     SSE for Left Node: 36.75

   Right Child Node:
     Data Points (X1, X2, Y):
   X1  X2   Y
4   5  12  21
5   6  15  25
6   7  18  28
7   8  20  30
     Y values: [21, 25, 28, 30]
     Mean Y (prediction for this node): 26.00
     SSE for Right Node: 46.00
