# Gini Index before splitting

In [16]:
# Class counts for the full, unsplit sample (the parent node)
pos_total, neg_total = 220, 80
n_total = 300

# Proportion of each class in the parent node
p_pos = pos_total / n_total
p_neg = neg_total / n_total

# Gini impurity of the parent: one minus the sum of squared class proportions
gini_before = 1 - sum(p ** 2 for p in (p_pos, p_neg))
gini_before

0.3911111111111112

# Gini Index after splitting into two subsets

In [17]:
# Class counts in the two child nodes produced by the split.
# NOTE(review): these children hold 190 positives and 110 negatives in total,
# whereas the "before splitting" cell uses 220 positives and 80 negatives.
# With mismatched totals the weighted Gini below is not comparable to
# gini_before (it even comes out larger) -- confirm which counts are intended.
pos_left, neg_left = 90, 10        # left child
n_left = 100
pos_right, neg_right = 100, 100    # right child
n_right = 200

# Left child: class proportions, then Gini impurity
p_pos_left = pos_left / n_left
p_neg_left = neg_left / n_left
gini_left = 1 - sum(p ** 2 for p in (p_pos_left, p_neg_left))

# Right child: class proportions, then Gini impurity
p_pos_right = pos_right / n_right
p_neg_right = neg_right / n_right
gini_right = 1 - sum(p ** 2 for p in (p_pos_right, p_neg_right))

# Post-split impurity: child impurities averaged with weights equal to each
# child's share of the samples
w_left = n_left / (n_left + n_right)
w_right = n_right / (n_left + n_right)
gini_weighted = w_left * gini_left + w_right * gini_right

gini_left, gini_right, gini_weighted

(0.17999999999999994, 0.5, 0.3933333333333333)

# Using SSE to determine the best split point across features X1 and X2

In [18]:
import pandas as pd

# Toy regression data: two candidate split features (X1, X2) and the target Y
df = pd.DataFrame(
    {
        "X1": list(range(1, 9)),
        "X2": [5, 6, 8, 10, 12, 15, 18, 20],
        "Y": [10, 12, 15, 18, 21, 25, 28, 30],
    }
)

# Total SSE of `Y` when the rows are split on `var` at threshold `t`
def total_sse(df, var, t):
    """Return the summed within-group squared error of ``Y`` for one split.

    Rows with ``df[var] <= t`` form the left group and rows with
    ``df[var] > t`` form the right group. For each group the squared
    deviations of ``Y`` from that group's own mean are summed, and the two
    sums are added together.

    Parameters
    ----------
    df : DataFrame containing the column named by ``var`` and a ``'Y'`` column.
    var : name of the column to split on.
    t : split threshold compared against ``df[var]``.

    Returns
    -------
    The left-group SSE plus the right-group SSE.
    """
    def _sse(y):
        # Squared deviations from the group's own mean, summed
        return ((y - y.mean()) ** 2).sum()

    # Two explicit comparisons rather than a negated mask, so a row whose
    # value satisfies neither comparison stays out of both groups.
    goes_left = df[var] <= t
    goes_right = df[var] > t
    return _sse(df.loc[goes_left, 'Y']) + _sse(df.loc[goes_right, 'Y'])

# Enumerate candidate splits: for each feature, every midpoint between two
# consecutive distinct sorted values, paired with the total SSE of that split
candidates = []
for feature in ('X1', 'X2'):
    levels = sorted(df[feature].unique())
    for lo, hi in zip(levels[:-1], levels[1:]):
        threshold = (lo + hi) / 2
        candidates.append((feature, threshold, total_sse(df, feature, threshold)))

# Pick the candidate with the smallest SSE; min() keeps the first minimal
# element, so ties go to the earliest candidate in the list
best_split = min(candidates, key=lambda cand: cand[2])
best_var, best_thresh, best_sse = best_split

# Report the winning split
print(f"Best split overall: {best_var} ≤ {best_thresh} (total SSE = {best_sse:.2f})")

Best split overall: X1 ≤ 4.5 (total SSE = 82.75)


# Constructing the First Regression‑Tree Split

In [19]:
# Row masks for the two leaves defined by the chosen split
left_mask = df[best_var] <= best_thresh
right_mask = df[best_var] > best_thresh

# Each leaf predicts the mean of Y over the rows that fall into it
y_left = df.loc[left_mask, 'Y'].mean()
y_right = df.loc[right_mask, 'Y'].mean()

# Print the resulting one-split decision rule
print(f" - If {best_var} ≤ {best_thresh}: predict ŷ = {y_left:.2f}")
print(f" - Else: predict ŷ = {y_right:.2f}")

 - If X1 ≤ 4.5: predict ŷ = 13.75
 - Else: predict ŷ = 26.00
