<a href="https://colab.research.google.com/github/Vivek21704/Introduction-to-ml/blob/main/vivek.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
def gini_index(n_positive: int, n_negative: int) -> float:
    """Return the Gini impurity of a binary node.

    The impurity is ``1 - (p_pos**2 + p_neg**2)`` where ``p_pos`` and
    ``p_neg`` are the class proportions in the node.

    Args:
        n_positive: Number of positive samples in the node.
        n_negative: Number of negative samples in the node.

    Returns:
        The Gini impurity, between 0.0 (pure node) and 0.5 (even split).
        An empty node (both counts zero) is reported as 0.0.

    Raises:
        ValueError: If either count is negative.
    """
    if n_positive < 0 or n_negative < 0:
        raise ValueError("sample counts must be non-negative")

    total = n_positive + n_negative
    if total == 0:
        # An empty node has no class mix; returning 0.0 avoids a
        # ZeroDivisionError and contributes nothing to a weighted impurity.
        return 0.0

    p_pos = n_positive / total
    p_neg = n_negative / total
    return 1 - (p_pos ** 2 + p_neg ** 2)

# Worked example: a node holding 220 positive and 80 negative samples.
example_counts = (220, 80)
gini = gini_index(*example_counts)
print(f"Gini index: {gini:.4f}")

def gini_index(n_pos: int, n_neg: int) -> float:
    """Return the Gini impurity of a binary node.

    Computed as ``1 - (p_pos**2 + p_neg**2)`` from the class proportions.

    Args:
        n_pos: Number of positive samples in the node.
        n_neg: Number of negative samples in the node.

    Returns:
        The Gini impurity, between 0.0 (pure node) and 0.5 (even split).
        An empty node (both counts zero) is reported as 0.0.

    Raises:
        ValueError: If either count is negative.
    """
    if n_pos < 0 or n_neg < 0:
        raise ValueError("sample counts must be non-negative")

    n = n_pos + n_neg
    if n == 0:
        # Guard the division below: an empty child has no impurity to report.
        return 0.0

    p_pos = n_pos / n
    p_neg = n_neg / n
    return 1 - (p_pos**2 + p_neg**2)

# Child nodes, each described by its (positive, negative) sample counts.
left_counts = (90, 10)
right_counts = (100, 100)
left_gini  = gini_index(*left_counts)
right_gini = gini_index(*right_counts)

# Weighted Gini: every child's impurity is weighted by its share of the
# samples. The sizes are derived from the counts above so the weights cannot
# drift out of sync with the nodes they describe.
n_left = sum(left_counts)
n_right = sum(right_counts)
total = n_left + n_right
weighted_gini = (n_left / total) * left_gini + (n_right / total) * right_gini

print(f"Left  Gini: {left_gini:.3f}")
print(f"Right Gini: {right_gini:.3f}")
print(f"Weighted Gini after split: {weighted_gini:.3f}")


import numpy as np
import pandas as pd

# ------------- dataset -------------
# Eight samples: two features (x1, x2) and a numeric target (y).
x1 = np.arange(1, 9)                              # the integers 1..8
x2 = np.array((5, 6, 8, 10, 12, 15, 18, 20))      # unused by the first split
y = np.array((10, 12, 15, 18, 21, 25, 28, 30))    # regression target

# ------------- helper -------------
def sse(vec):
    """Return the sum of squared errors of `vec` about its own mean.

    Args:
        vec: Numeric values (NumPy array or any array-like such as a list).

    Returns:
        The sum of squared deviations from the mean as a float. An empty
        input yields 0.0.
    """
    values = np.asarray(vec, dtype=float)
    if values.size == 0:
        # The mean of an empty array is NaN (with a RuntimeWarning); an empty
        # node simply has no error to report.
        return 0.0
    return float(np.sum((values - values.mean()) ** 2))

# ------------- search every split on x1 -------------
# Position i splits the targets into y[:i] and y[i:]. This matches the
# reported threshold only when x1 is sorted in ascending order.
results = []
for i, (lower, upper) in enumerate(zip(x1[:-1], x1[1:]), start=1):
    split_point = 0.5 * (lower + upper)        # halfway between consecutive x1 values
    left_sse    = sse(y[:i])
    right_sse   = sse(y[i:])
    total_sse   = left_sse + right_sse
    results.append((split_point, left_sse, right_sse, total_sse))

# put everything in a DataFrame for neat printing
cols = ['split_at_x1', 'left_SSE', 'right_SSE', 'total_SSE']
raw_df = pd.DataFrame(results, columns=cols)
df     = raw_df.round(2)

# identify the best split on the unrounded totals, so two candidates that
# differ by less than the display precision are still ranked correctly;
# the rounded frame is only used for display
best_row = df.loc[raw_df['total_SSE'].idxmin()]

print(df.to_string(index=False))
print("\nBest split:")
print(best_row.to_string())



import numpy as np

# -------------------- 1.  DATA --------------------
# One row per sample, laid out as (x1, x2, y).
X = np.array([
    (1,  5, 10),
    (2,  6, 12),
    (3,  8, 15),
    (4, 10, 18),
    (5, 12, 21),
    (6, 15, 25),
    (7, 18, 28),
    (8, 20, 30),
])
# Transposing turns the three columns into rows that unpack directly.
x1, x2, y = X.T

# -------------------- 2. HELPERS ------------------
def sse(vec: np.ndarray) -> float:
    """Return the sum of squared errors of `vec` around its own mean.

    Args:
        vec: Numeric values; any array-like is accepted and converted to a
            float array.

    Returns:
        The sum of squared deviations from the mean as a plain float. An
        empty input yields 0.0.
    """
    values = np.asarray(vec, dtype=float)
    if values.size == 0:
        # Taking the mean of nothing gives NaN plus a RuntimeWarning; an
        # empty node contributes no error instead.
        return 0.0
    return float(np.sum((values - values.mean()) ** 2))

# -------------------- 3. SEARCH SPLITS ON x1 -----
# Candidate thresholds are the mid-points between consecutive *distinct*
# sorted x1 values. Using distinct sorted values (rather than index-adjacent
# ones) means every candidate leaves at least one sample on each side, even
# when x1 contains duplicates or is not stored in ascending order.
best_thresh, best_total_sse = None, np.inf
candidates = []

distinct_x1 = np.unique(x1)                   # sorted, duplicates removed
for lower, upper in zip(distinct_x1[:-1], distinct_x1[1:]):
    thresh = 0.5 * (lower + upper)
    left_mask = x1 <= thresh
    right_mask = ~left_mask
    total = sse(y[left_mask]) + sse(y[right_mask])
    candidates.append((thresh, total))
    # Strict '<' keeps the first (smallest) threshold when totals tie.
    if total < best_total_sse:
        best_thresh, best_total_sse = thresh, total

# -------------------- 4. BUILD THE SPLIT ---------
# Partition the samples at the winning threshold.
left_mask = x1 <= best_thresh
right_mask = np.logical_not(left_mask)

# Targets that fall into each child node.
y_left = y[left_mask]
y_right = y[right_mask]

# A regression leaf predicts the mean target of its samples.
pred_left = y_left.mean()
pred_right = y_right.mean()

# Remaining error inside each child.
left_sse = sse(y_left)
right_sse = sse(y_right)

# -------------------- 5. REPORT ------------------
# Print every candidate threshold, the winner, and a one-level tree summary.
print("Candidate thresholds and their total SSE:")
for t, tot in candidates:
    print(f"  x1 <= {t:4.1f} : total SSE = {tot:7.2f}")

print("\nBest threshold (minimum SSE):")
print(f"  x1 <= {best_thresh}   → total SSE = {best_total_sse:.2f}\n")

print("--- First‑split regression tree ---")
# The root sample count is taken from the data rather than hard-coded, so the
# report stays correct if the dataset changes.
print(f"Root: {len(y)} samples, prediction = {y.mean():.2f}")
print(f"├─ Left child  (x1 <= {best_thresh}): "
      f"{left_mask.sum()} samples, prediction = {pred_left:.2f}, node SSE = {left_sse:.2f}")
print(f"└─ Right child (x1 >  {best_thresh}): "
      f"{right_mask.sum()} samples, prediction = {pred_right:.2f}, node SSE = {right_sse:.2f}")
print(f"Total SSE after split = {left_sse + right_sse:.2f}")

Gini index: 0.3911
Left  Gini: 0.180
Right Gini: 0.500
Weighted Gini after split: 0.393
 split_at_x1  left_SSE  right_SSE  total_SSE
         1.5      0.00     271.43     271.43
         2.5      2.00     170.83     172.83
         3.5     12.67      97.20     109.87
         4.5     36.75      46.00      82.75
         5.5     78.80      12.67      91.47
         6.5    158.83       2.00     160.83
         7.5    265.71       0.00     265.71

Best split:
split_at_x1     4.50
left_SSE       36.75
right_SSE      46.00
total_SSE      82.75
Candidate thresholds and their total SSE:
  x1 <=  1.5 : total SSE =  271.43
  x1 <=  2.5 : total SSE =  172.83
  x1 <=  3.5 : total SSE =  109.87
  x1 <=  4.5 : total SSE =   82.75
  x1 <=  5.5 : total SSE =   91.47
  x1 <=  6.5 : total SSE =  160.83
  x1 <=  7.5 : total SSE =  265.71

Best threshold (minimum SSE):
  x1 <= 4.5   → total SSE = 82.75

--- First‑split regression tree ---
Root: 8 samples, prediction = 19.88
├─ Left child  (x1 <= 4.5): 4 