<a href="https://colab.research.google.com/github/maggie20041027-svg/ECON3916-Statistical-Machine-Learning/blob/main/Lab%206/Lab_6_The_Architecture_of_Bias.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
import seaborn as sns
import pandas as pd
import numpy as np

# 1. Data ingestion: the complete Titanic table plays the role of the
#    "population" against which every later sample is compared.
df = sns.load_dataset('titanic')
population_size = len(df)
population_survival = df['survived'].mean()
print(f"Total Population: {population_size}")
print(f"Population Survival Rate: {population_survival:.4f}")

# 2. Manual shuffle (simulated sampling): draw a random ordering of the row
#    positions. The seed is pinned only so the lesson is reproducible; an
#    unseeded draw would give a different ordering on every run.
np.random.seed(2026)
indices = np.random.permutation(population_size)

Total Population: 891
Population Survival Rate: 0.3838


In [12]:
# 3. Cut the deck (80/20 split): the first 80% of the shuffled row positions
#    go to training, everything after the cut goes to testing.
split_point = int(len(df) * 0.8)
train_idx, test_idx = indices[:split_point], indices[split_point:]

# Build the two subsets by positional lookup into the population frame.
train_set = df.iloc[train_idx]
test_set = df.iloc[test_idx]

# 4. Bias check (the delta): absolute gap between the survival rate seen in
#    the training rows and the one seen in the held-out rows.
train_surv, test_surv = (
    subset['survived'].mean() for subset in (train_set, test_set)
)
delta = abs(train_surv - test_surv)

print(f"Train Survival Rate: {train_surv:.4f}")
print(f"Test Survival Rate:  {test_surv:.4f}")
print(f"Sampling Bias (Delta): {delta:.4f}")

Train Survival Rate: 0.3736
Test Survival Rate:  0.4246
Sampling Bias (Delta): 0.0510


In [13]:
from sklearn.model_selection import train_test_split

# Stratifying on 'pclass' asks the splitter to keep the passenger-class mix
# of the train and test partitions as close to each other as the row counts
# allow.
X_train, X_test = train_test_split(
    df,
    test_size=0.2,
    random_state=2026,
    stratify=df['pclass'],
)

# Share of each passenger class inside each partition.
train_class_dist = X_train['pclass'].value_counts(normalize=True)
test_class_dist = X_test['pclass'].value_counts(normalize=True)

print("\n--- Stratified Split ---")
print("Train Class Dist:\n", train_class_dist)
print("Test Class Dist:\n", test_class_dist)


--- Stratified Split ---
Train Class Dist:
 pclass
3    0.550562
1    0.242978
2    0.206461
Name: proportion, dtype: float64
Test Class Dist:
 pclass
3    0.553073
1    0.240223
2    0.206704
Name: proportion, dtype: float64


In [14]:
import numpy as np
from scipy.stats import chisquare

# A/B Test Sample Ratio Mismatch (SRM) Forensic Check
#
# Tests whether the observed Control/Treatment head-counts are compatible with
# the designed 50/50 allocation (chi-square goodness of fit), explains how
# large the deviation is, and cross-checks the p-value with a binomial
# simulation. Every number quoted in the narrative is derived from `observed`
# and `expected`, so the text cannot drift away from the data.
print("=" * 70)
print("A/B Test Forensic Check: Sample Ratio Mismatch Detection")
print("=" * 70)
print()

# 1. Define the observed and expected arrays
observed = np.array([450, 550])  # [Control, Treatment]
expected = np.array([500, 500])  # Expected 50/50 split
total_users = int(observed.sum())  # derived, so it always matches `observed`
# The chi-square goodness-of-fit test is only meaningful when both arrays
# describe the same number of users.
assert int(expected.sum()) == total_users, "expected counts must sum to the observed total"

design_share = 0.5  # designed probability of landing in Control
srm_alpha = 0.01    # significance threshold used to declare an SRM

print(f"Total Users: {total_users}")
print(f"Observed:    Control = {observed[0]}, Treatment = {observed[1]}")
print(f"Expected:    Control = {expected[0]}, Treatment = {expected[1]}")
print(f"Split Ratio: {observed[0]/total_users:.1%} / {observed[1]/total_users:.1%}")
print()

# 2. Calculate the Chi-Square statistic and p-value
chi2_stat, p_value = chisquare(f_obs=observed, f_exp=expected)
srm_detected = bool(p_value < srm_alpha)

print(f"Chi-Square Statistic: {chi2_stat:.4f}")
print(f"P-value:              {p_value:.6f}")
print()

# 3. Print conclusion
print("=" * 70)
print("CONCLUSION")
print("=" * 70)
if srm_detected:
    verdict = "🚨 CRITICAL FAILURE: Sample Ratio Mismatch (SRM) Detected."
    print(verdict)
    print("   Check Load Balancer.")
else:
    verdict = "✓ Variance is within natural limits."
    print(verdict)
print()

# Quantities reused by the explanation and by the simulation below.
deviation = abs(observed[0] - expected[0])     # users off target, per group
relative_deviation = deviation / expected[0]   # the same, as a share of the target group size
# Standard deviation of the Control head-count under a fair binomial split.
sd_users = np.sqrt(total_users * design_share * (1 - design_share))

# Additional context. Only shown when the test actually flags an SRM;
# otherwise the headline below would contradict the conclusion above.
if srm_detected:
    print("=" * 70)
    print(f"WHY {observed[1]}/{observed[0]} IS NOT JUST 'BAD LUCK'")
    print("=" * 70)
    print()
    print("1. STATISTICAL SIGNIFICANCE:")
    print(f"   The p-value of {p_value:.6f} means there's only a {p_value*100:.4f}% chance")
    print("   of seeing this extreme split (or worse) by random chance alone.")
    print()
    print("2. MAGNITUDE OF DEVIATION:")
    print(f"   You're off by {deviation} users per group ({relative_deviation:.0%} deviation).")
    print(f"   With {total_users} users, random chance typically produces deviations of ±{sd_users / expected[0]:.0%}.")
    print()
    print("3. EXPECTED RANDOMIZATION VARIANCE:")
    print(f"   In a true 50/50 random split with {total_users} users, the standard error is:")
    print(f"   SE = sqrt(n * p * (1-p)) = sqrt({total_users} * {design_share} * {1 - design_share}) ≈ {sd_users:.1f} users")
    print(f"   Your deviation is {deviation / sd_users:.1f} standard errors away.")
    print()
    print("4. PRACTICAL IMPLICATIONS:")
    print(f"   - A {relative_deviation:.0%} imbalance can bias your treatment effect estimates")
    print("   - If groups differ systematically (not just in size), your results are invalid")
    print("   - Common causes: faulty randomization, bot traffic, data pipeline errors")
    print()

# Simulation: What does "normal variance" look like?
print("=" * 70)
print("SIMULATION: What does random variance actually look like?")
print("=" * 70)
np.random.seed(42)
n_simulations = 10000
control_counts = np.random.binomial(n=total_users, p=design_share, size=n_simulations)

# Count how many simulated splits are at least as far from the target as the
# observed one, in either direction (two-sided, matching the chi-square test).
extreme_splits = np.sum(np.abs(control_counts - expected[0]) >= deviation)
probability = extreme_splits / n_simulations

print(f"In {n_simulations:,} simulated 50/50 random splits:")
print(f"Splits as extreme as {observed[0]}/{observed[1]}: {extreme_splits} times ({probability*100:.2f}%)")
print()
if srm_detected:
    print("This confirms the Chi-Square result: such an imbalance is highly unlikely")
    print("to occur by chance and indicates a systematic problem.")
else:
    print("This agrees with the Chi-Square result: an imbalance of this size is")
    print("consistent with ordinary random variation.")

A/B Test Forensic Check: Sample Ratio Mismatch Detection

Total Users: 1000
Observed:    Control = 450, Treatment = 550
Expected:    Control = 500, Treatment = 500
Split Ratio: 45.0% / 55.0%

Chi-Square Statistic: 10.0000
P-value:              0.001565

CONCLUSION
🚨 CRITICAL FAILURE: Sample Ratio Mismatch (SRM) Detected.
   Check Load Balancer.

WHY 550/450 IS NOT JUST 'BAD LUCK'

1. STATISTICAL SIGNIFICANCE:
   The p-value of 0.001565 means there's only a 0.1565% chance
   of seeing this extreme split (or worse) by random chance alone.

2. MAGNITUDE OF DEVIATION:
   You're off by 50 users per group (10% deviation).
   With 1000 users, random chance typically produces deviations of ±3%.

3. EXPECTED RANDOMIZATION VARIANCE:
   In a true 50/50 random split with 1000 users, the standard error is:
   SE = sqrt(n * p * (1-p)) = sqrt(1000 * 0.5 * 0.5) ≈ 15.8 users
   Your deviation is 3.2 standard errors away.

4. PRACTICAL IMPLICATIONS:
   - A 10% imbalance can bias your treatment eff