<a href="https://colab.research.google.com/github/lizzietsitsishvili/ECON3916-Statistical-Machine-Learning/blob/main/Lab%206/%5BLab_6%5D_The_Architecture_of_Bias.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import seaborn as sns
import pandas as pd
import numpy as np

# --- Step 1: load the population ---------------------------------------------
# The complete Titanic table plays the role of the "population" whose survival
# rate every later sample is compared against.
df = sns.load_dataset('titanic')
print(f"Total Population: {len(df)}")
print(f"Population Survival Rate: {df['survived'].mean():.4f}")

# --- Step 2: shuffle row positions by hand (simulated sampling) ---------------
# The legacy global NumPy RNG is seeded so the lesson's permutation is
# repeatable; in production this sampling variance occurs on its own.
SHUFFLE_SEED = 2026
np.random.seed(SHUFFLE_SEED)
n_rows = df.shape[0]
indices = np.random.permutation(n_rows)

Total Population: 891
Population Survival Rate: 0.3838


In [8]:
# Inspect the shuffled row positions produced by np.random.permutation(len(df)).
indices

array([747, 734, 871, 242, 527, 885, 121, 520, 303, 167, 203, 263, 260,
       567,  92, 494, 574, 738, 309, 545, 851, 218, 358, 664, 479, 237,
       442, 134, 559, 705, 101, 212, 408, 863, 250, 506, 283, 179,  74,
       639, 313,  16, 610, 504, 233, 542, 405, 831, 498, 466, 810, 298,
       426, 206, 427, 507,  56, 641, 872, 796,  52, 103, 148, 865, 323,
       197, 510, 317, 737, 839,   6, 374, 146, 253, 844, 634, 618, 330,
       173, 825, 389, 243,  39, 779, 696, 501,  18, 751, 882,   5, 123,
       760, 687, 593, 695, 274, 553,   0, 159, 730, 422, 602,  51, 402,
       461, 325, 151, 535,  26, 380, 855, 525, 670, 693, 764, 141, 650,
        57, 457, 293, 372,  88, 443, 377, 489, 561, 321, 879, 262,  62,
       643, 615, 522, 883, 714, 811, 596, 284, 172, 232, 236, 220, 439,
       625, 556, 716, 211,   2,  30, 688, 199, 712, 647, 409, 315, 452,
       866, 305, 793, 573, 732, 198, 791, 279, 757, 663, 539, 354, 360,
        71, 674,  60, 105, 118, 588, 807, 285, 114,  76, 890, 33

In [12]:
# 3. Cut the deck (80/20 Split)
# int() truncates, so the training slice gets floor(0.8 * N) rows and the
# test slice receives whatever remains.
split_point = int(len(df) * 0.8)

# Slicing the shuffled indices
train_idx = indices[:split_point]
test_idx = indices[split_point:]

# Creating the subsets
# `indices` holds row *positions* (a permutation of range(len(df))), so select
# with .iloc; .loc is label-based and would only agree while df keeps its
# default 0..N-1 index.
train_set = df.iloc[train_idx]
test_set = df.iloc[test_idx]

# Sanity check: the two subsets must partition the population exactly.
assert len(train_set) + len(test_set) == len(df), "train/test split lost or duplicated rows"

# 4. Bias Check (The Delta)
# Absolute gap between the two survival rates; a large gap means the random
# split produced train and test sets that are not representative of each other.
train_surv = train_set['survived'].mean()
test_surv = test_set['survived'].mean()
delta = abs(train_surv - test_surv)

print(f"Train Survival Rate: {train_surv:.4f}")
print(f"Test Survival Rate:  {test_surv:.4f}")
print(f"Sampling Bias (Delta): {delta:.4f}")

Train Survival Rate: 0.3736
Test Survival Rate:  0.4246
Sampling Bias (Delta): 0.0510


In [13]:
# Pairwise correlation matrix; numeric_only=True skips the non-numeric columns
# so the call does not fail on string/categorical data.
df.corr(numeric_only=True)

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,adult_male,alone
survived,1.0,-0.338481,-0.077221,-0.035322,0.081629,0.257307,-0.55708,-0.203367
pclass,-0.338481,1.0,-0.369226,0.083081,0.018443,-0.5495,0.094035,0.135207
age,-0.077221,-0.369226,1.0,-0.308247,-0.189119,0.096067,0.280328,0.19827
sibsp,-0.035322,0.083081,-0.308247,1.0,0.414838,0.159651,-0.253586,-0.584471
parch,0.081629,0.018443,-0.189119,0.414838,1.0,0.216225,-0.349943,-0.583398
fare,0.257307,-0.5495,0.096067,0.159651,0.216225,1.0,-0.182024,-0.271832
adult_male,-0.55708,0.094035,0.280328,-0.253586,-0.349943,-0.182024,1.0,0.404744
alone,-0.203367,0.135207,0.19827,-0.584471,-0.583398,-0.271832,0.404744,1.0


In [15]:
from sklearn.model_selection import train_test_split

# Stratifying on 'pclass' makes both partitions carry (nearly) the same
# passenger-class proportions as the full data set.
X_train, X_test = train_test_split(
    df,
    test_size=0.2,
    stratify=df['pclass'],
    random_state=42,
)

# Normalised class frequencies for each partition, computed before printing
train_class_dist = X_train['pclass'].value_counts(normalize=True)
test_class_dist = X_test['pclass'].value_counts(normalize=True)

print("\n--- Stratified Split ---")
print("Train Class Dist:\n", train_class_dist)
print("Test Class Dist:\n", test_class_dist)


--- Stratified Split ---
Train Class Dist:
 pclass
3    0.550562
1    0.242978
2    0.206461
Name: proportion, dtype: float64
Test Class Dist:
 pclass
3    0.553073
1    0.240223
2    0.206704
Name: proportion, dtype: float64


In [17]:
from scipy.stats import chisquare

# Sample Ratio Mismatch (SRM) check: compare the realised group sizes with the
# planned allocation using a chi-square goodness-of-fit test.
observed = [450, 550]  # [Control, Treatment]
expected = [500, 500]  # planned 50/50 split out of 1000

# chisquare returns a result object carrying both the statistic and its p-value
srm_result = chisquare(f_obs=observed, f_exp=expected)
chi2_stat = srm_result.statistic
p_value = srm_result.pvalue

print(f"Chi-square statistic: {chi2_stat:.6f}")
print(f"p-value: {p_value:.6f}")

# SRM is flagged only when the p-value falls strictly below the 0.01 threshold
srm_detected = p_value < 0.01
if not srm_detected:
    print("Variance is within natural limits.")
else:
    print("CRITICAL FAILURE: Sample Ratio Mismatch (SRM) Detected. Check Load Balancer.")


Chi-square statistic: 10.000000
p-value: 0.001565
CRITICAL FAILURE: Sample Ratio Mismatch (SRM) Detected. Check Load Balancer.
