
# 03 — Simple Imputation: Mean/Median/Mode + Group-wise (6 min)

Demonstrate variance shrinkage and bias under MAR. Also add a missingness indicator column.


In [None]:

import numpy as np, pandas as pd
# No random draws happen in the code below; the seed is kept so that any
# later sampling in the same session stays reproducible.
np.random.seed(0)

# Toy data set: ten respondents whose Income is fully observed (the "truth").
df = pd.DataFrame({
    "ID": range(1,11),
    "Age":  [22,25,29,35,40,45,50,54,60,65],
    "Educ": [12,12,14,16,16,18,16,14,12,12],
    "Income":[25,28,32,40,48,55,62,60,50,45]
})

true_mean = df["Income"].mean()

# Induce MAR missingness
# The rows to blank out are hand-picked by ID (the three youngest respondents
# plus ID 8), so missingness depends on observed columns, not on chance.
missing_ids = [1,2,3,8]
is_missing = df["ID"].isin(missing_ids)
df_mar = df.copy()
# Missingness indicator: 1 where Income was removed, 0 where it is observed.
df_mar["Income_missing"] = is_missing.astype(int)
# Series.mask returns a float column with NaN at the flagged rows. Writing NaN
# into the int64 column in place relies on implicit upcasting, which newer
# pandas versions warn about or reject.
df_mar["Income"] = df_mar["Income"].mask(is_missing)

cc_mean = df_mar["Income"].dropna().mean()
print(f"Complete-case mean (biased): {cc_mean:.1f} vs truth {true_mean:.1f}")

# Mean imputation (global)
imp_mean = df_mar.copy()
mean_val = imp_mean["Income"].mean()  # NaNs are skipped, so this is the complete-case mean
imp_mean["Income"] = imp_mean["Income"].fillna(mean_val)

print(f"After mean impute: mean={imp_mean['Income'].mean():.1f}, sd={imp_mean['Income'].std(ddof=1):.2f}")

# Group-wise median by Educ
imp_grp = df_mar.copy()
grp_meds = imp_grp.groupby("Educ")["Income"].transform("median")
# A group whose Income is entirely missing (here Educ == 14: IDs 3 and 8) has
# a NaN median, so its rows would stay missing and silently drop out of the
# summary below. Fall back to the global observed median for those rows.
global_med = imp_grp["Income"].median()
imp_grp["Income"] = imp_grp["Income"].fillna(grp_meds).fillna(global_med)
print(f"Group-wise median impute: mean={imp_grp['Income'].mean():.1f}, sd={imp_grp['Income'].std(ddof=1):.2f}")

# ---- Classwork ----
# 1) Add 'Income_missing' as a feature and fit a simple linear regression predicting Income (after imputation).
# 2) Compare global mean vs group-wise median vs "carry truth" (the original Income values in df, which are known for every ID here) to quantify bias and variance changes.
# 3) Explain why adding the missingness indicator may help under MAR.
