
# 02 — Missingness Mechanisms: MCAR, MAR, MNAR 

We simulate missingness on the toy table (Age, Educ, Income) and compute the bias of the complete-case mean of Income under each mechanism.


In [None]:

import numpy as np
import pandas as pd

# Fix the global NumPy seed so the random MCAR draws below are repeatable.
np.random.seed(42)

# Toy dataset: ten fully observed respondents, so the true Income mean and SD
# are known before any values are deleted.
toy_columns = {
    "ID": range(1, 11),
    "Age": [22, 25, 29, 35, 40, 45, 50, 54, 60, 65],
    "Educ": [12, 12, 14, 16, 16, 18, 16, 14, 12, 12],
    "Income": [25, 28, 32, 40, 48, 55, 62, 60, 50, 45],
}
df = pd.DataFrame(toy_columns)

# Reference values that the complete-case estimates are compared against.
income = df["Income"]
true_mean = income.mean()
true_sd = income.std()  # pandas' default is the sample SD (ddof=1)
print(f"Truth — mean={true_mean:.1f}, sd={true_sd:.1f}")

def mcar(df, p=0.3, rng=None):
    """Delete Income completely at random (MCAR).

    Each row's Income is independently set to NaN with probability ``p``;
    the draw does not look at any column of the table.

    Args:
        df: Table with an ``Income`` column. It is copied, not modified.
        p: Per-row probability of deleting Income.
        rng: Optional ``numpy.random.Generator`` used for the draws. When
            omitted, the global ``np.random`` state is used, so results
            follow ``np.random.seed``.

    Returns:
        ``(m, mask)``: ``m`` is a copy of ``df`` with the selected Income
        values replaced by NaN, and ``mask`` is a boolean NumPy array that is
        True for the rows that were blanked.
    """
    m = df.copy()
    # NaN cannot be stored in an integer column. Cast explicitly instead of
    # relying on the implicit int -> float upcast on assignment, which pandas
    # has deprecated.
    m["Income"] = m["Income"].astype(float)
    draws = np.random.rand(len(m)) if rng is None else rng.random(len(m))
    mask = draws < p
    m.loc[mask, "Income"] = np.nan
    return m, mask

def mar(df):
    """Delete Income for a fixed set of IDs to mimic missing-at-random (MAR).

    The rows with ID 1, 2, 3 and 8 lose their Income value. In the notebook's
    toy table these are the three respondents with Age < 30 plus one older
    respondent, so missingness tracks the observed Age column.

    Args:
        df: Table with ``ID`` and ``Income`` columns. It is copied, not
            modified.

    Returns:
        ``(m, mask)``: ``m`` is a copy of ``df`` with the selected Income
        values replaced by NaN, and ``mask`` is a boolean NumPy array that is
        True for the rows that were blanked.
    """
    m = df.copy()
    # NaN cannot be stored in an integer column. Cast explicitly instead of
    # relying on the implicit int -> float upcast on assignment, which pandas
    # has deprecated.
    m["Income"] = m["Income"].astype(float)
    # higher missingness for Age < 30 + one older
    missing_ids = [1, 2, 3, 8]
    is_missing = m["ID"].isin(missing_ids)
    m.loc[is_missing, "Income"] = np.nan
    return m, is_missing.to_numpy()

def mnar(df):
    """Delete Income for a fixed set of IDs to mimic missing-not-at-random (MNAR).

    The rows with ID 1, 2, 3 and 4 lose their Income value. In the notebook's
    toy table these are the three lowest incomes plus one mid-range income
    (ID 4), so missingness depends on the Income value itself.

    Args:
        df: Table with ``ID`` and ``Income`` columns. It is copied, not
            modified.

    Returns:
        ``(m, mask)``: ``m`` is a copy of ``df`` with the selected Income
        values replaced by NaN, and ``mask`` is a boolean NumPy array that is
        True for the rows that were blanked.
    """
    m = df.copy()
    # NaN cannot be stored in an integer column. Cast explicitly instead of
    # relying on the implicit int -> float upcast on assignment, which pandas
    # has deprecated.
    m["Income"] = m["Income"].astype(float)
    # low incomes self-suppress + one mid (ID 4)
    missing_ids = [1, 2, 3, 4]
    is_missing = m["ID"].isin(missing_ids)
    m.loc[is_missing, "Income"] = np.nan
    return m, is_missing.to_numpy()

# Apply each missingness mechanism to the toy table and report how far the
# complete-case (listwise-deletion) mean of Income lands from the true mean.
mechanisms = {"MCAR": mcar, "MAR": mar, "MNAR": mnar}

for name, gen in mechanisms.items():
    m, mask = gen(df)
    # Complete cases: keep only the rows whose Income is still observed.
    cc = m[m["Income"].notna()]
    est = cc["Income"].mean()
    bias = est - true_mean
    print(f"{name}: complete-case mean={est:.1f}, bias={bias:+.1f}, n_drop={mask.sum()}")

# ---- Classwork ----
# A) Change MCAR drop rate p to 0.1, 0.3, 0.6; record bias distribution over 200 runs.
# B) For MAR, compute correlation between missingness and Age; explain why deletion is biased.
# C) For MNAR, propose a sensitivity delta (Δ) that adjusts imputed low incomes downward and re-estimate the mean.
