
# 06 — Multiple Imputation by Chained Equations (MICE) with IterativeImputer 

Run `m=5` multiple imputations using scikit-learn's `IterativeImputer`, then combine the estimates with Rubin-style pooling
(pooled mean = average of the per-imputation means; total variance = within + (1 + 1/m) × between).


In [None]:

import numpy as np, pandas as pd
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer

# Seed NumPy's global RNG so any later use of np.random in this notebook is reproducible.
np.random.seed(7)

# Small toy dataset. Income is created as float64 on purpose: NaN cannot be
# stored in an integer column, and writing NaN into an int64 column relies on
# silent upcasting, which newer pandas versions warn about / reject.
df = pd.DataFrame({
    "ID": range(1, 11),
    "Age": [22, 25, 29, 35, 40, 45, 50, 54, 60, 65],
    "Educ": [12, 12, 14, 16, 16, 18, 16, 14, 12, 12],
    "Income": np.array([25, 28, 32, 40, 48, 55, 62, 60, 50, 45], dtype="float64"),
})

# Mean of the fully observed Income, kept as the benchmark before any value is removed.
truth_mean = df["Income"].mean()

# MAR missing: blank out Income for a fixed set of respondents (selected by ID).
missing_ids = [1, 2, 3, 8]
df.loc[df["ID"].isin(missing_ids), "Income"] = np.nan

M = 5            # number of imputed (completed) datasets
means = []       # per-imputation estimate: mean of the completed Income column
variances = []   # per-imputation variance of that estimate (squared standard error)

# The input matrix is identical for every imputation, so build it once.
# Column order matters: Income is column 2 and is read back by position below.
X = df[["Age","Educ","Income"]].values

for m in range(M):
    # sample_posterior=True draws imputed values instead of plugging in the
    # point prediction; a different random_state per run makes the M completed
    # datasets differ from one another.
    imp = IterativeImputer(random_state=100+m, sample_posterior=True, max_iter=10)
    Xi = imp.fit_transform(X)
    income_i = Xi[:,2]
    means.append(income_i.mean())
    # Rubin's rules pool the variance of the ESTIMATOR, not of the raw data:
    # for a sample mean that is s^2 / n.
    variances.append(income_i.var(ddof=1) / income_i.size)

pooled_mean = np.mean(means)
W = np.mean(variances)  # within-imputation variance (average squared SE)
# Between-imputation variance needs at least two imputations; with M == 1 the
# ddof=1 variance is undefined, so treat it as zero.
B = np.var(means, ddof=1) if M > 1 else 0.0
T = W + (1 + 1/M)*B      # total variance of the pooled mean (Rubin's rules)

print(f"Truth mean={truth_mean:.2f}")
print(f"Pooled mean={pooled_mean:.2f}")
print(f"Within={W:.2f}, Between={B:.4f}, Total={T:.2f}")

# ---- Classwork ----
# A) Change M to 10. How do Between and Total variance change?
# B) Turn off sample_posterior and observe variance collapse. Explain why.
# C) Add 'Age^2' as a predictor in the imputation model and re-run.
