
# 05 — Regression Imputation + Stochastic Residuals 

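Fit a linear regression Income ~ Age + Educ on the complete rows. Impute each missing Income as the model's prediction plus a random draw from N(0, σ²), where σ is the standard deviation of the training residuals, so that the imputed values preserve variance.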


In [None]:

import numpy as np, pandas as pd
from sklearn.linear_model import LinearRegression

# Seed NumPy's global generator so the noise draws below are reproducible.
np.random.seed(13)

# Toy data set: ten respondents with age, years of education and income.
df = pd.DataFrame(
    {
        "ID": range(1, 11),
        "Age": [22, 25, 29, 35, 40, 45, 50, 54, 60, 65],
        "Educ": [12, 12, 14, 16, 16, 18, 16, 14, 12, 12],
        "Income": [25, 28, 32, 40, 48, 55, 62, 60, 50, 45],
    }
)

# Keep the fully observed table, then blank out Income for the chosen IDs.
truth = df.copy()
missing_ids = [1, 2, 3, 8]
df.loc[df["ID"].isin(missing_ids), "Income"] = np.nan

features = ["Age", "Educ"]
missing_mask = df["Income"].isna().to_numpy()

# Fit the regression on rows whose Income is observed.
train = df.dropna(subset=["Income"])
Xtr = train[features].values
ytr = train["Income"].values

lr = LinearRegression()
lr.fit(Xtr, ytr)

# Predictions for every row; only the ones at missing positions are used.
pred = lr.predict(df[features].values)

# Spread of the training residuals sets the scale of the injected noise.
# NOTE(review): ddof=1 does not account for the three fitted parameters
# (intercept + two slopes); a residual standard error would use ddof=3.
# Confirm which is intended.
resid = ytr - lr.predict(Xtr)
sigma = np.std(resid, ddof=1)

# One normal draw per missing row, taken in row order, added to the prediction.
noise = np.random.normal(0, sigma, size=int(missing_mask.sum()))
imputed = df["Income"].copy()
imputed.loc[missing_mask] = pred[missing_mask] + noise

df["Income_imp"] = imputed

print("Coeffs:", lr.intercept_, lr.coef_)
report_cols = ["ID", "Age", "Educ", "Income", "Income_imp"]
print(df[report_cols].round(2))

# ---- Classwork ----
# 1) Repeat imputation 50 times (different seeds) and compute pooled mean and variance of Income (Rubin-style intuition).
# 2) Compare single deterministic regression vs stochastic regression imputation on SD preservation.
