
# 04 — kNN Imputation (6 min)

Use scikit-learn's `KNNImputer` on the MAR setting. Tune `n_neighbors` and the distance weighting (`weights`) or metric, then evaluate the imputation MAE on the entries that were masked out.


In [None]:

import numpy as np, pandas as pd
from sklearn.impute import KNNImputer
from sklearn.metrics import mean_absolute_error

# Seed NumPy's global RNG. Nothing in this cell draws random numbers yet;
# the seed keeps any randomness added later (e.g. in the classwork) reproducible.
np.random.seed(1)

# Toy dataset. Income is stored as float64 from the start because NaN cannot
# live in an integer column: assigning NaN into an int64 column relies on
# silent upcasting, which recent pandas releases deprecate.
df = pd.DataFrame({
    "ID": range(1, 11),
    "Age": [22, 25, 29, 35, 40, 45, 50, 54, 60, 65],
    "Educ": [12, 12, 14, 16, 16, 18, 16, 14, 12, 12],
    "Income": [25, 28, 32, 40, 48, 55, 62, 60, 50, 45],
}).astype({"Income": "float64"})

# Keep ground truth for MAE (copied before any value is blanked out)
truth = df.copy()

# Induce MAR missingness: blank out Income for a fixed set of IDs
missing_ids = [1, 2, 3, 8]
df.loc[df["ID"].isin(missing_ids), "Income"] = np.nan

# Use only Age, Educ, Income in the matrix (ID is an identifier, not a feature)
X = df[["Age", "Educ", "Income"]].to_numpy()

# Single source of truth for k, so the printed label always matches the model
n_neighbors = 3
imputer = KNNImputer(n_neighbors=n_neighbors, weights="distance")
X_imp = imputer.fit_transform(X)

# Reuse df's index so rows of df_imp stay aligned with df and truth
df_imp = pd.DataFrame(X_imp, columns=["Age", "Educ", "Income_imp"], index=df.index)

# Evaluate on missing positions only (rows whose Income was set to NaN above)
mask = df["Income"].isna().to_numpy()
mae = mean_absolute_error(truth.loc[mask, "Income"], df_imp.loc[mask, "Income_imp"])
print(f"MAE on imputed Income (k={n_neighbors}): {mae:.2f}")
print(df_imp.round(2))

# ---- Classwork ----
# A) Try k = 1, 3, 5, 7. Report MAE for each and pick the best.
# B) Normalize features (Age, Educ) to comparable scales before kNN; does MAE improve?
# C) Add a synthetic feature (Age^2). Does it help or hurt kNN imputation?
