
# 10 — Model-Aware Pipeline (No Leakage) with CV (8–9 min)

Train a small model predicting `Income` from `Age`, `Educ`, and `Region`, with the imputers fitted inside a pipeline.
Compare a (bad) leaky approach vs a (good) pipeline approach.


In [None]:

import numpy as np, pandas as pd
from sklearn.model_selection import KFold, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge

# Demo: compare a leaky "impute on the full data, then cross-validate" baseline
# with a pipeline that re-fits every preprocessing step inside each CV fold.
# Both approaches are scored on the same rows and the same missing-value pattern
# so that the only difference is WHERE the preprocessing is fitted.
np.random.seed(2025)

# Create a slightly larger dataset
n = 120
age = np.random.randint(20, 66, size=n)
educ = np.random.choice([12,14,16,18], size=n, p=[0.3,0.3,0.3,0.1])
region = np.random.choice(["North","South","East","West"], size=n)
income = 10 + 0.9*age + 0.8*(educ-12) + np.where(region=="North", 5, 0) + np.random.normal(0,6,size=n)

df = pd.DataFrame({"Age":age, "Educ":educ, "Region":region, "Income":income})

# Induce MAR: missing income for Age<28 or Region=='South' (random subset)
mask_mar = ((df["Age"]<28) | (df["Region"]=="South")) & (np.random.rand(n) < 0.35)
df.loc[mask_mar, "Income"] = np.nan

# Split features/target
X = df[["Age","Educ","Region"]]
y = df["Income"]

# Rows whose target is observed. Ridge cannot be fitted on NaN targets, and an
# MAE computed against mean-filled targets would be measured on made-up labels,
# so both approaches below are cross-validated on these rows only.
labeled = y.notna()

# Add some feature NAs ONCE and share them between both approaches; otherwise
# the pipeline's imputer would have nothing to impute. Cast to float before
# masking because the integer columns cannot hold NaN.
num_cols = ["Age","Educ"]
X_miss = X.copy()
X_miss[num_cols] = X[num_cols].astype(float).mask(np.random.rand(n,2)<0.1)

# BAD: fit imputer globally (leakage) -- the medians are computed from every row,
# including rows that later serve as held-out test folds.
# Note: this baseline is left unscaled, so scaling also differs from the pipeline.
X_bad = X_miss.copy()
global_imputer = SimpleImputer(strategy="median").fit(X_bad[num_cols])
X_bad[num_cols] = global_imputer.transform(X_bad[num_cols])
X_bad = pd.get_dummies(X_bad, columns=["Region"], drop_first=True)

kf = KFold(n_splits=5, shuffle=True, random_state=42)
bad_score = cross_val_score(
    Ridge(alpha=1.0), X_bad.loc[labeled], y.loc[labeled],
    cv=kf, scoring="neg_mean_absolute_error"
).mean()

# GOOD: pipeline (no leakage). Impute + scale + encode within folds.
num_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])
cat_pipe = Pipeline([
    ("encoder", OneHotEncoder(handle_unknown="ignore", drop="first"))
])
ct = ColumnTransformer([
    ("num", num_pipe, num_cols),
    ("cat", cat_pipe, ["Region"])
])

pipe = Pipeline([
    ("prep", ct),
    ("model", Ridge(alpha=1.0))
])

# Same rows, same missing-value pattern and same folds as the leaky baseline.
good_score = cross_val_score(
    pipe, X_miss.loc[labeled], y.loc[labeled],
    cv=kf, scoring="neg_mean_absolute_error"
).mean()

print(f"Leaky baseline (higher is better since it's negative MAE): {bad_score:.3f}")
print(f"Pipeline (no leakage): {good_score:.3f}")

# ---- Classwork ----
# A) Replace Ridge with RandomForestRegressor; observe impact and variance across folds.
# B) Move imputation OUTSIDE the pipeline intentionally and re-run to see leakage effects (score inflation).
# C) Add an 'Income_missing' indicator to features and re-evaluate via pipeline.
