In [1]:
# Load cleaned data + feature lists
from pathlib import Path
import pandas as pd
import joblib

# 1) Processed-data folder, resolved relative to notebooks/
proc_dir = Path("..", "data", "processed")

# 2) Read the train and test splits from their parquet files
train_df, test_df = (
    pd.read_parquet(proc_dir / parquet_name)
    for parquet_name in ("train.parquet", "test.parquet")
)

# 3) Saved feature lists: a dict with 'num' and 'cat' keys.
#    joblib.load unpickles the file, so it should only be pointed at
#    artifacts produced by this project.
feat_lists = joblib.load(proc_dir / "feature_lists.pkl")
num_keep = feat_lists["num"]
cat_keep = feat_lists["cat"]

# 4) Sanity-check: frame shapes and number of features per type
print(
    f"Train shape: {train_df.shape}",
    f"Test  shape: {test_df.shape}",
    f"Numeric features: {len(num_keep)}  |  Categorical features: {len(cat_keep)}",
    sep="\n",
)

Train shape: (413135, 99)
Test  shape: (82107, 99)
Numeric features: 15  |  Categorical features: 3


In [2]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression

# Numeric branch: fill missing values with the median, then standardize
num_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent value,
# then one-hot encode into a dense array (unknown categories are ignored)
cat_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
])

# Route each feature list to its matching branch
preprocessor = ColumnTransformer(transformers=[
    ("num", num_pipe, num_keep),
    ("cat", cat_pipe, cat_keep),
])

# Full pipeline (not fitted here): preprocessing followed by the classifier
model_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression(random_state=42, max_iter=1000)),
])

# Persist the pipeline skeleton under ../models (directory created if absent)
models_dir = Path("..", "models")
models_dir.mkdir(exist_ok=True)
joblib.dump(model_pipeline, models_dir / "pipeline_skeleton.pkl")

['..\\models\\pipeline_skeleton.pkl']

## 📝 Defensive Pipeline Skeleton (with Imputation)

**Why keep imputers if the parquet files are already clean?**

> **Robustness:** Handles NaNs that may appear in new data (dashboard users, ETL drift, reruns). 
> **Clarity:** Makes preprocessing self-contained; future readers won’t wonder “where did NaNs go?” 

**Pipeline Overview:**

1. **Numeric features (15)**  
    • `SimpleImputer(median)` → `StandardScaler`

2. **Categorical features (3)**  
    • `SimpleImputer(most_frequent)` → `OneHotEncoder(handle_unknown="ignore")`

3. **Classifier placeholder**  
    • `LogisticRegression(max_iter=1000, random_state=42)`

This skeleton is ready for `.fit()`, metric logging, grid search, and later swap-ins (e.g., XGBoost).