Import Libraries

In [None]:
import pandas as pd
import numpy as np
import joblib

from ucimlrepo import fetch_ucirepo
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer


Load Cleveland Dataset

In [None]:
def load_cleveland():
    """Fetch the feature table of UCI repository dataset 45.

    The features are copied so the fetched object is left untouched, and
    every literal "?" cell is turned into NaN.

    Returns:
        The feature table with "?" entries replaced by ``np.nan``.
    """
    dataset = fetch_ucirepo(id=45)
    features = dataset.data.features.copy()

    # "?" is treated as a missing-value marker and mapped to NaN
    return features.replace("?", np.nan)


Build Preprocessing Pipeline

In [None]:
def build_preprocessor(X):
    """Prepare a feature frame and an unfitted preprocessing transformer.

    Columns whose names appear in the hard-coded categorical set are cast
    to ``object``; every other column is coerced to numeric, with
    unparseable values becoming NaN. The input frame is not modified.

    Args:
        X: Feature table; its column labels are converted to strings.

    Returns:
        A ``(frame, preprocessor)`` tuple: the dtype-adjusted copy of ``X``
        and an unfitted ``ColumnTransformer`` that applies median
        imputation + min-max scaling to numeric columns and most-frequent
        imputation + one-hot encoding to categorical columns.
    """
    # Columns treated as categorical (per the dataset documentation)
    categorical_names = {
        "sex", "cp", "fbs", "restecg",
        "exang", "slope", "thal", "ca",
    }

    frame = X.copy()
    frame.columns = frame.columns.astype(str)

    # Split the column names in a single pass, preserving column order
    cat_cols, num_cols = [], []
    for name in frame.columns:
        bucket = cat_cols if name in categorical_names else num_cols
        bucket.append(name)

    # Coerce dtypes: numeric columns first, then categorical ones
    for name in num_cols:
        frame[name] = pd.to_numeric(frame[name], errors="coerce")
    for name in cat_cols:
        frame[name] = frame[name].astype(object)

    # Numeric branch: fill gaps with the median, then min-max scale
    median_imputer = SimpleImputer(strategy="median")
    num_pipe = Pipeline([
        ("imputer", median_imputer),
        ("scaler", MinMaxScaler()),
    ])

    # Categorical branch: fill gaps with the mode, then one-hot encode.
    # The encoder is set to ignore unknown categories and emit dense output.
    mode_imputer = SimpleImputer(strategy="most_frequent")
    encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    cat_pipe = Pipeline([
        ("imputer", mode_imputer),
        ("onehot", encoder),
    ])

    # Route each column group to its own branch
    preprocessor = ColumnTransformer([
        ("num", num_pipe, num_cols),
        ("cat", cat_pipe, cat_cols),
    ])

    return frame, preprocessor


Fit Preprocessor

In [None]:
# Load the raw features and build the (still unfitted) transformer in one step
X, preprocessor = build_preprocessor(load_cleveland())

# Map any pandas NA scalars to NumPy NaN before fitting
X = X.replace({pd.NA: np.nan})

# Learn imputation, scaling and encoding parameters from the prepared frame
preprocessor.fit(X)


Save Preprocessor Artifact

In [None]:
import os

# Persist the fitted preprocessor so it can be reloaded later without refitting.
artifact_path = "artifacts/preprocessor.joblib"

# joblib.dump does not create missing parent directories; without this the
# cell fails with FileNotFoundError when "artifacts/" is absent.
os.makedirs(os.path.dirname(artifact_path), exist_ok=True)
joblib.dump(preprocessor, artifact_path)

# The check-mark was stored as mojibake (UTF-8 bytes decoded as cp1252);
# restored to the intended character.
print("✅ Preprocessor rebuilt and saved successfully")
