### What this demonstrates

This code proves we have built a `Smart Diagnostic System`, not just a "model."

1. It intelligently profiles the patient first.

2. It selects the mathematically superior tool for that specific profile.

3. It outputs a predicted composite score for that patient.
   
This is the perfect conclusion to our research project.

NOTE: This notebook is for demonstration purposes only. In a real evaluation, the system should be fed genuinely held-out test data that the models have never seen before.

### 1. SYSTEM SETUP

In [1]:
import pandas as pd
import numpy as np
import joblib
from pathlib import Path
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
# Ensure utils is accessible
from utils import * 

# Path Setup
dataset_dir = "..//dataset//modified"
path_train_clustered = Path(dataset_dir) / "train_with_clusters.csv"
path_train_raw = Path(dataset_dir) / "train.csv"
path_test_raw = Path(dataset_dir) / "test.csv" # The unseen data

# Model Paths
path_strategy_a = Path("..//models//strategyA") # Specialists
path_strategy_b = Path("..//models//strategyB") # Global Meta
path_strategy_c = Path("..//models//strategyC") # Oversampled

print("Inference System Initialized.")

import warnings

# --- Suppress Noise ---
# Ignore FutureWarnings to keep the loop output clean
warnings.simplefilter(action='ignore', category=FutureWarning)
# Opt-in to future pandas behavior to silence specific downcasting warnings
pd.set_option('future.no_silent_downcasting', True)

Inference System Initialized.


In [2]:
# Use CategorySafetySanitizer to prevent "Categorical" error
class CategorySafetySanitizer(BaseEstimator, TransformerMixin):
    """Stateless transformer that turns every 'category' column into 'object'.

    Placed ahead of imputation so that later steps can write values that are
    not among a column's existing categories.
    """

    def fit(self, X, y=None):
        """Nothing is learned; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` whose 'category' columns are cast to 'object'.

        The input frame is not modified.
        """
        sanitized = X.copy()
        categorical = sanitized.select_dtypes(include=['category']).columns
        # Guard clause: nothing to convert, hand back the plain copy.
        if len(categorical) == 0:
            return sanitized
        sanitized[categorical] = sanitized[categorical].astype('object')
        return sanitized

### 2. BUILD THE ROUTER (CENTROIDS) AND RECONCILE COLUMNS

In [3]:
print("Constructing Routing Logic and Reconciling Columns...")

# 1. Load RAW Training Data
df_train_raw = pd.read_csv(path_train_raw)

# 2. Re-create the original preprocessing pipeline components
# NOTE: despite the "_DROP_" in its name, this list is passed to
# SpecificColumnCategorizer as the columns to categorize.
SPECIFIC_COL_TO_DROP_ROUTING = ['Year']
THRESHOLD_RATIO_ROUTING = 0.1
MAX_UNIQUE_ROUTING = 50
THRESHOLD_MISSING_ROUTING = 70.0
NUM_STRATEGY_ROUTING = "median"
CAT_STRATEGY_ROUTING = "mode"
THRESHOLD_QUASI_CONSTANT_ROUTING = 0.00000001

COLS_TO_DROP_ROUTING = [
    'UID', 'imss_03', 'imss_12', 'issste_03', 'issste_12', 'pem_def_mar_03',
    'pem_def_mar_12', 'insur_private_03', 'insur_private_12', 'insur_other_03',
    'insur_other_12', 'seg_pop_12', 'Tired_03', 'Tired_12', 'Happy_03', 'Happy_12'
]

# Target, label and time columns that must never be used as routing features.
# Defined once so both column filters below stay in sync.
ROUTING_EXCLUDED_COLS = ['composite_score', 'cluster', 'Year', 'PredictionYear']

data_type_conversion_routing = Pipeline([
    ('specific_categorizer', SpecificColumnCategorizer(columns_to_categorize=SPECIFIC_COL_TO_DROP_ROUTING)),
    ('object_to_category', ObjectToCategoryTransformer(threshold_ratio=THRESHOLD_RATIO_ROUTING, max_unique=MAX_UNIQUE_ROUTING)),
    ('float_to_category', FloatToCategoryTransformer()),
])
dropColumns_routing = Pipeline([
    ('drop_columns', ColumnDropper(columns_to_drop=COLS_TO_DROP_ROUTING)),
])
dropColumnsHighNA_routing = Pipeline([
    ('drop_columns_high_na', DropColumnsHighNA(threshold=THRESHOLD_MISSING_ROUTING)),
])
missingValueImputer_routing = Pipeline([
    ('missing_value_imputer', MissingValueImputer(num_strategy=NUM_STRATEGY_ROUTING, cat_strategy=CAT_STRATEGY_ROUTING)),
])
identifyAndDropLowVarNum_routing = Pipeline([
    ('identify_and_drop_low_var_num', IdentifyAndDropLowVarNum(quasi_constant_threshold=THRESHOLD_QUASI_CONSTANT_ROUTING)),
])

# --- Master Routing Pipeline (WITH SAFETY FIX) ---
routing_pipeline = Pipeline([
    ('1_data_type_conversion', data_type_conversion_routing),
    ('2_drop_columns', dropColumns_routing),
    ('3_drop_high_na_columns', dropColumnsHighNA_routing),
    # SAFETY FIX: cast 'category' columns to 'object' before imputation
    ('3.5_category_safety', CategorySafetySanitizer()),
    ('4_impute_missing_values', missingValueImputer_routing),
    ('5_identify_and_drop_low_var_num', identifyAndDropLowVarNum_routing),
])

# 3. Fit the pipeline on the RAW TRAINING data
print("Fitting the routing pipeline on raw training data...")
routing_pipeline.fit(df_train_raw)

# 4. Inspect which columns survive the pipeline
# A small sample (first 5 rows) is transformed just to read the output columns.
# This list may still contain non-numeric columns; it only feeds the progress
# message below and is replaced by the numeric-only list at the end of the cell.
dummy_transformed = routing_pipeline.transform(df_train_raw.iloc[:5])
routing_cols = [c for c in dummy_transformed.columns.tolist() if c not in ROUTING_EXCLUDED_COLS]

print(f"Pipeline fitted. Routing based on {len(routing_cols)} columns.")

# 5. Calculate Centroids
# The clustered file is only needed for its cluster labels.
df_clustered_labels = pd.read_csv(path_train_clustered)
df_clustered_labels = df_clustered_labels[df_clustered_labels['cluster'] != 4] # Drop outlier

# Transform the raw training data
print("Transforming full training set...")
X_train_routing = routing_pipeline.transform(df_train_raw)

# --- FILTER FOR NUMERIC COLUMNS ONLY ---
# Centroids are means, so only strictly numeric columns can take part.
numeric_routing_cols = X_train_routing.select_dtypes(include=['number']).columns.tolist()

# Ensure we remove metadata/leakage from this list too
numeric_routing_cols = [c for c in numeric_routing_cols if c not in ROUTING_EXCLUDED_COLS]

print(f"Calculating centroids on {len(numeric_routing_cols)} numeric columns...")

# Align the data
# NOTE(review): rows are matched purely by index label, which assumes the
# clustered CSV has the same row order as train.csv -- confirm.
# .copy() makes the subset an independent frame so that adding the 'cluster'
# column below is not an assignment onto a slice (SettingWithCopyWarning).
X_train_routing = X_train_routing.loc[df_clustered_labels.index].copy()
X_train_routing['cluster'] = df_clustered_labels['cluster']

# Calculate Centroids: plain per-cluster mean of each numeric column.
# NOTE(review): no scaling is applied here, so any Euclidean distance computed
# against these centroids is dominated by the large-magnitude columns --
# confirm this matches the feature space the clusters were originally built in.
cluster_centroids = X_train_routing.groupby('cluster')[numeric_routing_cols].mean()

print("Router Logic Built.")
display(cluster_centroids.head())

# UPDATE THE GLOBAL ROUTING_COLS VARIABLE
# The inference loop later needs the numeric-only list, in centroid column order.
routing_cols = numeric_routing_cols

Constructing Routing Logic and Reconciling Columns...
Fitting the routing pipeline on raw training data...
SpecificColumnCategorizer: Converted 'Year' to category.
ObjectToCategoryTransformer: Converted 'Age_03' to category.
ObjectToCategoryTransformer: Converted 'Urban_03' to category.
ObjectToCategoryTransformer: Converted 'Married_03' to category.
ObjectToCategoryTransformer: Converted 'Education_03' to category.
ObjectToCategoryTransformer: Converted 'Num_Living_Child_03' to category.
ObjectToCategoryTransformer: Converted 'GlobalHealth_03' to category.
ObjectToCategoryTransformer: Converted 'BMI_03' to category.
ObjectToCategoryTransformer: Converted 'FamilyDecisions_03' to category.
ObjectToCategoryTransformer: Converted 'Employment_03' to category.
ObjectToCategoryTransformer: Converted 'Age_12' to category.
ObjectToCategoryTransformer: Converted 'Urban_12' to category.
ObjectToCategoryTransformer: Converted 'Married_12' to category.
ObjectToCategoryTransformer: Converted 'Educa

Unnamed: 0_level_0,Marriages_03,Num_ADL_03,Num_IADL_03,Num_CES-D_Symptoms_03,Num_Illnesses_03,Marriages_12,Num_ADL_12,Num_IADL_12,Num_CES-D_Symptoms_12,Num_Illnesses_12,...,JobHrsWeekly_12,Earnings_12,SpouseEarnings_12,hincome_12,hinc_business_12,hinc_rent_12,hinc_assets_12,hinc_cap_12,Pension_12,SpousePension_12
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.023006,0.222393,0.134969,6.179448,1.630368,1.069018,0.661043,0.400307,6.118098,1.588957,...,36.866564,1549.079755,8496.932515,40552.147239,10184.04908,46.01227,343.558282,10552.147239,8496.932515,6595.092025
1,1.087719,0.017544,0.035088,2.701754,0.929825,1.052632,0.035088,0.052632,2.491228,1.0,...,34.491228,18596.491228,3508.77193,152631.578947,74035.087719,30877.192982,719.298246,105438.596491,12982.45614,6666.666667
2,2.098266,0.028902,0.028902,3.098266,0.921965,2.468208,0.072254,0.040462,2.976879,0.965318,...,38.020231,6445.086705,7630.057803,77803.468208,22167.630058,-895.953757,1627.16763,22890.17341,16242.774566,7514.450867
3,0.978937,0.03009,0.018054,2.814443,0.987964,0.988967,0.080241,0.072217,2.595787,1.045135,...,31.654965,461.384152,5155.466399,62647.943831,16298.89669,20.060181,1293.881645,17642.928786,9869.608826,16389.167503
5,0.980861,0.021531,0.007177,2.270335,0.462919,0.998804,0.058612,0.041866,2.264354,0.588517,...,39.692584,19533.492823,7248.803828,80083.732057,15155.502392,358.851675,241.626794,15741.626794,26973.684211,3911.483254


### 3. LOAD THE HYBRID MODEL FLEET

In [4]:
print("Loading Hybrid Model Fleet...")

print("Loading Models...")

# Cluster id -> serialized model file, listed in load order.
#   Strategy A (Specialists)  -> clusters 2, 3, 5
#   Strategy B (Global Meta)  -> cluster 1
#   Strategy C (Oversampled)  -> cluster 0
_model_files = {
    **{
        c_id: path_strategy_a / f"specialist_model_cluster_{c_id}.pkl"
        for c_id in (2, 3, 5)
    },
    1: path_strategy_b / "global_meta_feature_model.pkl",
    0: path_strategy_c / "strategy_c_oversampled_model.pkl",
}

# joblib.load unpickles the file, so these artifacts must come from a trusted source.
models = {cluster_id: joblib.load(model_file) for cluster_id, model_file in _model_files.items()}

print("✅ All Models Loaded.")

Loading Hybrid Model Fleet...
Loading Models...
✅ All Models Loaded.


### 4. PREPARE THE TEST DATA

In [5]:
print("Reconstructing the Preprocessing Pipeline...")

# 1. Load RAW Data (train is used for fitting, test is what gets transformed)
df_train_raw, df_test_raw = (pd.read_csv(csv_path) for csv_path in (path_train_raw, path_test_raw))

# 2. Pipeline Configuration (same values as the routing pipeline)
# NOTE: SPECIFIC_COL_TO_DROP is passed to SpecificColumnCategorizer as the
# columns to categorize, despite what its name suggests.
SPECIFIC_COL_TO_DROP = ['Year']
THRESHOLD_RATIO, MAX_UNIQUE = 0.1, 50
THRESHOLD_MISSING = 70.0
NUM_STRATEGY, CAT_STRATEGY = "median", "mode"
THRESHOLD_QUASI_CONSTANT = 0.00000001

COLS_TO_DROP = [
    'UID',
    'imss_03', 'imss_12',
    'issste_03', 'issste_12',
    'pem_def_mar_03', 'pem_def_mar_12',
    'insur_private_03', 'insur_private_12',
    'insur_other_03', 'insur_other_12',
    'seg_pop_12',
    'Tired_03', 'Tired_12',
    'Happy_03', 'Happy_12',
]

# 3. Build Pipeline
_type_conversion_steps = [
    ('specific_categorizer', SpecificColumnCategorizer(columns_to_categorize=SPECIFIC_COL_TO_DROP)),
    ('object_to_category', ObjectToCategoryTransformer(threshold_ratio=THRESHOLD_RATIO, max_unique=MAX_UNIQUE)),
    ('float_to_category', FloatToCategoryTransformer()),
]
data_type_conversion = Pipeline(_type_conversion_steps)

_preprocessing_steps = [
    ('1_data_type_conversion', data_type_conversion),
    ('2_drop_columns', ColumnDropper(columns_to_drop=COLS_TO_DROP)),
    ('3_drop_high_na_columns', DropColumnsHighNA(threshold=THRESHOLD_MISSING)),
    # The Safety Fix: 'category' -> 'object' before imputation
    ('3.5_category_safety', CategorySafetySanitizer()),
    ('4_impute_missing_values', MissingValueImputer(num_strategy=NUM_STRATEGY, cat_strategy=CAT_STRATEGY)),
    ('5_identify_and_drop_low_var_num', IdentifyAndDropLowVarNum(quasi_constant_threshold=THRESHOLD_QUASI_CONSTANT)),
]
preprocessing_pipeline = Pipeline(_preprocessing_steps)

# 4. Fit on train, then transform test
print("Fitting pipeline on Raw Training Data...")
preprocessing_pipeline.fit(df_train_raw)

print("Transforming Raw Test Data...")
df_test_cleaned = preprocessing_pipeline.transform(df_test_raw)

# 5. Final Cleanup
# The pipeline keeps 'Year' (as a category) but the models expect it gone.
# Matching is by substring: any column whose name contains one of these
# markers is removed.
leakage_cols = ['Year', 'PredictionYear']
cols_to_remove = [
    col_name
    for col_name in df_test_cleaned.columns
    if any(marker in col_name for marker in leakage_cols)
]

if len(cols_to_remove) > 0:
    df_test_cleaned = df_test_cleaned.drop(columns=cols_to_remove)

print(f"✅ Test Data Cleaned. Shape: {df_test_cleaned.shape}")

Reconstructing the Preprocessing Pipeline...
Fitting pipeline on Raw Training Data...
SpecificColumnCategorizer: Converted 'Year' to category.
ObjectToCategoryTransformer: Converted 'Age_03' to category.
ObjectToCategoryTransformer: Converted 'Urban_03' to category.
ObjectToCategoryTransformer: Converted 'Married_03' to category.
ObjectToCategoryTransformer: Converted 'Education_03' to category.
ObjectToCategoryTransformer: Converted 'Num_Living_Child_03' to category.
ObjectToCategoryTransformer: Converted 'GlobalHealth_03' to category.
ObjectToCategoryTransformer: Converted 'BMI_03' to category.
ObjectToCategoryTransformer: Converted 'FamilyDecisions_03' to category.
ObjectToCategoryTransformer: Converted 'Employment_03' to category.
ObjectToCategoryTransformer: Converted 'Age_12' to category.
ObjectToCategoryTransformer: Converted 'Urban_12' to category.
ObjectToCategoryTransformer: Converted 'Married_12' to category.
ObjectToCategoryTransformer: Converted 'Education_12' to category.

### 5. RUN THE HYBRID INFERENCE LOOP

In [6]:
# Hybrid inference: for each test row, (1) pick the nearest cluster centroid,
# (2) predict with the model registered for that cluster.

# 1. Transform Test Data for ROUTING
# The router needs purely numeric features, even if the cleaned data has strings.
print("Preparing Test Data for Router...")
X_test_routing = routing_pipeline.transform(df_test_raw)

# Enforce numeric types for router to prevent "str - float" errors
for col in routing_cols:
    X_test_routing[col] = pd.to_numeric(X_test_routing[col], errors='coerce')
X_test_routing = X_test_routing.fillna(0)

# 2. Setup Loop
predictions = []
actuals = []
routing_log = []
# (row position, assigned cluster, error text) for every prediction that raised,
# so failures are visible instead of silently becoming NaN.
failed_rows = []

# Row position `i` is used to index the transformed frames; this relies on the
# pipelines keeping the row order and row count of df_test_raw.
for i, (row_label, row) in enumerate(df_test_raw.iterrows()):

    # --- STEP 1: ROUTING ---
    # Numeric feature vector in centroid column order; columns missing from the
    # transformed test frame are treated as 0.
    # .infer_objects(copy=False) handles the pandas deprecation warning explicitly
    patient_vector = X_test_routing.iloc[i].reindex(routing_cols).fillna(0).infer_objects(copy=False)

    # Euclidean distance to every cluster centroid; the closest one wins.
    distances = {
        c_id: np.linalg.norm(patient_vector.values - centroid.values)
        for c_id, centroid in cluster_centroids.iterrows()
    }
    assigned_cluster = min(distances, key=distances.get)

    # --- STEP 2: PREDICTION ---
    model = models[assigned_cluster]

    # Use the CLEANED data for the model (df_test_cleaned)
    input_data = pd.DataFrame([df_test_cleaned.iloc[i]])

    # Separate the target from the features when it is present
    if 'composite_score' in input_data.columns:
        actual_score = input_data['composite_score'].values[0]
        input_data = input_data.drop('composite_score', axis=1)
        actuals.append(actual_score)
    else:
        actuals.append(row.get('composite_score', np.nan))

    # Strategy B Handling (Cluster 1): that model receives the cluster id as a
    # string feature; for every other model a 'cluster' column is removed.
    if assigned_cluster == 1:
        input_data['cluster'] = str(assigned_cluster)
    elif 'cluster' in input_data.columns:
        input_data = input_data.drop('cluster', axis=1)

    try:
        pred = model.predict(input_data)[0]
    except Exception as e:
        # Keep the loop going (best effort) but remember what went wrong.
        failed_rows.append((i, assigned_cluster, repr(e)))
        pred = np.nan
    predictions.append(pred)
    routing_log.append(assigned_cluster)

# Surface swallowed errors so NaN predictions can be diagnosed.
if failed_rows:
    first_pos, first_cluster, first_error = failed_rows[0]
    print(
        f"Warning: {len(failed_rows)} of {len(predictions)} predictions failed and were "
        f"recorded as NaN. First failure: row {first_pos} (cluster {first_cluster}): {first_error}"
    )

# Add results
results_df = pd.DataFrame({
    'Actual': actuals,
    'Predicted': predictions,
    'Assigned_Cluster': routing_log
})

print("Inference Complete.")
display(results_df.head(10))

Preparing Test Data for Router...
SpecificColumnCategorizer: Converted 'Year' to category.
ObjectToCategoryTransformer: Converted 'Age_03' to category.
ObjectToCategoryTransformer: Converted 'Urban_03' to category.
ObjectToCategoryTransformer: Converted 'Married_03' to category.
ObjectToCategoryTransformer: Converted 'Education_03' to category.
ObjectToCategoryTransformer: Converted 'Num_Living_Child_03' to category.
ObjectToCategoryTransformer: Converted 'GlobalHealth_03' to category.
ObjectToCategoryTransformer: Converted 'BMI_03' to category.
ObjectToCategoryTransformer: Converted 'FamilyDecisions_03' to category.
ObjectToCategoryTransformer: Converted 'Employment_03' to category.
ObjectToCategoryTransformer: Converted 'Age_12' to category.
ObjectToCategoryTransformer: Converted 'Urban_12' to category.
ObjectToCategoryTransformer: Converted 'Married_12' to category.
ObjectToCategoryTransformer: Converted 'Education_12' to category.
ObjectToCategoryTransformer: Converted 'Num_Living_

Unnamed: 0,Actual,Predicted,Assigned_Cluster
0,246,213.355583,5
1,179,174.35698,0
2,124,80.651174,0
3,155,92.03115,0
4,117,164.315786,0
5,172,164.315786,0
6,236,142.303455,0
7,244,224.647917,1
8,179,224.647917,1
9,127,160.028176,0


### 6. FINAL SYSTEM EVALUATION

In [7]:
# 1. Clean Results (rows containing NaN, e.g. failed predictions, are excluded)
results_clean = results_df.dropna()
n_dropped = len(results_df) - len(results_clean)


def _cluster_metrics(group):
    """MAE, R2 and row count for one assigned-cluster group of results."""
    truth, estimate = group['Actual'], group['Predicted']
    return pd.Series({
        'MAE': mean_absolute_error(truth, estimate),
        'R2': r2_score(truth, estimate),
        'Count': len(group)
    })


if len(results_clean) > 0:
    if n_dropped > 0:
        print(f"Warning: Dropped {n_dropped} rows due to prediction errors.")

    # 2. Global Metrics
    y_true, y_pred = results_clean['Actual'], results_clean['Predicted']
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    banner = "=" * 40
    print(banner)
    print(f"HYBRID SYSTEM PERFORMANCE (UNSEEN DATA)")
    print(banner)
    print(f"Global MAE: {mae:.2f}")
    print(f"Global R2:  {r2:.3f}")

    # 3. Cluster-wise Breakdown
    print("\nPerformance by Assigned Cluster:")

    # pandas warns that groupby().apply() behavior will change in future
    # versions; silence that DeprecationWarning for this statement only.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", DeprecationWarning)
        cluster_stats = results_clean.groupby('Assigned_Cluster').apply(_cluster_metrics)

    # Colour the MAE column so the weakest clusters stand out
    display(cluster_stats.style.background_gradient(cmap='RdYlGn_r', subset=['MAE']))
else:
    print("CRITICAL ERROR: No valid predictions were generated.")

HYBRID SYSTEM PERFORMANCE (UNSEEN DATA)
Global MAE: 34.76
Global R2:  0.481

Performance by Assigned Cluster:


Unnamed: 0_level_0,MAE,R2,Count
Assigned_Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,33.434918,0.439724,512.0
1,34.228654,0.097694,15.0
2,41.35982,0.207079,44.0
3,41.744712,0.241281,50.0
5,35.150747,0.425681,125.0


In [8]:
# Clear the warning filters installed earlier in this notebook.
# Note: warnings.resetwarnings() discards ALL warning filters, including any
# that were active before this notebook ran, so it does not restore the
# previous filter state -- it empties the filter list.
warnings.resetwarnings()