### Normalize, find correlations (CR), define weak features


In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# 1. LOAD THE DATA (The step that was missing)
# Adjust the path if your folder name is different (e.g. 'processed' vs 'Splitted')
TRAIN_X_PATH = "../data/Splitted/X_train.csv"
TRAIN_Y_PATH = "../data/Splitted/y_train.csv"
TEST_X_PATH =  "../data/Splitted/X_test.csv"
TEST_Y_PATH =  "../data/Splitted/y_test.csv"

# Load and flatten target variables (ravel)
X_train = pd.read_csv(TRAIN_X_PATH)
y_train = pd.read_csv(TRAIN_Y_PATH).values.ravel()
X_test = pd.read_csv(TEST_X_PATH)
y_test = pd.read_csv(TEST_Y_PATH).values.ravel()

print("✅ Data Loaded Successfully.")
print(f"Train Shape: {X_train.shape}")

# ---------------------------------------------------------
# NOW APPLY THE "SENIOR'S ADVICE" (Normalize & Correlate)
# ---------------------------------------------------------

# 2. Identify Numeric Columns
# We normalize ONLY continuous values, not binary/dummy variables
numeric_cols = [
    'age', 'household_size', 'living_children',
    'bmi', 'time_to_water', 'altitude'
]
# Safety filter: ensure they actually exist in your dataframe
numeric_cols = [c for c in numeric_cols if c in X_train.columns]

# 3. Apply Standard Scaling
scaler = StandardScaler()

# Create copies so we don't mess up the original X_train
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()

X_train_scaled[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test_scaled[numeric_cols] = scaler.transform(X_test[numeric_cols])

print("✅ Data Normalized (Scaled).")

# 4. Check Correlations (To see what to drop)
# Temporarily attach target to see correlations
train_full = X_train_scaled.copy()
train_full['TARGET'] = y_train

correlations = train_full.corr()['TARGET'].sort_values(ascending=False)

# Define weak features (Correlation extremely close to zero)
threshold = 0.005
weak_features = correlations[abs(correlations) < threshold].index.tolist()

print(f"\nPotential Weak Features (Corr < {threshold}):")
print(weak_features)

✅ Data Loaded Successfully.
Train Shape: (4564, 96)
✅ Data Normalized (Scaled).

Potential Weak Features (Corr < 0.005):
['floor_material_21.0', 'cooking_fuel_5.0', 'region_5.0', 'water_source_51.0', 'cooking_fuel_4.0', 'roof_material_32.0', 'water_source_32.0', 'household_size', 'cooking_fuel_96.0']


### Surgically dropping the true noise & retraining

In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, average_precision_score, brier_score_loss, classification_report

# 1. Define the specific list of Noise Features to drop
# We EXCLUDE household_size and region from the drop list
drop_list = [
    'floor_material_21.0', 
    'cooking_fuel_5.0', 
    'water_source_51.0', 
    'cooking_fuel_4.0', 
    'roof_material_32.0', 
    'water_source_32.0', 
    'cooking_fuel_96.0'
]

# 2. Create Refined Datasets (Drop specific columns)
# Ensure we only drop what actually exists (safety check)
drop_actual = [c for c in drop_list if c in X_train_scaled.columns]

X_train_refined = X_train_scaled.drop(columns=drop_actual)
X_test_refined = X_test_scaled.drop(columns=drop_actual)

print(f"Dropped {len(drop_actual)} noise features.")
print(f"New Shape: {X_train_refined.shape}")

# 3. RETRAIN Random Forest on Refined Data
# We use the 'Aggressive' parameters from before (or close to them)
print("\nTraining Refined Random Forest...")
rf_refined = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,       # Allow full growth
    min_samples_leaf=2,   # Slight constraint
    max_features='sqrt',  # Standard
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)

rf_refined.fit(X_train_refined, y_train)

# 4. Evaluate
y_prob_refined = rf_refined.predict_proba(X_test_refined)[:, 1]
auc_refined = roc_auc_score(y_test, y_prob_refined)

print(f"--- REFINED RESULTS ---")
print(f"Old ROC-AUC: 0.5974")
print(f"New ROC-AUC: {auc_refined:.4f}")

if auc_refined > 0.5974:
    print("✅ Improvement! The noise removal helped.")
else:
    print("⚖️ No significant change (or slight drop). The features were harmless.")

Dropped 7 noise features.
New Shape: (4564, 89)

Training Refined Random Forest...
--- REFINED RESULTS ---
Old ROC-AUC: 0.5974
New ROC-AUC: 0.5924
⚖️ No significant change (or slight drop). The features were harmless.


### Execute Feature Engineering

In [7]:
# 1. Define a function to engineer features
# This ensures we apply the EXACT same logic to Train and Test
def engineer_features(df_input):
    df = df_input.copy()
    
    # --- A. Medical Risk Flags ---
    # Underweight Flag (BMI < 18.5 is a known anemia driver)
    # Note: 'bmi' is scaled, so we must assume ~18.5 maps to a certain scaled value. 
    # OR, simpler: We create interactions based on the normalized distribution.
    # Since we scaled data, we can't use "18.5" directly easily. 
    # Strategy: We will use Quantile interactions instead which are robust.
    
    # --- B. Socioeconomic Interactions (The "Poverty Trap") ---
    # Interaction: Wealth * Household Size
    # Rationale: Large families with low wealth spread nutrition thinner.
    if 'wealth_index' in df.columns and 'household_size' in df.columns:
        df['interact_wealth_hsize'] = df['wealth_index'] * df['household_size']

    # --- C. Geographic-Economic Interaction ---
    # Interaction: Rural (residence_2) * Wealth
    # Note: Check your specific column name for Rural (e.g., 'residence_2' or similar)
    # We'll try to find the 'Rural' dummy column dynamically
    rural_col = [c for c in df.columns if 'residence' in c and '2' in c] # Usually residence_2
    if rural_col and 'wealth_index' in df.columns:
        df['interact_rural_wealth'] = df[rural_col[0]] * df['wealth_index']

    # --- D. Age-Based Risk ---
    # Adolescent Risk (Younger age often higher risk)
    # Since Age is scaled, lower values = younger.
    if 'age' in df.columns:
        df['age_squared'] = df['age'] ** 2  # Capture non-linear age effects (U-shaped risk)

    return df

# 2. Apply to X_train and X_test
print("Engineering new features...")
X_train_eng = engineer_features(X_train_scaled) # Use the scaled version you have
X_test_eng  = engineer_features(X_test_scaled)

print(f"New Feature Count: {X_train_eng.shape[1]}")
print("Added: interact_wealth_hsize, interact_rural_wealth, age_squared")

Engineering new features...
New Feature Count: 99
Added: interact_wealth_hsize, interact_rural_wealth, age_squared


### Retrain Random Forest

In [8]:
# 3. Retrain RF on Engineered Data
print("\nTraining Random Forest on Engineered Features...")

rf_eng = RandomForestClassifier(
    n_estimators=500,       # More trees to stabilize new features
    max_depth=None,         # Let it find the deep interactions
    min_samples_leaf=2,     # Keep it slightly flexible
    max_features='sqrt',
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)

rf_eng.fit(X_train_eng, y_train)

# 4. Evaluate
y_prob_eng = rf_eng.predict_proba(X_test_eng)[:, 1]
auc_eng = roc_auc_score(y_test, y_prob_eng)
prauc_eng = average_precision_score(y_test, y_prob_eng)

print(f"--- ENGINEERED RESULTS ---")
print(f"Baseline ROC-AUC: 0.5974")
print(f"New ROC-AUC:      {auc_eng:.4f}")
print(f"New PR-AUC:       {prauc_eng:.4f}")

# Check if the new features are actually being used
importances = pd.Series(rf_eng.feature_importances_, index=X_train_eng.columns)
print("\nTop 5 Features in New Model:")
print(importances.sort_values(ascending=False).head(5))


Training Random Forest on Engineered Features...
--- ENGINEERED RESULTS ---
Baseline ROC-AUC: 0.5974
New ROC-AUC:      0.5888
New PR-AUC:       0.4945

Top 5 Features in New Model:
bmi                      0.098374
altitude                 0.074589
age_squared              0.067918
age                      0.067001
interact_wealth_hsize    0.055534
dtype: float64
