# 07 – Final Stacking Model (Colab Ready)

This notebook implements the complete, optimized pipeline for the Loan Prediction task. It is designed to be run on Google Colab with the full dataset.

**Key Features:**
- **Target**: Predicting **Probability of Default** (1 = Default, 0 = Paid Back).
- **Clipping**: Caps `annual_income` and `debt_to_income_ratio` at their 99th percentile to limit the influence of extreme outliers.
- **Feature Engineering**: Adds `loan_to_income`, `monthly_debt`, `interest_burden`.
- **Hyperparameter Tuning**: Uses Optuna to find best params for XGBoost and LightGBM.
- **Stacking**: Combines XGBoost, LightGBM, CatBoost, and **MLPClassifier (Neural Network)** using a Logistic Regression meta-learner.

In [None]:
# Install necessary libraries
!pip install optuna catboost xgboost lightgbm scikit-learn pandas numpy

In [None]:
import pandas as pd
import numpy as np
import optuna
from functools import partial
from pathlib import Path

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
# The CSV files are expected to be present already (uploaded to Colab or on a mounted Drive).
# To upload interactively on Colab, uncomment:
# from google.colab import files
# uploaded = files.upload()

# Folder holding train.csv / test.csv; adjust for Colab (e.g., '/content/drive/MyDrive/...')
DATA_DIR = Path('data')
train_path = DATA_DIR.joinpath('train.csv')
test_path = DATA_DIR.joinpath('test.csv')

print("Loading data...")
# Read both splits in one go, train first, then test.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

# Quick sanity check of what was loaded.
print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")

In [None]:
def feature_engineering(df, clip_quantile=0.99, clip_limits=None):
    """Return a copy of ``df`` with outlier clipping and derived loan features.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. Must contain ``loan_amount``, ``annual_income``,
        ``debt_to_income_ratio`` and ``interest_rate``; it is not modified.
    clip_quantile : float, default 0.99
        Quantile of each clipped column used as the upper cap when no explicit
        limit is supplied for that column.
    clip_limits : dict or None, default None
        Optional mapping ``{column_name: upper_cap}``. When a clipped column has
        an entry here, that cap is used instead of a quantile computed from
        ``df``. This allows thresholds learned on one frame (e.g. training data)
        to be reused on another (e.g. test data) so both are capped identically.
        With the default ``None`` every cap is computed from ``df`` itself.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``annual_income`` / ``debt_to_income_ratio`` capped
        and three added columns: ``loan_to_income``, ``monthly_debt`` and
        ``interest_burden``.
    """
    df = df.copy()

    # Upper-tail clipping; columns absent from the frame are skipped.
    for col in ['annual_income', 'debt_to_income_ratio']:
        if col not in df.columns:
            continue
        if clip_limits is not None and col in clip_limits:
            limit = clip_limits[col]
        else:
            limit = df[col].quantile(clip_quantile)
        df[col] = df[col].clip(upper=limit)

    # +1 in the denominator avoids division by zero for zero income.
    df['loan_to_income'] = df['loan_amount'] / (df['annual_income'] + 1)
    df['monthly_debt'] = (df['annual_income'] / 12) * df['debt_to_income_ratio']
    # interest_rate is divided by 100 before being applied to the loan amount.
    df['interest_burden'] = df['loan_amount'] * (df['interest_rate'] / 100)

    # Note: We decided NOT to include disposable_income and credit_income_interaction
    # as they didn't improve the score in our tests.
    return df

print("Applying feature engineering...")
# NOTE(review): feature_engineering is called once per frame, so the clipping
# thresholds for train and test are each computed from their own data.
X_train_full = feature_engineering(train_df)
X_test_full = feature_engineering(test_df)

target_col = 'loan_paid_back'
drop_cols = [target_col, 'id']

# Feature matrix: everything except the target and the row identifier
# (labels that are not present are simply skipped).
X = X_train_full.drop(columns=drop_cols, errors='ignore')

# INVERT TARGET: 1 = Default, 0 = Paid Back
y = 1 - X_train_full[target_col]
print("Target inverted: predicting Probability of Default.")

# The test frame has no target column; remove the id if present, then
# select the training columns in the training order so both matrices match exactly.
test_id_cols = [c for c in ['id'] if c in X_test_full.columns]
X_test = X_test_full.drop(columns=test_id_cols)[X.columns]

In [None]:
# Preprocessor for Tree Models (Ordinal Encoding for Grades)
def get_tree_preprocessor(X):
    """Build the (unfitted) ColumnTransformer used by the tree-based models.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing a ``grade_subgrade`` column. Only that column is read,
        to derive the category order for the ordinal encoder.

    Returns
    -------
    sklearn.compose.ColumnTransformer
        Transformer with four branches:
        ``log`` (median impute -> log1p -> standard scale) for ``annual_income``;
        ``num`` (median impute -> standard scale) for the numeric features;
        ``ord`` (most-frequent impute -> ordinal encode) for ``grade_subgrade``;
        ``cat`` (most-frequent impute -> one-hot) for the remaining categoricals.
        Columns not listed are dropped (ColumnTransformer default).
    """
    log_features = ['annual_income']
    numeric_features = ['debt_to_income_ratio', 'credit_score', 'loan_amount', 'interest_rate',
                        'loan_to_income', 'monthly_debt', 'interest_burden']
    ordinal_features = ['grade_subgrade']
    categorical_features = ['loan_purpose', 'gender', 'marital_status', 'education_level', 'employment_status']

    log_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('log', FunctionTransformer(np.log1p, validate=False)),
        ('scaler', StandardScaler())
    ])

    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    # Category order = sorted distinct grades seen in X.
    # Missing values are dropped first: a NaN (float) cannot be ordered against
    # string labels, so sorted() would raise TypeError, and the imputer below
    # fills missing grades before the encoder sees them anyway.
    # NOTE(review): assumes plain sorting of the labels yields the intended grade ranking - confirm.
    grades = sorted(X['grade_subgrade'].dropna().unique())
    ordinal_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        # Grades not present in `grades` are encoded as -1 at transform time.
        ('ordinal', OrdinalEncoder(categories=[grades], handle_unknown='use_encoded_value', unknown_value=-1))
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore'))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('log', log_transformer, log_features),
            ('num', numeric_transformer, numeric_features),
            ('ord', ordinal_transformer, ordinal_features),
            ('cat', categorical_transformer, categorical_features)
        ]
    )
    return preprocessor

# Preprocessor for Neural Network (One-Hot Encoding for EVERYTHING, Standard Scaling)
def get_nn_preprocessor(X):
    """Build the (unfitted) ColumnTransformer used by the neural-network model.

    ``X`` is part of the signature but is not read by this function.

    The returned transformer has three branches:
    ``log`` (median impute -> log1p -> standard scale) for ``annual_income``,
    ``num`` (median impute -> standard scale) for the numeric features, and
    ``cat`` (most-frequent impute -> one-hot) for all categorical columns,
    including ``grade_subgrade``.
    """
    income_cols = ['annual_income']
    scaled_cols = [
        'debt_to_income_ratio', 'credit_score', 'loan_amount', 'interest_rate',
        'loan_to_income', 'monthly_debt', 'interest_burden',
    ]
    # For the NN, grade_subgrade is one-hot encoded together with the other categoricals.
    onehot_cols = [
        'grade_subgrade', 'loan_purpose', 'gender', 'marital_status',
        'education_level', 'employment_status',
    ]

    income_branch = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('log', FunctionTransformer(np.log1p, validate=False)),
        ('scaler', StandardScaler()),
    ])
    scaled_branch = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ])
    onehot_branch = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore')),
    ])

    branches = [
        ('log', income_branch, income_cols),
        ('num', scaled_branch, scaled_cols),
        ('cat', onehot_branch, onehot_cols),
    ]
    return ColumnTransformer(transformers=branches)

# One unfitted preprocessor per model family: ordinal grades for the tree
# models, fully one-hot encoded input for the neural network.
tree_preprocessor, nn_preprocessor = get_tree_preprocessor(X), get_nn_preprocessor(X)

In [None]:
# Hyperparameter tuning switch (tuning is slow).
#   True  -> run the Optuna studies below to search XGBoost / LightGBM parameters.
#   False -> skip the search and use the hard-coded pre-tuned parameters instead.
# The search itself runs on a sample of up to 50,000 rows of X, not the full dataset.
RUN_TUNING = True

def objective_xgb(trial, X, y, preprocessor):
    """Optuna objective for XGBoost: mean ROC AUC over a 3-fold stratified CV.

    Parameters
    ----------
    trial : optuna trial object used to draw the hyperparameters.
    X, y : features and target the pipeline is cross-validated on.
    preprocessor : transformer placed in front of the classifier in the pipeline.

    Returns
    -------
    float
        Mean of the per-fold ``roc_auc`` scores.
    """
    # Searched hyperparameters (drawn in this order).
    sampled = dict(
        n_estimators=trial.suggest_int('n_estimators', 100, 1000),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1),
        max_depth=trial.suggest_int('max_depth', 3, 10),
        subsample=trial.suggest_float('subsample', 0.6, 1.0),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.6, 1.0),
        reg_alpha=trial.suggest_float('reg_alpha', 0, 10),
        reg_lambda=trial.suggest_float('reg_lambda', 0, 10),
    )
    # Settings held constant across trials.
    classifier = XGBClassifier(**sampled, random_state=42, n_jobs=-1, eval_metric='auc')

    candidate = Pipeline([('preprocessor', preprocessor), ('model', classifier)])

    # 3 shuffled stratified folds with a fixed seed, kept small for tuning speed.
    folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    fold_aucs = cross_val_score(candidate, X, y, cv=folds, scoring='roc_auc', n_jobs=-1)
    return fold_aucs.mean()

def objective_lgbm(trial, X, y, preprocessor):
    """Optuna objective for LightGBM: mean ROC AUC over a 3-fold stratified CV.

    Parameters
    ----------
    trial : optuna trial object used to draw the hyperparameters.
    X, y : features and target the pipeline is cross-validated on.
    preprocessor : transformer placed in front of the classifier in the pipeline.

    Returns
    -------
    float
        Mean of the per-fold ``roc_auc`` scores.
    """
    # Searched hyperparameters (drawn in this order).
    sampled = dict(
        n_estimators=trial.suggest_int('n_estimators', 100, 1000),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1),
        num_leaves=trial.suggest_int('num_leaves', 20, 100),
        feature_fraction=trial.suggest_float('feature_fraction', 0.6, 1.0),
        bagging_fraction=trial.suggest_float('bagging_fraction', 0.6, 1.0),
        bagging_freq=trial.suggest_int('bagging_freq', 1, 7),
    )
    # Settings held constant across trials.
    classifier = LGBMClassifier(**sampled, random_state=42, n_jobs=-1, verbose=-1)

    candidate = Pipeline([('preprocessor', preprocessor), ('model', classifier)])

    # 3 shuffled stratified folds with a fixed seed.
    folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    fold_aucs = cross_val_score(candidate, X, y, cv=folds, scoring='roc_auc', n_jobs=-1)
    return fold_aucs.mean()

if RUN_TUNING:
    print("Starting Hyperparameter Tuning (this may take a while)...")
    # Tune on a random subset of up to 50,000 rows. The size is capped at len(X)
    # because DataFrame.sample raises ValueError when asked for more rows than
    # exist (sampling without replacement).
    X_tune = X.sample(n=min(50000, len(X)), random_state=42)
    y_tune = y.loc[X_tune.index]

    # Seed the samplers so the sequence of suggested hyperparameters is
    # repeatable across runs (given the same objective values).
    study_xgb = optuna.create_study(direction='maximize',
                                    sampler=optuna.samplers.TPESampler(seed=42))
    study_xgb.optimize(partial(objective_xgb, X=X_tune, y=y_tune, preprocessor=tree_preprocessor), n_trials=20)
    best_xgb_params = study_xgb.best_params
    print(f"Best XGB params: {best_xgb_params}")

    study_lgbm = optuna.create_study(direction='maximize',
                                     sampler=optuna.samplers.TPESampler(seed=42))
    study_lgbm.optimize(partial(objective_lgbm, X=X_tune, y=y_tune, preprocessor=tree_preprocessor), n_trials=20)
    best_lgbm_params = study_lgbm.best_params
    print(f"Best LGBM params: {best_lgbm_params}")
else:
    # Fallback to params found in local testing (tuned on 10k rows)
    print("Using pre-tuned parameters...")
    best_xgb_params = {'n_estimators': 152, 'learning_rate': 0.045, 'max_depth': 5, 'subsample': 0.74, 'colsample_bytree': 0.92, 'reg_alpha': 8.43, 'reg_lambda': 6.62}
    best_lgbm_params = {'n_estimators': 126, 'learning_rate': 0.049, 'num_leaves': 39, 'feature_fraction': 0.76, 'bagging_fraction': 0.87, 'bagging_freq': 5}

In [None]:
print("Training Stacking Classifier on FULL dataset...")

# --- Base learners -----------------------------------------------------------
# Gradient-boosting models take the selected hyperparameters plus fixed settings.
best_xgb = XGBClassifier(
    **best_xgb_params,
    random_state=42,
    n_jobs=-1,
    eval_metric='auc',
)
best_lgbm = LGBMClassifier(
    **best_lgbm_params,
    random_state=42,
    n_jobs=-1,
    verbose=-1,
)
# CatBoost and the MLP use fixed, hand-picked settings.
cat_clf = CatBoostClassifier(
    iterations=500,
    learning_rate=0.05,
    depth=6,
    random_state=42,
    verbose=0,
    allow_writing_files=False,
)
mlp_clf = MLPClassifier(
    hidden_layer_sizes=(128, 64),
    activation='relu',
    solver='adam',
    alpha=0.0001,
    batch_size='auto',
    learning_rate='adaptive',
    learning_rate_init=0.001,
    max_iter=200,
    random_state=42,
    early_stopping=True,
)


def _with_preprocessing(preprocessor, model):
    """Wrap ``model`` in a two-step pipeline: 'preprocessor' then 'model'."""
    return Pipeline([('preprocessor', preprocessor), ('model', model)])


# --- Pipelines: tree models share the tree preprocessor, the MLP gets its own -
pipe_xgb = _with_preprocessing(tree_preprocessor, best_xgb)
pipe_lgbm = _with_preprocessing(tree_preprocessor, best_lgbm)
pipe_cat = _with_preprocessing(tree_preprocessor, cat_clf)
pipe_mlp = _with_preprocessing(nn_preprocessor, mlp_clf)

estimators = list(zip(
    ('xgb', 'lgbm', 'cat', 'mlp'),
    (pipe_xgb, pipe_lgbm, pipe_cat, pipe_mlp),
))

# --- Stacking ensemble: logistic-regression meta-learner, 5-fold CV -----------
# passthrough=False: the meta-learner sees only the base learners' predictions.
meta_learner = LogisticRegression()
stacking_clf = StackingClassifier(
    estimators=estimators,
    final_estimator=meta_learner,
    cv=5,
    n_jobs=-1,
    passthrough=False,
)

stacking_clf.fit(X, y)
print("Training Complete!")

In [None]:
print("Generating predictions...")
# predict_proba returns one column per class; column 1 is taken as the
# probability of the positive class (1 = Default after the target inversion).
class_probabilities = stacking_clf.predict_proba(X_test)
test_probabilities = class_probabilities[:, 1]

# Submission frame: original test ids alongside the predicted default probability.
submission_columns = {
    'id': test_df['id'],
    'loan_default': test_probabilities,
}
submission = pd.DataFrame(submission_columns)

submission.to_csv('final_submission_colab.csv', index=False)
print("Saved submission to final_submission_colab.csv")

# On Colab, uncomment to download the file to your machine:
# from google.colab import files
# files.download('final_submission_colab.csv')