# 06 – Advanced Model with Feature Engineering & Ensembling

This notebook pushes the performance further by:
1.  **Feature Engineering**: Creating interaction terms like `loan_to_income` and `monthly_debt`.
2.  **Advanced Models**: Using XGBoost, LightGBM, and CatBoost.
3.  **Ensembling**: Combining predictions using a Voting Classifier.

In [None]:
import sys
import os
sys.path.append(os.path.abspath(".."))
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.ensemble import VotingClassifier

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Input CSVs are resolved relative to the directory the notebook is run from.
DATA_DIR = Path('../data')
train_path, test_path = DATA_DIR / 'train.csv', DATA_DIR / 'test.csv'

# Read both splits up front; later cells use train_df and test_df.
train_df, test_df = pd.read_csv(train_path), pd.read_csv(test_path)

print(f"Train shape: {train_df.shape}")

## Feature Engineering

We create new features that might help the model understand the borrower's financial situation better.

In [None]:
def feature_engineering(df):
    """Return a copy of ``df`` with three derived financial columns appended.

    The input frame is not modified. Columns read: ``loan_amount``,
    ``annual_income``, ``debt_to_income_ratio`` and ``interest_rate``.

    Added columns:
        loan_to_income: ``loan_amount / (annual_income + 1)``. The ``+ 1``
            keeps the denominator non-zero for a zero income. Per the
            original author's note, this is distinct from
            ``debt_to_income_ratio`` (which typically covers ALL debt) and
            the two were observed to be nearly uncorrelated.
        monthly_debt: ``(annual_income / 12) * debt_to_income_ratio``, an
            estimate of the monthly debt payment.
        interest_burden: ``loan_amount * (interest_rate / 100)``, a rough
            proxy for the interest owed (``interest_rate`` is treated as a
            percentage).
    """
    monthly_income = df['annual_income'] / 12
    rate_fraction = df['interest_rate'] / 100

    # Idea left disabled by the author (trees find such splits on their own):
    # df['credit_score_bin'] = pd.cut(df['credit_score'], bins=[0, 580, 670, 740, 800, 850], labels=['Poor', 'Fair', 'Good', 'Very Good', 'Excellent'])

    # assign() builds a new frame, so the caller's DataFrame stays untouched.
    return df.assign(
        loan_to_income=df['loan_amount'] / (df['annual_income'] + 1),
        monthly_debt=monthly_income * df['debt_to_income_ratio'],
        interest_burden=df['loan_amount'] * rate_fraction,
    )

target_col = 'loan_paid_back'

# Apply the same feature engineering to both splits.
X_train_full, X_test_full = (
    feature_engineering(frame) for frame in (train_df, test_df)
)

# Separate the label from the training features; the test frame is used as-is.
y = X_train_full[target_col]
X = X_train_full.drop(columns=target_col)
X_test = X_test_full

## Preprocessing Pipeline

We update the pipeline to include the new columns.

In [None]:
# Define column groups
# Each group is routed to its own transformer in the ColumnTransformer below.
# Columns not listed in any group are dropped by the ColumnTransformer's
# default `remainder` setting.
log_features = ['annual_income']
numeric_features = ['debt_to_income_ratio', 'credit_score', 'loan_amount', 'interest_rate',
                    'loan_to_income', 'monthly_debt', 'interest_burden']
ordinal_features = ['grade_subgrade']
categorical_features = ['loan_purpose', 'gender', 'marital_status', 'education_level', 'employment_status']

# Transformers
# annual_income: median-impute, compress with log1p, then standardize.
log_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('log', FunctionTransformer(np.log1p, validate=False)),
    ('scaler', StandardScaler())
])

# Remaining numeric columns (including the engineered ones): median-impute and standardize.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Ordered category list for the ordinal encoder, taken from the training data.
# Missing values are dropped first: a NaN among the labels would either make
# sorted() raise (str vs. float comparison) or end up as a category of its own,
# and the imputer below fills NaNs before the encoder sees the column anyway.
# NOTE(review): assumes plain sorted order matches the intended grade ranking — confirm.
grades = sorted(X['grade_subgrade'].dropna().unique())
ordinal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # Labels not present in `grades` are encoded as -1 instead of raising.
    ('ordinal', OrdinalEncoder(categories=[grades], handle_unknown='use_encoded_value', unknown_value=-1))
])

# Nominal columns: fill with the mode, then one-hot encode; categories unseen
# during fit are ignored at transform time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('log', log_transformer, log_features),
        ('num', numeric_transformer, numeric_features),
        ('ord', ordinal_transformer, ordinal_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

## Model Training & Ensembling

We define the advanced models and a voting classifier.

In [None]:
def _with_preprocessing(model):
    """Return a two-step pipeline: the shared `preprocessor`, then `model`."""
    return Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])


# Three gradient-boosting learners, each configured for 500 boosting rounds at
# a 0.05 learning rate with seed 42.
xgb_clf = XGBClassifier(
    eval_metric='auc',
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    n_jobs=-1,
    random_state=42,
)

lgbm_clf = LGBMClassifier(
    n_estimators=500,
    learning_rate=0.05,
    num_leaves=31,
    n_jobs=-1,
    random_state=42,
    verbose=-1,
)

cat_clf = CatBoostClassifier(
    iterations=500,
    learning_rate=0.05,
    depth=6,
    random_state=42,
    allow_writing_files=False,
    verbose=0,
)

# One preprocessing + model pipeline per learner.
pipe_xgb = _with_preprocessing(xgb_clf)
pipe_lgbm = _with_preprocessing(lgbm_clf)
pipe_cat = _with_preprocessing(cat_clf)

# Soft-voting ensemble over the three pipelines.
ensemble_members = [
    ('xgb', pipe_xgb),
    ('lgbm', pipe_lgbm),
    ('cat', pipe_cat),
]
voting_clf = VotingClassifier(estimators=ensemble_members, voting='soft')

# Seeded, shuffled, stratified 5-fold splitter used to score the ensemble.
skf = StratifiedKFold(shuffle=True, random_state=42, n_splits=5)

print("Evaluating Ensemble...")
# For a quicker sanity check, score on a subsample instead:
# X_sample = X.sample(20000, random_state=42)
# y_sample = y.loc[X_sample.index]
# NOTE(review): n_jobs=-1 here is combined with n_jobs=-1 on the XGBoost and
# LightGBM models — check for CPU oversubscription on large runs.
scores = cross_val_score(
    estimator=voting_clf,
    X=X,
    y=y,
    scoring='roc_auc',
    cv=skf,
    n_jobs=-1,
)
print(f"Ensemble AUC: {scores.mean():.4f} (+/- {scores.std():.4f})")

In [None]:
# Fit on full data
voting_clf.fit(X, y)

# Predict: keep column 1 of predict_proba for every test row.
test_probabilities = voting_clf.predict_proba(X_test)[:, 1]
submission = pd.DataFrame({
    'id': test_df['id'],
    'loan_paid_back': test_probabilities
})

# Write the submission into DATA_DIR, the same directory the input CSVs were
# read from. The previous hard-coded 'data/...' path was relative to the
# working directory, i.e. a different folder from '../data'.
submission_path = DATA_DIR / 'advanced_submission.csv'
submission.to_csv(submission_path, index=False)
print(f"Saved submission to {submission_path}")