# 05 – Optimized Model with EDA Insights

This notebook implements the optimized preprocessing strategy derived from EDA:
- **Log-transform** `annual_income` to handle skew.
- **Ordinal Encode** `grade_subgrade` to preserve rank information.
- **Keep outliers** as tree-based models handle them well.
- **One-Hot Encode** other categorical features.

In [None]:
import sys
import os
sys.path.append(os.path.abspath(".."))
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold

# Input CSVs are resolved relative to the working directory.
DATA_DIR = Path('../data')
train_path, test_path = (DATA_DIR / fname for fname in ('train.csv', 'test.csv'))

train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

# The full training set is used by default. For quicker experiments, uncomment
# the line below to work on a 10,000-row random sample instead.
# train_df = train_df.sample(n=10000, random_state=42)
train_df.shape

## Feature Engineering and Preprocessing

We define specific transformers for different column types.

In [None]:
# Split the training frame into the label (y) and the remaining columns (X).
target_col = 'loan_paid_back'
y = train_df[target_col]
X = train_df.drop(columns=[target_col])

# Column groups: each list is routed to its own preprocessing branch below.
log_features = ['annual_income']
numeric_features = [
    'debt_to_income_ratio',
    'credit_score',
    'loan_amount',
    'interest_rate',
]
ordinal_features = ['grade_subgrade']
categorical_features = [
    'loan_purpose',
    'gender',
    'marital_status',
    'education_level',
    'employment_status',
]

# 1. annual_income branch: median-impute, compress the skew with log1p,
#    then standardize.
log_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('log', FunctionTransformer(func=np.log1p, validate=False)),
    ('scaler', StandardScaler()),
]
log_transformer = Pipeline(log_steps)

# 2. Remaining numeric columns: median-impute, then standardize.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(numeric_steps)

# 3. Ordinal Transformer for grade_subgrade
# The labels are ranked alphabetically (expected to run A1, A2, ... F5 — TODO
# confirm against the data), and the encoder maps them to 0..N-1 in that order.
# Whether A1 is the "best" grade is an assumption; for tree models the
# direction of the scale does not matter, only that the order is preserved.
#
# Missing values are dropped before sorting: NaN is a float and cannot be
# compared with the string labels, so sorted() would raise TypeError if the
# column had any gaps. The imputer below fills those gaps before encoding.
grades = sorted(X['grade_subgrade'].dropna().unique())
ordinal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # Labels not present in `grades` (e.g. unseen at fit time) are encoded as -1.
    ('ordinal', OrdinalEncoder(categories=[grades], handle_unknown='use_encoded_value', unknown_value=-1))
])

# 4. Nominal categoricals: fill gaps with the most frequent label, then
#    one-hot encode. Categories unseen during fit are ignored (all-zero row).
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(categorical_steps)

# Route each column group to its transformer. Columns not listed in any group
# are dropped (ColumnTransformer's default remainder='drop').
column_branches = [
    ('log', log_transformer, log_features),
    ('num', numeric_transformer, numeric_features),
    ('ord', ordinal_transformer, ordinal_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_branches)

## Model Selection

We test GradientBoosting and RandomForest, as they are likely to benefit from the ordinal encoding and are robust to the remaining noise. LogisticRegression is included as a linear baseline.

In [None]:
# Hyperparameters are kept in plain dicts so they are easy to scan and tweak.
rf_params = {
    'n_estimators': 300,
    'max_depth': 10,  # shallow trees to limit overfitting on noisy data
    'min_samples_leaf': 5,
    'random_state': 42,
}
gb_params = {
    'n_estimators': 200,
    'learning_rate': 0.1,
    'max_depth': 5,
    'random_state': 42,
}

# Candidate estimators, keyed by the name used in the results table.
models = {
    'LogisticRegression': LogisticRegression(max_iter=1000),
    'RandomForest': RandomForestClassifier(**rf_params),
    'GradientBoosting': GradientBoostingClassifier(**gb_params),
}

# Stratified 5-fold CV, shuffled with a fixed seed so folds are repeatable.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
results = []

# Evaluate on the full training set. For a quick check on a large dataset,
# swap in a sample instead:
# X_sample = X.sample(10000, random_state=42)
# y_sample = y.loc[X_sample.index]
X_train_eval, y_train_eval = X, y

for name, model in models.items():
    print(f"Training {name}...")
    # Preprocessing sits inside the pipeline so it is re-fit on each training fold.
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
    scores = cross_val_score(
        pipeline,
        X_train_eval,
        y_train_eval,
        cv=skf,
        scoring='roc_auc',
        n_jobs=-1,
    )
    mean_auc, std_auc = scores.mean(), scores.std()
    results.append({'model': name, 'mean_auc': mean_auc, 'std_auc': std_auc})
    print(f"{name} AUC: {mean_auc:.4f} (+/- {std_auc:.4f})")

# Rank candidates by mean AUC, best first.
results_df = (
    pd.DataFrame(results)
    .sort_values('mean_auc', ascending=False)
    .reset_index(drop=True)
)
results_df

In [None]:
# results_df is sorted by mean AUC (descending), so row 0 is the winner.
best_model_name = results_df.loc[0, 'model']
best_model = models[best_model_name]
print(f"Best model: {best_model_name}")

# Refit preprocessing + the winning estimator on the full training data.
best_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', best_model)])
best_pipeline.fit(X, y)

# Predict the probability of the positive class (column 1) for the test set.
test_probabilities = best_pipeline.predict_proba(test_df)[:, 1]
submission = pd.DataFrame({
    'id': test_df['id'],
    'loan_paid_back': test_probabilities
})

# Write next to the input CSVs. The previous hard-coded 'data/...' path was
# relative to the working directory and did not match DATA_DIR ('../data'),
# so it pointed at a different (likely non-existent) folder.
submission_path = DATA_DIR / 'optimized_submission.csv'
submission.to_csv(submission_path, index=False)
print(f"Saved submission to {submission_path}")