## IMPORTS

In [56]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, classification_report, confusion_matrix
import lightgbm as lgb
import time
import warnings
import os
warnings.filterwarnings('ignore')

np.random.seed(42)
print("✓ All imports successful")

✓ All imports successful


## Importing the dataset

In [57]:
csv_path = r"C:\Users\user\Documents\dev\selastone_loan_default\archive"

df = pd.read_csv(os.path.join(csv_path, 'Loan_Default.csv'))
print(f"✓ Data loaded: {df.shape}")
print(f"\nDataset shape: {df.shape[0]:,} rows × {df.shape[1]} columns")
print(f"\nFirst rows:")
print(df.head())

✓ Data loaded: (148670, 34)

Dataset shape: 148,670 rows × 34 columns

First rows:
      ID  year loan_limit             Gender approv_in_adv loan_type  \
0  24890  2019         cf  Sex Not Available         nopre     type1   
1  24891  2019         cf               Male         nopre     type2   
2  24892  2019         cf               Male           pre     type1   
3  24893  2019         cf               Male         nopre     type1   
4  24894  2019         cf              Joint           pre     type1   

  loan_purpose Credit_Worthiness open_credit business_or_commercial  ...  \
0           p1                l1        nopc                  nob/c  ...   
1           p1                l1        nopc                    b/c  ...   
2           p1                l1        nopc                  nob/c  ...   
3           p4                l1        nopc                  nob/c  ...   
4           p1                l1        nopc                  nob/c  ...   

   credit_type  Credit_Scor

## EXPLORE & CLEAN

In [58]:
print("\n" + "="*70)
print("DATA EXPLORATION & CLEANING")
print("="*70)

# Check target variable
print(f"\nTarget Variable (Status):")
print(df['Status'].value_counts())
print(f"Default Rate: {df['Status'].mean():.2%}")

# Remove ID column (not a feature)
df = df.drop(['ID'], axis=1)

# Missing values
print(f"\nMissing Values Summary:")
missing_pct = (df.isnull().sum() / len(df) * 100).sort_values(ascending=False)
print(missing_pct[missing_pct > 0].head(15))

# Drop columns with >40% missing
drop_cols = missing_pct[missing_pct > 40].index.tolist()
print(f"\nDropping {len(drop_cols)} columns with >40% missing:")
print(drop_cols)
df = df.drop(columns=drop_cols)

print(f"\nDataset shape after cleaning: {df.shape}")


DATA EXPLORATION & CLEANING

Target Variable (Status):
Status
0    112031
1     36639
Name: count, dtype: int64
Default Rate: 24.64%

Missing Values Summary:
Upfront_charges              26.664425
Interest_rate_spread         24.644515
rate_of_interest             24.509989
dtir1                        16.224524
LTV                          10.155378
property_value               10.155378
income                        6.154571
loan_limit                    2.249277
approv_in_adv                 0.610749
submission_of_application     0.134526
age                           0.134526
loan_purpose                  0.090133
Neg_ammortization             0.081388
term                          0.027578
dtype: float64

Dropping 0 columns with >40% missing:
[]

Dataset shape after cleaning: (148670, 33)


## FEATURE ENGINEERING

In [59]:
print("\n" + "="*70)
print("FEATURE ENGINEERING")
print("="*70)

# Extract numeric and categorical columns
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

# Remove target from features
if 'Status' in numeric_cols:
    numeric_cols.remove('Status')

print(f"Numeric columns ({len(numeric_cols)}): {numeric_cols}")
print(f"\nCategorical columns ({len(categorical_cols)}): {categorical_cols}")

# Fill missing numeric values with median
for col in numeric_cols:
    if df[col].isnull().sum() > 0:
        df[col].fillna(df[col].median(), inplace=True)

# Fill missing categorical values with mode
for col in categorical_cols:
    if df[col].isnull().sum() > 0:
        df[col].fillna(df[col].mode()[0], inplace=True)

print(f"\n✓ Missing values filled")

# Create derived features
df['loan_to_income'] = df['loan_amount'] / (df['income'] + 1)
df['loan_to_property'] = df['loan_amount'] / (df['property_value'] + 1)
df['credit_to_income'] = df['Credit_Score'] / (df['income'] + 1)

# Add new features to numeric columns
new_features = ['loan_to_income', 'loan_to_property', 'credit_to_income']
numeric_cols.extend(new_features)

print(f"✓ Created {len(new_features)} derived features")
print(f"✓ Total numeric features: {len(numeric_cols)}")


FEATURE ENGINEERING
Numeric columns (11): ['year', 'loan_amount', 'rate_of_interest', 'Interest_rate_spread', 'Upfront_charges', 'term', 'property_value', 'income', 'Credit_Score', 'LTV', 'dtir1']

Categorical columns (21): ['loan_limit', 'Gender', 'approv_in_adv', 'loan_type', 'loan_purpose', 'Credit_Worthiness', 'open_credit', 'business_or_commercial', 'Neg_ammortization', 'interest_only', 'lump_sum_payment', 'construction_type', 'occupancy_type', 'Secured_by', 'total_units', 'credit_type', 'co-applicant_credit_type', 'age', 'submission_of_application', 'Region', 'Security_Type']

✓ Missing values filled
✓ Created 3 derived features
✓ Total numeric features: 14


## PREPARE FEATURES FOR MODELING

In [60]:
from sklearn.preprocessing import StandardScaler, LabelEncoder

print("\n" + "="*70)
print("PREPARE FEATURES")
print("="*70)

# Separate X and y
X = df[numeric_cols + categorical_cols].copy()
y = df['Status'].copy()

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Target distribution:")
print(f"  0 (No Default): {(y == 0).sum():,}")
print(f"  1 (Default): {(y == 1).sum():,}")

# Encode categorical variables
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))
    label_encoders[col] = le

print(f"✓ Encoded {len(categorical_cols)} categorical columns")

# Handle outliers (clip at 1st and 99th percentiles)
for col in numeric_cols:
    q1 = X[col].quantile(0.01)
    q99 = X[col].quantile(0.99)
    X[col] = X[col].clip(q1, q99)

print(f"✓ Handled outliers")
print(f"\nFinal features: {X.columns.tolist()}")


PREPARE FEATURES
Features shape: (148670, 35)
Target shape: (148670,)
Target distribution:
  0 (No Default): 112,031
  1 (Default): 36,639
✓ Encoded 21 categorical columns
✓ Handled outliers

Final features: ['year', 'loan_amount', 'rate_of_interest', 'Interest_rate_spread', 'Upfront_charges', 'term', 'property_value', 'income', 'Credit_Score', 'LTV', 'dtir1', 'loan_to_income', 'loan_to_property', 'credit_to_income', 'loan_limit', 'Gender', 'approv_in_adv', 'loan_type', 'loan_purpose', 'Credit_Worthiness', 'open_credit', 'business_or_commercial', 'Neg_ammortization', 'interest_only', 'lump_sum_payment', 'construction_type', 'occupancy_type', 'Secured_by', 'total_units', 'credit_type', 'co-applicant_credit_type', 'age', 'submission_of_application', 'Region', 'Security_Type']


## TRAIN-TEST SPLIT

In [61]:
from sklearn.model_selection import train_test_split

print("\n" + "="*70)
print("TRAIN-TEST SPLIT")
print("="*70)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training set: {X_train.shape}")
print(f"  Default rate: {y_train.mean():.2%}")
print(f"\nTest set: {X_test.shape}")
print(f"  Default rate: {y_test.mean():.2%}")

feature_names = X.columns.tolist()


TRAIN-TEST SPLIT
Training set: (118936, 35)
  Default rate: 24.64%

Test set: (29734, 35)
  Default rate: 24.65%


## SCALE FEATURES

In [62]:
print("\n" + "="*70)
print("SCALE FEATURES")
print("="*70)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"✓ Features scaled (mean=0, std=1)")
print(f"Training set shape: {X_train_scaled.shape}")
print(f"Test set shape: {X_test_scaled.shape}")


SCALE FEATURES
✓ Features scaled (mean=0, std=1)
Training set shape: (118936, 35)
Test set shape: (29734, 35)


## HANDLE CLASS IMBALANCE

In [63]:
from imblearn.over_sampling import SMOTE

print("\n" + "="*70)
print("HANDLE CLASS IMBALANCE (SMOTE)")
print("="*70)

# smote = SMOTE(random_state=42, k_neighbors=5)
smote = SMOTE(
    k_neighbors=3,        # Default is 5, try 3 for sparse regions
    sampling_strategy=0.8, # 0.8 instead of 1.0 (keep some imbalance)
    random_state=42
)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_scaled, y_train)

print(f"Before SMOTE:")
print(f"  Shape: {X_train_scaled.shape}")
print(f"  Defaults: {y_train.sum():,} ({y_train.mean():.2%})")

print(f"\nAfter SMOTE:")
print(f"  Shape: {X_train_balanced.shape}")
print(f"  Defaults: {y_train_balanced.sum():,} ({y_train_balanced.mean():.2%})")


HANDLE CLASS IMBALANCE (SMOTE)
Before SMOTE:
  Shape: (118936, 35)
  Defaults: 29,311 (24.64%)

After SMOTE:
  Shape: (161325, 35)
  Defaults: 71,700 (44.44%)


## BASELINE MODEL (YOUR CURRENT MODEL)

In [64]:
print("\n" + "="*70)
print("BASELINE LIGHTGBM MODEL (YOUR CURRENT PARAMS)")
print("="*70)

# Calculate scale_pos_weight for imbalance
scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()

baseline_lgb = lgb.LGBMClassifier(
    n_estimators=500,
    max_depth=4,
    learning_rate=0.03,
    subsample=0.7,
    colsample_bytree=0.7,
    min_child_samples=20,
    reg_lambda=1.0,
    reg_alpha=0.5,
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    n_jobs=-1,
    verbose=-1
)

print(f"Training baseline with your current parameters...")
print(f"  n_estimators: 500")
print(f"  max_depth: 4")
print(f"  learning_rate: 0.03")
print(f"  subsample: 0.7")
print(f"  colsample_bytree: 0.7")
print(f"  min_child_samples: 20")
print(f"  reg_lambda: 1.0")
print(f"  reg_alpha: 0.5")
print(f"  scale_pos_weight: {scale_pos_weight:.2f}")

baseline_lgb.fit(X_train_balanced, y_train_balanced)

# Evaluate baseline
baseline_pred_proba = baseline_lgb.predict_proba(X_test)[:, 1]
baseline_auc = roc_auc_score(y_test, baseline_pred_proba)
baseline_f1 = f1_score(y_test, baseline_lgb.predict(X_test))
baseline_accuracy = accuracy_score(y_test, baseline_lgb.predict(X_test))

print(f"\n✓ Baseline model trained")
print(f"\nBaseline Results:")
print(f"  AUC-ROC:  {baseline_auc:.4f}")
print(f"  F1-Score: {baseline_f1:.4f}")
print(f"  Accuracy: {baseline_accuracy:.4f}")


BASELINE LIGHTGBM MODEL (YOUR CURRENT PARAMS)
Training baseline with your current parameters...
  n_estimators: 500
  max_depth: 4
  learning_rate: 0.03
  subsample: 0.7
  colsample_bytree: 0.7
  min_child_samples: 20
  reg_lambda: 1.0
  reg_alpha: 0.5
  scale_pos_weight: 3.06

✓ Baseline model trained

Baseline Results:
  AUC-ROC:  0.4144
  F1-Score: 0.0000
  Accuracy: 0.7534


## DEFINE HYPERPARAMETER GRID (TUNING YOUR PARAMS)

In [65]:
print("\n" + "="*70)
print("HYPERPARAMETER GRID - TUNING YOUR PARAMETERS")
print("="*70)

# Grid of hyperparameters to search (focusing on your parameters)
param_grid = {
    'n_estimators': [300, 400, 500, 600, 700],          # Currently: 500
    'max_depth': [3, 4, 5, 6, 7],                        # Currently: 4
    'learning_rate': [0.01, 0.02, 0.03, 0.05, 0.1],     # Currently: 0.03
    'subsample': [0.6, 0.65, 0.7, 0.75, 0.8],           # Currently: 0.7
    'colsample_bytree': [0.6, 0.65, 0.7, 0.75, 0.8],    # Currently: 0.7
    'min_child_samples': [10, 15, 20, 25, 30],          # Currently: 20
    'reg_lambda': [0.5, 1.0, 1.5, 2.0],                 # Currently: 1.0
    'reg_alpha': [0.0, 0.25, 0.5, 1.0]                  # Currently: 0.5
}

print("Hyperparameter Grid (8 parameters):")
for param, values in param_grid.items():
    current = {
        'n_estimators': 500,
        'max_depth': 4,
        'learning_rate': 0.03,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'min_child_samples': 20,
        'reg_lambda': 1.0,
        'reg_alpha': 0.5
    }
    marker = " ← CURRENT" if param in current and current[param] in values else ""
    print(f"  {param}: {values}{marker}")

total_combinations = 1
for values in param_grid.values():
    total_combinations *= len(values)
print(f"\nTotal combinations to test: {total_combinations:,}")
print(f"With 5-fold CV: {total_combinations * 5:,} model trainings")
print(f"Estimated time: 60-120 minutes (or more)")


HYPERPARAMETER GRID - TUNING YOUR PARAMETERS
Hyperparameter Grid (8 parameters):
  n_estimators: [300, 400, 500, 600, 700] ← CURRENT
  max_depth: [3, 4, 5, 6, 7] ← CURRENT
  learning_rate: [0.01, 0.02, 0.03, 0.05, 0.1] ← CURRENT
  subsample: [0.6, 0.65, 0.7, 0.75, 0.8] ← CURRENT
  colsample_bytree: [0.6, 0.65, 0.7, 0.75, 0.8] ← CURRENT
  min_child_samples: [10, 15, 20, 25, 30] ← CURRENT
  reg_lambda: [0.5, 1.0, 1.5, 2.0] ← CURRENT
  reg_alpha: [0.0, 0.25, 0.5, 1.0] ← CURRENT

Total combinations to test: 250,000
With 5-fold CV: 1,250,000 model trainings
Estimated time: 60-120 minutes (or more)


## OPTION A - FULL GRIDSEARCHCV (BEST RESULTS, SLOWER)

In [66]:
print("\n" + "="*70)
print("OPTION A: FULL GRIDSEARCHCV (TAKES 1-2 HOURS)")
print("="*70)

print("\n⚠️  WARNING: Full grid search with 8 parameters is VERY slow!")
print("Uncomment below to run. Otherwise, skip to Option B.")

# UNCOMMENT TO RUN FULL GRID SEARCH (NOT RECOMMENDED FOR FIRST TIME)
"""
grid_search_full = GridSearchCV(
    estimator=lgb.LGBMClassifier(
        scale_pos_weight=scale_pos_weight,
        random_state=42,
        n_jobs=-1,
        verbose=-1
    ),
    param_grid=param_grid,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1
)

print("\nRunning full grid search (this will take 60-120 minutes)...")
start_time = time.time()
grid_search_full.fit(X_train_balanced, y_train_balanced)
elapsed = time.time() - start_time

print(f"\n✓ Completed in {elapsed/60:.1f} minutes")
print(f"Best CV AUC: {grid_search_full.best_score_:.4f}")
print(f"Best parameters: {grid_search_full.best_params_}")
"""



OPTION A: FULL GRIDSEARCHCV (TAKES 1-2 HOURS)

Uncomment below to run. Otherwise, skip to Option B.


'\ngrid_search_full = GridSearchCV(\n    estimator=lgb.LGBMClassifier(\n        scale_pos_weight=scale_pos_weight,\n        random_state=42,\n        n_jobs=-1,\n        verbose=-1\n    ),\n    param_grid=param_grid,\n    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),\n    scoring=\'roc_auc\',\n    n_jobs=-1,\n    verbose=1\n)\n\nprint("\nRunning full grid search (this will take 60-120 minutes)...")\nstart_time = time.time()\ngrid_search_full.fit(X_train_balanced, y_train_balanced)\nelapsed = time.time() - start_time\n\nprint(f"\n✓ Completed in {elapsed/60:.1f} minutes")\nprint(f"Best CV AUC: {grid_search_full.best_score_:.4f}")\nprint(f"Best parameters: {grid_search_full.best_params_}")\n'

## OPTION B - REDUCED GRID (FASTER, RECOMMENDED)

In [67]:
print("\n" + "="*70)
print("OPTION B: REDUCED GRIDSEARCHCV (TAKES 15-30 MINUTES) ⭐ RECOMMENDED")
print("="*70)

# Reduced grid: focus on key parameters
param_grid_reduced = {
    'max_depth': [3, 4, 5, 6],              # Drop 7 (usually overfits)
    'learning_rate': [0.02, 0.03, 0.05],   # Drop 0.01, 0.1 (extremes)
    'subsample': [0.65, 0.7, 0.75],        # Drop 0.6, 0.8 (extremes)
    'colsample_bytree': [0.65, 0.7, 0.75], # Drop 0.6, 0.8 (extremes)
    'min_child_samples': [15, 20, 25],     # Drop 10, 30 (extremes)
    'reg_lambda': [0.5, 1.0, 1.5],         # Drop 2.0 (extreme)
    'reg_alpha': [0.25, 0.5, 1.0]          # Drop 0.0 (no regularization)
}

print("Reduced Hyperparameter Grid (7 parameters, better values):")
for param, values in param_grid_reduced.items():
    print(f"  {param}: {values}")

total_combinations_reduced = 1
for values in param_grid_reduced.values():
    total_combinations_reduced *= len(values)
print(f"\nTotal combinations to test: {total_combinations_reduced:,}")
print(f"With 5-fold CV: {total_combinations_reduced * 5:,} model trainings")
print(f"Estimated time: 15-30 minutes")


OPTION B: REDUCED GRIDSEARCHCV (TAKES 15-30 MINUTES) ⭐ RECOMMENDED
Reduced Hyperparameter Grid (7 parameters, better values):
  max_depth: [3, 4, 5, 6]
  learning_rate: [0.02, 0.03, 0.05]
  subsample: [0.65, 0.7, 0.75]
  colsample_bytree: [0.65, 0.7, 0.75]
  min_child_samples: [15, 20, 25]
  reg_lambda: [0.5, 1.0, 1.5]
  reg_alpha: [0.25, 0.5, 1.0]

Total combinations to test: 2,916
With 5-fold CV: 14,580 model trainings
Estimated time: 15-30 minutes


## RUN REDUCED GRIDSEARCHCV

In [68]:
from sklearn.model_selection import RandomizedSearchCV

print("\n" + "="*70)
print("RUNNING RANDOMIZEDSEARCHCV (3-5 MINUTES)")
print("="*70)

# param_dist = {
#     'max_depth': [3, 4, 5, 6, 7, 8],
#     'learning_rate': [0.01, 0.03, 0.05, 0.1, 0.2],
#     'num_leaves': [20, 31, 50, 63, 100],
#     'subsample': [0.6, 0.7, 0.8, 0.9],
# }

lgb_base = lgb.LGBMClassifier(
    n_estimators=200,
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    n_jobs=-1,
    verbose=-1
)

random_search = RandomizedSearchCV(
    estimator=lgb_base,
    param_distributions=param_grid_reduced,
    n_iter=20,                 # Only test 20 random combinations
    cv=3,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=2,
    random_state=42
)

print("Starting random search...")
start_time = time.time()
random_search.fit(X_train_balanced, y_train_balanced)
elapsed_time = time.time() - start_time

print(f"\n✓ Completed in {elapsed_time/60:.1f} minutes")
print(f"  Best AUC: {random_search.best_score_:.4f}")
print(f"  Best params: {random_search.best_params_}")



RUNNING RANDOMIZEDSEARCHCV (3-5 MINUTES)
Starting random search...
Fitting 3 folds for each of 20 candidates, totalling 60 fits

✓ Completed in 2.2 minutes
  Best AUC: 1.0000
  Best params: {'subsample': 0.7, 'reg_lambda': 0.5, 'reg_alpha': 0.5, 'min_child_samples': 25, 'max_depth': 6, 'learning_rate': 0.05, 'colsample_bytree': 0.65}


## ANALYZE RESULTS

In [69]:
print("\n" + "="*70)
print("RandomizedSearchCV RESULTS")
print("="*70)

print(f"\nBest Parameters Found:")
best_params = random_search.best_params_
for param, value in sorted(best_params.items()):
    print(f"  {param}: {value}")

print(f"\nBest CV Score (AUC-ROC): {random_search.best_score_:.4f}")

# Get best model
best_lgb = random_search.best_estimator_

# Evaluate on test set
best_pred_proba = best_lgb.predict_proba(X_test)[:, 1]
best_auc = roc_auc_score(y_test, best_pred_proba)
best_f1 = f1_score(y_test, best_lgb.predict(X_test))
best_accuracy = accuracy_score(y_test, best_lgb.predict(X_test))

print(f"\nBest Model Test Performance:")
print(f"  AUC-ROC:  {best_auc:.4f}")
print(f"  F1-Score: {best_f1:.4f}")
print(f"  Accuracy: {best_accuracy:.4f}")


RandomizedSearchCV RESULTS

Best Parameters Found:
  colsample_bytree: 0.65
  learning_rate: 0.05
  max_depth: 6
  min_child_samples: 25
  reg_alpha: 0.5
  reg_lambda: 0.5
  subsample: 0.7

Best CV Score (AUC-ROC): 1.0000

Best Model Test Performance:
  AUC-ROC:  0.3899
  F1-Score: 0.0000
  Accuracy: 0.7534


## COMPARE BASELINE VS TUNED

In [70]:
print("\n" + "="*70)
print("BASELINE vs TUNED COMPARISON")
print("="*70)

# ========== TUNED MODEL EVALUATION ==========
print("Evaluating tuned model...")
best_pred = best_lgb.predict(X_test)
best_pred_proba = best_lgb.predict_proba(X_test)[:, 1]

best_auc = roc_auc_score(y_test, best_pred_proba)
best_f1 = f1_score(y_test, best_pred)
best_accuracy = accuracy_score(y_test, best_pred)

# ========== SAFE IMPROVEMENT CALCULATION ==========
def safe_improvement(new_val, baseline_val):
    """Calculate % improvement, handle division by zero"""
    if baseline_val == 0:
        return float('inf') if new_val > 0 else 0
    return ((new_val - baseline_val) / baseline_val) * 100

improvement_auc = safe_improvement(best_auc, baseline_auc)
improvement_f1 = safe_improvement(best_f1, baseline_f1)
improvement_accuracy = safe_improvement(best_accuracy, baseline_accuracy)

# ========== DISPLAY COMPARISON ==========
print(f"\nMetric Comparison:")
print(f"{'Metric':<20} {'Baseline':<12} {'Tuned':<12} {'Improvement':<15}")
print("-" * 60)
print(f"{'AUC-ROC':<20} {baseline_auc:<12.4f} {best_auc:<12.4f} {improvement_auc:+.2f}%")

if baseline_f1 > 0:
    print(f"{'F1-Score':<20} {baseline_f1:<12.4f} {best_f1:<12.4f} {improvement_f1:+.2f}%")
else:
    print(f"{'F1-Score':<20} {baseline_f1:<12.4f} {best_f1:<12.4f} {'N/A':<15}")

print(f"{'Accuracy':<20} {baseline_accuracy:<12.4f} {best_accuracy:<12.4f} {improvement_accuracy:+.2f}%")

# ========== INTERPRETATION ==========
print("\n" + "="*70)
if best_auc > baseline_auc:
    print(f"✓ Tuning improved model performance!")
    print(f"  AUC improvement: {improvement_auc:+.2f}%")
    if improvement_f1 > 0:
        print(f"  F1-Score improvement: {improvement_f1:+.2f}%")
    if improvement_accuracy > 0:
        print(f"  Accuracy improvement: {improvement_accuracy:+.2f}%")
elif best_auc == baseline_auc:
    print(f"~ No change in AUC after tuning")
    print(f"  Baseline params already near optimal")
else:
    print(f"⚠️ Tuning decreased AUC performance")
    print(f"  AUC drop: {improvement_auc:.2f}%")
    print(f"  Recommendation: Use baseline model")

print("="*70)




BASELINE vs TUNED COMPARISON
Evaluating tuned model...

Metric Comparison:
Metric               Baseline     Tuned        Improvement    
------------------------------------------------------------
AUC-ROC              0.4144       0.3899       -5.89%
F1-Score             0.0000       0.0000       N/A            
Accuracy             0.7534       0.7534       +0.00%

⚠️ Tuning decreased AUC performance
  AUC drop: -5.89%
  Recommendation: Use baseline model


## REMOVE SUSPECTED LEAKY FEATURE

In [73]:
print("\n" + "="*70)
print("CHECKING FOR DATA LEAKAGE")
print("="*70)

# ========== CONVERT TO DATAFRAME IF NEEDED ==========
if isinstance(X_train_balanced, np.ndarray):
    X_train_balanced = pd.DataFrame(X_train_balanced, columns=feature_names)
if isinstance(X_test, np.ndarray):
    X_test = pd.DataFrame(X_test, columns=feature_names)

# ========== REMOVE SUSPECTED LEAKY FEATURE ==========
print("\nRetraining WITHOUT 'Interest_rate_spread'...")

# Drop the suspicious feature
X_train_no_leak = X_train_balanced.drop('Interest_rate_spread', axis=1)
X_test_no_leak = X_test.drop('Interest_rate_spread', axis=1)

feature_names_no_leak = [col for col in feature_names if col != 'Interest_rate_spread']

print(f"Original features: {X_train_balanced.shape[1]}")
print(f"After removing 'interest_rate_spread': {X_train_no_leak.shape[1]}")

# ========== RETRAIN WITH GRID SEARCH ==========
from sklearn.model_selection import RandomizedSearchCV

param_dist = {
    'max_depth': [3, 4, 5, 6, 7, 8],
    'learning_rate': [0.01, 0.03, 0.05, 0.1, 0.2],
    'num_leaves': [20, 31, 50, 63, 100],
    'subsample': [0.6, 0.7, 0.8, 0.9],
    'colsample_bytree': [0.6, 0.7, 0.8],
    'reg_lambda': [0.5, 1.0, 1.5],
    'reg_alpha': [0.0, 0.5, 1.0],
}

lgb_base = lgb.LGBMClassifier(
    n_estimators=200,
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    n_jobs=-1,
    verbose=-1
)

random_search = RandomizedSearchCV(
    estimator=lgb_base,
    param_distributions=param_dist,
    n_iter=20,
    cv=3,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=2,
    random_state=42
)

print("Starting random search without leaky feature...")
start_time = time.time()
random_search.fit(X_train_no_leak, y_train_balanced)
elapsed_time = time.time() - start_time

best_model_no_leak = random_search.best_estimator_

print(f"\n✓ Completed in {elapsed_time/60:.1f} minutes")
print(f"  Best CV AUC: {random_search.best_score_:.4f}")
print(f"  Best params: {random_search.best_params_}")

# ========== EVALUATE WITHOUT LEAKY FEATURE ==========
print("\n" + "="*70)
print("COMPARISON: WITH vs WITHOUT LEAKY FEATURE")
print("="*70)

# Model WITH leaky feature (original)
best_pred_with_leak = best_lgb.predict(X_test)
best_auc_with_leak = roc_auc_score(y_test, best_lgb.predict_proba(X_test)[:, 1])

# Model WITHOUT leaky feature (new)
best_pred_no_leak = best_model_no_leak.predict(X_test_no_leak)
best_auc_no_leak = roc_auc_score(y_test, best_model_no_leak.predict_proba(X_test_no_leak)[:, 1])

print(f"\nAUC Comparison:")
print(f"  WITH 'interest_rate_spread':    {best_auc_with_leak:.4f}")
print(f"  WITHOUT 'interest_rate_spread': {best_auc_no_leak:.4f}")
print(f"  Drop:                           {best_auc_with_leak - best_auc_no_leak:.4f}")

# ========== LEAKAGE DIAGNOSIS ==========
auc_drop = best_auc_with_leak - best_auc_no_leak

if auc_drop > 0.1:
    print(f"\n⚠️  LEAKAGE CONFIRMED")
    print(f"    'interest_rate_spread' encodes target information")
    print(f"    Use model WITHOUT this feature for production")
    print(f"    True realistic AUC: {best_auc_no_leak:.4f}")
elif auc_drop > 0.05:
    print(f"\n⚠️  LIKELY LEAKAGE")
    print(f"    Feature contributes disproportionately to predictions")
    print(f"    Investigate feature engineering process")
else:
    print(f"\n✓ Feature appears legitimate")
    print(f"    Minimal impact when removed")
    print(f"    Keep for production (use original model)")

print("="*70)


CHECKING FOR DATA LEAKAGE

Retraining WITHOUT 'Interest_rate_spread'...
Original features: 35
After removing 'interest_rate_spread': 34
Starting random search without leaky feature...
Fitting 3 folds for each of 20 candidates, totalling 60 fits

✓ Completed in 2.6 minutes
  Best CV AUC: 0.9997
  Best params: {'subsample': 0.8, 'reg_lambda': 1.5, 'reg_alpha': 1.0, 'num_leaves': 50, 'max_depth': 3, 'learning_rate': 0.2, 'colsample_bytree': 0.6}

COMPARISON: WITH vs WITHOUT LEAKY FEATURE

AUC Comparison:
  WITH 'interest_rate_spread':    0.3899
  WITHOUT 'interest_rate_spread': 0.3542
  Drop:                           0.0357

✓ Feature appears legitimate
    Minimal impact when removed
    Keep for production (use original model)
