## IMPORTS

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, classification_report, confusion_matrix
import lightgbm as lgb
import time
import warnings
import os
warnings.filterwarnings('ignore')

np.random.seed(42)
print("✓ All imports successful")

✓ All imports successful


## Importing the dataset

In [2]:
csv_path = r"C:\Users\user\Documents\dev\selastone_loan_default\archive"

df = pd.read_csv(os.path.join(csv_path, 'Loan_Default.csv'))
print(f"✓ Data loaded: {df.shape}")
print(f"\nDataset shape: {df.shape[0]:,} rows × {df.shape[1]} columns")
print(f"\nFirst rows:")
print(df.head())

✓ Data loaded: (148670, 34)

Dataset shape: 148,670 rows × 34 columns

First rows:
      ID  year loan_limit             Gender approv_in_adv loan_type  \
0  24890  2019         cf  Sex Not Available         nopre     type1   
1  24891  2019         cf               Male         nopre     type2   
2  24892  2019         cf               Male           pre     type1   
3  24893  2019         cf               Male         nopre     type1   
4  24894  2019         cf              Joint           pre     type1   

  loan_purpose Credit_Worthiness open_credit business_or_commercial  ...  \
0           p1                l1        nopc                  nob/c  ...   
1           p1                l1        nopc                    b/c  ...   
2           p1                l1        nopc                  nob/c  ...   
3           p4                l1        nopc                  nob/c  ...   
4           p1                l1        nopc                  nob/c  ...   

   credit_type  Credit_Scor

## EXPLORE & CLEAN

In [3]:
print("\n" + "="*70)
print("DATA EXPLORATION & CLEANING")
print("="*70)

# Check target variable
print(f"\nTarget Variable (Status):")
print(df['Status'].value_counts())
print(f"Default Rate: {df['Status'].mean():.2%}")

# Remove ID column (not a feature)
df = df.drop(['ID'], axis=1)

# Missing values
print(f"\nMissing Values Summary:")
missing_pct = (df.isnull().sum() / len(df) * 100).sort_values(ascending=False)
print(missing_pct[missing_pct > 0].head(15))

# Drop columns with >40% missing
drop_cols = missing_pct[missing_pct > 40].index.tolist()
print(f"\nDropping {len(drop_cols)} columns with >40% missing:")
print(drop_cols)
df = df.drop(columns=drop_cols)

print(f"\nDataset shape after cleaning: {df.shape}")


DATA EXPLORATION & CLEANING

Target Variable (Status):
Status
0    112031
1     36639
Name: count, dtype: int64
Default Rate: 24.64%

Missing Values Summary:
Upfront_charges              26.664425
Interest_rate_spread         24.644515
rate_of_interest             24.509989
dtir1                        16.224524
LTV                          10.155378
property_value               10.155378
income                        6.154571
loan_limit                    2.249277
approv_in_adv                 0.610749
submission_of_application     0.134526
age                           0.134526
loan_purpose                  0.090133
Neg_ammortization             0.081388
term                          0.027578
dtype: float64

Dropping 0 columns with >40% missing:
[]

Dataset shape after cleaning: (148670, 33)


## FEATURE ENGINEERING

In [4]:
print("\n" + "="*70)
print("FEATURE ENGINEERING")
print("="*70)

# Extract numeric and categorical columns
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

# Remove target from features
if 'Status' in numeric_cols:
    numeric_cols.remove('Status')

print(f"Numeric columns ({len(numeric_cols)}): {numeric_cols}")
print(f"\nCategorical columns ({len(categorical_cols)}): {categorical_cols}")

# Fill missing numeric values with median
for col in numeric_cols:
    if df[col].isnull().sum() > 0:
        df[col].fillna(df[col].median(), inplace=True)

# Fill missing categorical values with mode
for col in categorical_cols:
    if df[col].isnull().sum() > 0:
        df[col].fillna(df[col].mode()[0], inplace=True)

print(f"\n✓ Missing values filled")

# Create derived features
df['loan_to_income'] = df['loan_amount'] / (df['income'] + 1)
df['loan_to_property'] = df['loan_amount'] / (df['property_value'] + 1)
df['credit_to_income'] = df['Credit_Score'] / (df['income'] + 1)

# Add new features to numeric columns
new_features = ['loan_to_income', 'loan_to_property', 'credit_to_income']
numeric_cols.extend(new_features)

print(f"✓ Created {len(new_features)} derived features")
print(f"✓ Total numeric features: {len(numeric_cols)}")


FEATURE ENGINEERING
Numeric columns (11): ['year', 'loan_amount', 'rate_of_interest', 'Interest_rate_spread', 'Upfront_charges', 'term', 'property_value', 'income', 'Credit_Score', 'LTV', 'dtir1']

Categorical columns (21): ['loan_limit', 'Gender', 'approv_in_adv', 'loan_type', 'loan_purpose', 'Credit_Worthiness', 'open_credit', 'business_or_commercial', 'Neg_ammortization', 'interest_only', 'lump_sum_payment', 'construction_type', 'occupancy_type', 'Secured_by', 'total_units', 'credit_type', 'co-applicant_credit_type', 'age', 'submission_of_application', 'Region', 'Security_Type']

✓ Missing values filled
✓ Created 3 derived features
✓ Total numeric features: 14


## PREPARE FEATURES FOR MODELING

In [5]:
from sklearn.preprocessing import StandardScaler, LabelEncoder

print("\n" + "="*70)
print("PREPARE FEATURES")
print("="*70)

# Separate X and y
X = df[numeric_cols + categorical_cols].copy()
y = df['Status'].copy()

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Target distribution:")
print(f"  0 (No Default): {(y == 0).sum():,}")
print(f"  1 (Default): {(y == 1).sum():,}")

# Encode categorical variables
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))
    label_encoders[col] = le

print(f"✓ Encoded {len(categorical_cols)} categorical columns")

# Handle outliers (clip at 1st and 99th percentiles)
for col in numeric_cols:
    q1 = X[col].quantile(0.01)
    q99 = X[col].quantile(0.99)
    X[col] = X[col].clip(q1, q99)

print(f"✓ Handled outliers")
print(f"\nFinal features: {X.columns.tolist()}")


PREPARE FEATURES
Features shape: (148670, 35)
Target shape: (148670,)
Target distribution:
  0 (No Default): 112,031
  1 (Default): 36,639
✓ Encoded 21 categorical columns
✓ Handled outliers

Final features: ['year', 'loan_amount', 'rate_of_interest', 'Interest_rate_spread', 'Upfront_charges', 'term', 'property_value', 'income', 'Credit_Score', 'LTV', 'dtir1', 'loan_to_income', 'loan_to_property', 'credit_to_income', 'loan_limit', 'Gender', 'approv_in_adv', 'loan_type', 'loan_purpose', 'Credit_Worthiness', 'open_credit', 'business_or_commercial', 'Neg_ammortization', 'interest_only', 'lump_sum_payment', 'construction_type', 'occupancy_type', 'Secured_by', 'total_units', 'credit_type', 'co-applicant_credit_type', 'age', 'submission_of_application', 'Region', 'Security_Type']


## TRAIN-TEST SPLIT

In [6]:
from sklearn.model_selection import train_test_split

print("\n" + "="*70)
print("TRAIN-TEST SPLIT")
print("="*70)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training set: {X_train.shape}")
print(f"  Default rate: {y_train.mean():.2%}")
print(f"\nTest set: {X_test.shape}")
print(f"  Default rate: {y_test.mean():.2%}")

feature_names = X.columns.tolist()


TRAIN-TEST SPLIT
Training set: (118936, 35)
  Default rate: 24.64%

Test set: (29734, 35)
  Default rate: 24.65%


## SCALE FEATURES

In [7]:
print("\n" + "="*70)
print("SCALE FEATURES")
print("="*70)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"✓ Features scaled (mean=0, std=1)")
print(f"Training set shape: {X_train_scaled.shape}")
print(f"Test set shape: {X_test_scaled.shape}")


SCALE FEATURES
✓ Features scaled (mean=0, std=1)
Training set shape: (118936, 35)
Test set shape: (29734, 35)


## HANDLE CLASS IMBALANCE

In [8]:
from imblearn.over_sampling import SMOTE

print("\n" + "="*70)
print("HANDLE CLASS IMBALANCE (SMOTE)")
print("="*70)

# smote = SMOTE(random_state=42, k_neighbors=5)
smote = SMOTE(
    k_neighbors=3,        # Default is 5, try 3 for sparse regions
    sampling_strategy=0.8, # 0.8 instead of 1.0 (keep some imbalance)
    random_state=42
)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_scaled, y_train)

print(f"Before SMOTE:")
print(f"  Shape: {X_train_scaled.shape}")
print(f"  Defaults: {y_train.sum():,} ({y_train.mean():.2%})")

print(f"\nAfter SMOTE:")
print(f"  Shape: {X_train_balanced.shape}")
print(f"  Defaults: {y_train_balanced.sum():,} ({y_train_balanced.mean():.2%})")


HANDLE CLASS IMBALANCE (SMOTE)
Before SMOTE:
  Shape: (118936, 35)
  Defaults: 29,311 (24.64%)

After SMOTE:
  Shape: (161325, 35)
  Defaults: 71,700 (44.44%)


## BASELINE MODEL (YOUR CURRENT MODEL)

In [10]:
from sklearn.linear_model import LogisticRegression

print("\n" + "="*70)
print("BASELINE LOGISTIC REGRESSION MODEL (YOUR CURRENT PARAMS)")
print("="*70)

baseline_lr = LogisticRegression(
    max_iter=1000,
    C=1.0,
    penalty='l2',
    solver='lbfgs',
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)

print(f"Training baseline with your current parameters...")
print(f"  max_iter: 1000")
print(f"  C: 1.0")
print(f"  penalty: l2")
print(f"  solver: lbfgs")
print(f"  class_weight: balanced")
print(f"  random_state: 42")

baseline_lr.fit(X_train_balanced, y_train_balanced)

# Evaluate baseline
baseline_pred_proba = baseline_lr.predict_proba(X_test)[:, 1]
baseline_auc = roc_auc_score(y_test, baseline_pred_proba)
baseline_f1 = f1_score(y_test, baseline_lr.predict(X_test))
baseline_accuracy = accuracy_score(y_test, baseline_lr.predict(X_test))

print(f"\n✓ Baseline model trained")
print(f"\nBaseline Results:")
print(f"  AUC-ROC:  {baseline_auc:.4f}")
print(f"  F1-Score: {baseline_f1:.4f}")
print(f"  Accuracy: {baseline_accuracy:.4f}")


BASELINE LOGISTIC REGRESSION MODEL (YOUR CURRENT PARAMS)
Training baseline with your current parameters...
  max_iter: 1000
  C: 1.0
  penalty: l2
  solver: lbfgs
  class_weight: balanced
  random_state: 42

✓ Baseline model trained

Baseline Results:
  AUC-ROC:  0.5000
  F1-Score: 0.0000
  Accuracy: 0.7535


## DEFINE HYPERPARAMETER GRID (TUNING YOUR PARAMS)

In [12]:
print("\n" + "="*70)
print("HYPERPARAMETER GRID - TUNING YOUR PARAMETERS")
print("="*70)

param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],               # Currently: 1.0
    'penalty': ['l1', 'l2'],                             # Currently: l2
    'solver': ['liblinear', 'lbfgs'],                    # Currently: lbfgs
}

print("Hyperparameter Grid (3 parameters):")
current = {
    'C': 1.0,
    'penalty': 'l2',
    'solver': 'lbfgs'
}

for param, values in param_grid.items():
    if param == 'C':
        marker = " ← CURRENT" if current['C'] in values else ""
        print(f"  {param}: {values}{marker}")
    elif param == 'penalty':
        marker = " ← CURRENT" if current['penalty'] in values else ""
        print(f"  {param}: {values}{marker}")
    elif param == 'solver':
        marker = " ← CURRENT" if current['solver'] in values else ""
        print(f"  {param}: {values}{marker}")

total_combinations = 1
for values in param_grid.values():
    total_combinations *= len(values)

print(f"\nTotal combinations to test: {total_combinations:,}")
print(f"With 5-fold CV: {total_combinations * 5:,} model trainings")
print(f"Estimated time: 5-10 minutes")


HYPERPARAMETER GRID - TUNING YOUR PARAMETERS
Hyperparameter Grid (3 parameters):
  C: [0.001, 0.01, 0.1, 1, 10, 100] ← CURRENT
  penalty: ['l1', 'l2'] ← CURRENT
  solver: ['liblinear', 'lbfgs'] ← CURRENT

Total combinations to test: 24
With 5-fold CV: 120 model trainings
Estimated time: 5-10 minutes


## OPTION A - FULL GRIDSEARCHCV (BEST RESULTS, SLOWER)

In [14]:
print("\n" + "="*70)
print("RUNNING FULL GRIDSEARCHCV (5-10 MINUTES)")
print("="*70)

from sklearn.model_selection import GridSearchCV, StratifiedKFold
import time

grid_search = GridSearchCV(
    estimator=LogisticRegression(
        max_iter=1000,
        class_weight='balanced',
        random_state=42,
        n_jobs=-1
    ),
    param_grid=param_grid,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1
)

print("\nStarting full grid search...")
print(f"Testing {total_combinations} parameter combinations")
print(f"With 5-fold CV: {total_combinations * 5} model trainings\n")

start_time = time.time()
grid_search.fit(X_train_balanced, y_train_balanced)
elapsed = time.time() - start_time

print(f"\n✓ Completed in {elapsed/60:.1f} minutes")

print(f"\nBest CV AUC: {grid_search.best_score_:.4f}")
print(f"\nBest Parameters:")
for param, value in grid_search.best_params_.items():
    print(f"  {param}: {value}")

# Get best model
best_lr = grid_search.best_estimator_

# Evaluate on test set
best_pred_proba = best_lr.predict_proba(X_test)[:, 1]
best_auc = roc_auc_score(y_test, best_pred_proba)
best_f1 = f1_score(y_test, best_lr.predict(X_test))
best_accuracy = accuracy_score(y_test, best_lr.predict(X_test))

print(f"\nBest Model Test Performance:")
print(f"  AUC-ROC:  {best_auc:.4f}")
print(f"  F1-Score: {best_f1:.4f}")
print(f"  Accuracy: {best_accuracy:.4f}")

# Compare with baseline (handle division by zero)
print(f"\n" + "="*70)
print("BASELINE vs TUNED COMPARISON")
print("="*70)

improvement_auc = ((best_auc - baseline_auc) / baseline_auc) * 100 if baseline_auc > 0 else 0
improvement_f1 = ((best_f1 - baseline_f1) / baseline_f1) * 100 if baseline_f1 > 0 else float('inf')

print(f"\n{'Metric':<20} {'Baseline':<12} {'Tuned':<12} {'Improvement':<15}")
print("-" * 60)
print(f"{'AUC-ROC':<20} {baseline_auc:<12.4f} {best_auc:<12.4f} {improvement_auc:+.2f}%")

if baseline_f1 > 0:
    print(f"{'F1-Score':<20} {baseline_f1:<12.4f} {best_f1:<12.4f} {improvement_f1:+.2f}%")
else:
    print(f"{'F1-Score':<20} {baseline_f1:<12.4f} {best_f1:<12.4f} {'N/A (baseline=0)':<15}")

if best_auc > baseline_auc:
    print(f"\n✓ Tuning improved model performance!")
else:
    print(f"\n⚠️ Tuning did not improve AUC")


RUNNING FULL GRIDSEARCHCV (5-10 MINUTES)

Starting full grid search...
Testing 24 parameter combinations
With 5-fold CV: 120 model trainings

Fitting 5 folds for each of 24 candidates, totalling 120 fits

✓ Completed in 1.2 minutes

Best CV AUC: 0.7855

Best Parameters:
  C: 0.01
  penalty: l2
  solver: lbfgs

Best Model Test Performance:
  AUC-ROC:  0.5000
  F1-Score: 0.0000
  Accuracy: 0.7535

BASELINE vs TUNED COMPARISON

Metric               Baseline     Tuned        Improvement    
------------------------------------------------------------
AUC-ROC              0.5000       0.5000       +0.00%
F1-Score             0.0000       0.0000       N/A (baseline=0)

⚠️ Tuning did not improve AUC
