In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.ensemble import StackingClassifier, GradientBoostingClassifier, RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score, classification_report
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from tqdm import tqdm


In [2]:
# Paths for train and test datasets
TRAIN_SET_PATH = '../data/processed/train_data_selected_engineered.csv'
TEST_SET_PATH = '../data/processed/test_data_selected_engineered.csv'

# Load the dataset
train_data = pd.read_csv(TRAIN_SET_PATH)
test_data = pd.read_csv(TEST_SET_PATH)

In [3]:
# Split the data into training and validation sets
train_set, validation_set = train_test_split(train_data, test_size=0.2, random_state=42)

# Save the resulting datasets
TRAIN_SPLIT_PATH = '../data/processed/train_split.csv'
VALIDATION_SPLIT_PATH = '../data/processed/validation_split.csv'
train_set.to_csv(TRAIN_SPLIT_PATH, index=False)
validation_set.to_csv(VALIDATION_SPLIT_PATH, index=False)

# Display summary of the training and validation sets
print(f"Training set size: {train_set.shape}")
print(f"Validation set size: {validation_set.shape}")


Training set size: (46916, 16)
Validation set size: (11729, 16)


In [4]:

# Prepare features and target variable
X_train = train_set.drop(columns=['loan_status'])
y_train = train_set['loan_status']
X_validation = validation_set.drop(columns=['loan_status'])
y_validation = validation_set['loan_status']

# Apply SMOTE to handle class imbalance
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)


In [5]:

# Define base models for stacking
base_models = [
    ('xgboost', XGBClassifier(n_estimators=100, random_state=42, use_label_encoder=False, eval_metric='logloss')),
    ('catboost', CatBoostClassifier(n_estimators=100, random_state=42, verbose=0)),
    ('random_forest', RandomForestClassifier(n_estimators=100, random_state=42))
]

# Define the meta-model
meta_model = LogisticRegression(solver='lbfgs', max_iter=1000, random_state=42)

# Create the stacking classifier
stacking_model = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    n_jobs=-1
)




In [7]:
# Train the stacking model with progress bar
print("Training the stacking model...")
for _ in tqdm(range(1), desc='Training Stacking Model'):
    stacking_model.fit(X_train_resampled, y_train_resampled)

Training the stacking model...


Training Stacking Model: 100%|█████████████████████████████████████████████████████████████████████| 1/1 [01:22<00:00, 82.58s/it]


In [8]:
# Make predictions on the validation set
y_pred = stacking_model.predict(X_validation)
y_pred_proba = stacking_model.predict_proba(X_validation)[:, 1]

# Evaluate the model
roc_auc = roc_auc_score(y_validation, y_pred_proba)
print(f"Stacking Model - ROC AUC Score: {roc_auc}")
print(classification_report(y_validation, y_pred))

Stacking Model - ROC AUC Score: 0.9459910713455544
              precision    recall  f1-score   support

           0       0.96      0.97      0.96     10087
           1       0.81      0.73      0.76      1642

    accuracy                           0.94     11729
   macro avg       0.88      0.85      0.86     11729
weighted avg       0.94      0.94      0.94     11729



In [None]:

# Save the trained stacking model
import joblib
STACKING_MODEL_PATH = '../models/stacking_model.pkl'
joblib.dump(stacking_model, STACKING_MODEL_PATH)
print(f"Stacking model saved to {STACKING_MODEL_PATH}")
