In [27]:
import pandas as pd# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.manifold import TSNE

from catboost import CatBoostClassifier
from optuna import create_study
import optuna

from imblearn.over_sampling import SMOTE
from tqdm import tqdm

import seaborn as sns

In [11]:
# Paths for train and test datasets
TRAIN_SET_PATH = '../data/processed/train_data_selected_engineered.csv'
TEST_SET_PATH = '../data/processed/test_data_selected_engineered.csv'

# Load the dataset
train_data = pd.read_csv(TRAIN_SET_PATH)
test_data = pd.read_csv(TEST_SET_PATH)

In [12]:
# Split the data into training and validation sets
train_set, validation_set = train_test_split(train_data, test_size=0.2, random_state=42)

# Save the resulting datasets
TRAIN_SPLIT_PATH = '../data/processed/train_split.csv'
VALIDATION_SPLIT_PATH = '../data/processed/validation_split.csv'
train_set.to_csv(TRAIN_SPLIT_PATH, index=False)
validation_set.to_csv(VALIDATION_SPLIT_PATH, index=False)

# Display summary of the training and validation sets
print(f"Training set size: {train_set.shape}")
print(f"Validation set size: {validation_set.shape}")


Training set size: (46916, 16)
Validation set size: (11729, 16)


# Model Building

In [15]:
# Prepare features and target variable
X_train = train_set.drop(columns=['loan_status'])
y_train = train_set['loan_status']
X_validation = validation_set.drop(columns=['loan_status'])
y_validation = validation_set['loan_status']


In [16]:
# Apply SMOTE to handle class imbalance
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [21]:
# Initialize tree-based models to compare
models = {
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, random_state=42),
    "AdaBoost": AdaBoostClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(n_estimators=100, random_state=42, use_label_encoder=False, eval_metric='logloss')
}

# Train and evaluate each model
for model_name, model in models.items():
    print(f"\nTraining {model_name}...")
    model.fit(X_train_resampled, y_train_resampled)

    # Make predictions on the validation set
    y_pred = model.predict(X_validation)
    y_pred_proba = model.predict_proba(X_validation)[:, 1]

    # Evaluate the model
    roc_auc = roc_auc_score(y_validation, y_pred_proba)
    print(f"{model_name} - ROC AUC Score: {roc_auc}")
    print(classification_report(y_validation, y_pred))





Training Gradient Boosting...
Gradient Boosting - ROC AUC Score: 0.9250002143350415
              precision    recall  f1-score   support

           0       0.96      0.96      0.96     10087
           1       0.73      0.73      0.73      1642

    accuracy                           0.92     11729
   macro avg       0.84      0.84      0.84     11729
weighted avg       0.92      0.92      0.92     11729


Training AdaBoost...
AdaBoost - ROC AUC Score: 0.9040645712387491
              precision    recall  f1-score   support

           0       0.96      0.93      0.94     10087
           1       0.62      0.74      0.67      1642

    accuracy                           0.90     11729
   macro avg       0.79      0.83      0.81     11729
weighted avg       0.91      0.90      0.90     11729


Training XGBoost...
XGBoost - ROC AUC Score: 0.9480683401544201
              precision    recall  f1-score   support

           0       0.95      0.98      0.96     10087
           1       0

## Catboost model

In [26]:
# Function to optimize CatBoost hyperparameters using Optuna
def objective(trial):
    param = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'depth': trial.suggest_int('depth', 3, 10),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-5, 10),
        'random_strength': trial.suggest_uniform('random_strength', 1, 10),
        'bagging_temperature': trial.suggest_uniform('bagging_temperature', 0, 10),
        'random_state': 42,
        'eval_metric': 'AUC',
        'use_best_model': True,
        'logging_level': 'Silent'
    }
    model = CatBoostClassifier(**param)
    model.fit(X_train_resampled, y_train_resampled, eval_set=[(X_validation, y_validation)], early_stopping_rounds=50, verbose=False)
    y_pred_proba = model.predict_proba(X_validation)[:, 1]
    auc = roc_auc_score(y_validation, y_pred_proba)
    return auc

In [28]:
# Create Optuna study and optimize hyperparameters with progress bar
study = create_study(direction='maximize')
for _ in tqdm(range(50), desc='Optimizing CatBoost Hyperparameters'):
    study.optimize(objective, n_trials=1, n_jobs=-1)


[I 2024-10-17 15:33:24,894] A new study created in memory with name: no-name-2d03c073-dd33-4c44-8e71-00d7527af8b2
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-5, 10),
  'random_strength': trial.suggest_uniform('random_strength', 1, 10),
  'bagging_temperature': trial.suggest_uniform('bagging_temperature', 0, 10),
[I 2024-10-17 15:33:49,911] Trial 0 finished with value: 0.9327055590781638 and parameters: {'iterations': 746, 'depth': 6, 'learning_rate': 0.019594395344222023, 'l2_leaf_reg': 0.5345778121627672, 'random_strength': 3.8389921038631303, 'bagging_temperature': 2.1363348365788615}. Best is trial 0 with value: 0.9327055590781638.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-5, 10),
  'random_strength': trial.suggest_uniform('random_strength', 1, 10),
  'bagging_temperature': trial.suggest_uniform('bagging_t

In [29]:

# Train the CatBoost model with the best parameters
best_params = study.best_params
best_params.update({'random_state': 42, 'eval_metric': 'AUC', 'use_best_model': True})
model = CatBoostClassifier(**best_params)
model.fit(X_train_resampled, y_train_resampled, eval_set=[(X_validation, y_validation)], early_stopping_rounds=50, verbose=False)


<catboost.core.CatBoostClassifier at 0x26082b3de20>

In [30]:

# Make predictions on the validation set
y_pred = model.predict(X_validation)
y_pred_proba = model.predict_proba(X_validation)[:, 1]


In [31]:

# Evaluate the model
roc_auc = roc_auc_score(y_validation, y_pred_proba)
print(f"Optimized CatBoost - ROC AUC Score: {roc_auc}")
print(classification_report(y_validation, y_pred))

Optimized CatBoost - ROC AUC Score: 0.9492886914296292
              precision    recall  f1-score   support

           0       0.96      0.98      0.97     10087
           1       0.84      0.72      0.77      1642

    accuracy                           0.94     11729
   macro avg       0.90      0.85      0.87     11729
weighted avg       0.94      0.94      0.94     11729



## Submission

In [37]:
# Load the test dataset

X_test = test_data
ids = pd.read_csv("../data/raw/test.csv")['id'].values

# Make predictions on the test set
y_test_pred_proba = model.predict_proba(X_test)[:, 1]

# Create a DataFrame for submission
submission = pd.DataFrame({'id': ids, 'loan_status': y_test_pred_proba})

# Save the submission to a CSV file
SUBMISSION_PATH = '../data/processed/submission.csv'
submission.to_csv(SUBMISSION_PATH, index=False)

print(f"Submission file saved to {SUBMISSION_PATH}")

Submission file saved to ../data/processed/submission.csv
