# Import relevant modules and data

In [1]:
# Import relevant modules
import lightgbm as lgb
import optuna
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
import os
import sys
import warnings
from imblearn.over_sampling import SMOTE 
from sklearn.metrics import roc_auc_score
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load modified & cleaned personal loan data into a Pandas DataFrame
df_loan_data = pd.read_pickle('df_loan_data.pkl')

# Transform data to suit the LGBM algorithm

LGBM models do not require scaling, even though the data is pre-scaled from the data_preparation notebook.

LGBM models do perform better with balanced classes (and this data has reasonably strong imbalance). This re-balancing will be handled within the model parameters

# Optimize the model as constructed in LGBM.ipynb

In [3]:
# Define the target variable
target_variable = 'personal_loan'

In [4]:
# Suppress LightGBM UserWarning
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning, module="lightgbm")

In [5]:
# Separate features and target variable
X = df_loan_data.drop(columns=[target_variable])
y = df_loan_data[target_variable]

In [6]:
# Set up cross-validation
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

In [7]:
# Define the objective function for Optuna
def objective(trial):
    params = {
        'objective': 'binary',
        'boosting_type': 'gbdt',
        'metric': 'auc',
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'n_estimators': trial.suggest_int('n_estimators', 100, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 6),
        'subsample': trial.suggest_float('subsample', 0.5, 0.7),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.5, 0.7),
        'num_leaves': trial.suggest_int('num_leaves', 20, 50),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 20),  # Adjust the range as needed
    }

    auc_scores = []

    # Perform cross-validation
    for train_index, valid_index in skf.split(X, y):
        X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        # Apply SMOTE for class balancing
        smote = SMOTE(sampling_strategy='auto', random_state=42)
        X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

        # Create a LightGBM Dataset
        train_data = lgb.Dataset(X_train_resampled, label=y_train_resampled)
        valid_data = lgb.Dataset(X_valid, label=y_valid, reference=train_data)

        # Redirect output to null device to suppress all output
        devnull = open(os.devnull, 'w')
        stdout_backup = sys.stdout
        sys.stdout = devnull

        # Train the model with custom early stopping and 'log_evaluation' callback
        callbacks = [lgb.callback.log_evaluation(period=1)]
        bst = lgb.train(params, train_data, valid_sets=[valid_data], callbacks=callbacks)

        # Restore the standard output
        sys.stdout = stdout_backup

        # Evaluate the model on the validation set
        auc = bst.best_score['valid_0']['auc']
        auc_scores.append(auc)

    # Calculate the mean AUC score across cross-validation folds
    mean_auc = np.mean(auc_scores)
    return mean_auc

In [8]:
# Create an Optuna study and optimize the objective function
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

[I 2023-10-13 22:07:50,750] A new study created in memory with name: no-name-6b7eaf59-f8b8-49a2-9be0-a32c92d78825
[I 2023-10-13 22:07:52,339] Trial 0 finished with value: 0.9491438778024144 and parameters: {'learning_rate': 0.042610869903360854, 'n_estimators': 249, 'max_depth': 4, 'subsample': 0.6433951357590532, 'feature_fraction': 0.6934786290291781, 'num_leaves': 27, 'min_child_samples': 17}. Best is trial 0 with value: 0.9491438778024144.
[I 2023-10-13 22:07:54,540] Trial 1 finished with value: 0.9456836659275684 and parameters: {'learning_rate': 0.06927835588411228, 'n_estimators': 233, 'max_depth': 6, 'subsample': 0.6015721928232459, 'feature_fraction': 0.6622876754807476, 'num_leaves': 24, 'min_child_samples': 7}. Best is trial 0 with value: 0.9491438778024144.
[I 2023-10-13 22:07:55,309] Trial 2 finished with value: 0.9497770386794777 and parameters: {'learning_rate': 0.08076751291440921, 'n_estimators': 184, 'max_depth': 3, 'subsample': 0.5487115584734424, 'feature_fraction':

In [9]:
# Get the best hyperparameters
best_params = study.best_params
print('Best Hyperparameters: {}'.format(best_params))

Best Hyperparameters: {'learning_rate': 0.04386668304533329, 'n_estimators': 187, 'max_depth': 3, 'subsample': 0.558386012743904, 'feature_fraction': 0.6054855311843562, 'num_leaves': 23, 'min_child_samples': 11}


In [10]:
# Save the best params
with open('LGBM_best_params.pkl', 'wb') as file:
    pickle.dump(best_params, file)

# Train and evaluate a model using the best hyperparameters

In [11]:
# Initialize lists to store results from best model
auc_scores = []

In [12]:
# Perform cross-validation
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Apply SMOTE to oversample the minority class
    smote = SMOTE(sampling_strategy='auto', random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

    # Create a LightGBM Dataset
    train_data = lgb.Dataset(X_train_resampled, label=y_train_resampled)
    test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

    # Train the LightGBM model
    num_round = 100
    bst = lgb.train(best_params, train_data, num_round, valid_sets=[test_data], early_stopping_rounds=10, verbose_eval=False)

    # Make predictions
    y_pred = bst.predict(X_test, num_iteration=bst.best_iteration)

    # Calculate ROC AUC score
    auc = roc_auc_score(y_test, y_pred)
    auc_scores.append(auc)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1289
[LightGBM] [Info] Number of data points in the train set: 7216, number of used features: 8
[LightGBM] [Info] Start training from score 0.500000
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1293
[LightGBM] [Info] Number of data points in the train set: 7216, number of used features: 8
[LightGBM] [Info] Start training from score 0.500000
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1294
[LightGBM] [Info] Number of data points in the train set: 7216, number of used features: 8
[LightGBM] [Info] Start training from score 0.500000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1291
[LightGBM] [Info] Number of data points in the train set: 7216, number o

In [13]:
# Calculate the mean AUC score across cross-validation folds
mean_auc = np.mean(auc_scores)
print('Mean AUC: {}'.format(mean_auc))

Mean AUC: 0.9466605075141661
