# Import relevant modules and data

In [1]:
# Import relevant modules
import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from imblearn.over_sampling import SMOTE 

In [2]:
# Load modified & cleaned personal loan data into a Pandas DataFrame
df_loan_data = pd.read_pickle('df_loan_data.pkl')

# Transform data to suit the LGBM algorithm

LGBM models do not require scaling, even though the data is pre-scaled from the data_preparation notebook.

LGBM models do perform better with balanced classes (and this data has reasonably strong imbalance). This re-balancing will be handled via SMOTE techniques within model training

# Train a model on all available features

In [3]:
# Define the target variable
target_variable = 'personal_loan'

In [4]:
# Separate features and target variable
X = df_loan_data.drop(columns=[target_variable])
y = df_loan_data[target_variable]

In [5]:
# Initialize LightGBM parameters
params = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'metric': 'auc',
    'learning_rate' : 0.1,
    'n_estimators' : 500,
    'max_depth' : 3,
    'subsample' : 0.5,
    'feature_fraction' : 0.7,
    'num_leaves' : 30,
    'min_child_samples' : 10
}

In [6]:
# Initialize cross-validation
n_splits = 5  # You can adjust the number of folds
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

In [7]:
# Initialize list to store results
auc_scores = []

In [8]:
# Perform cross-validation
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Apply SMOTE to oversample the minority class
    smote = SMOTE(sampling_strategy='auto', random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

    # Create a LightGBM Dataset
    train_data = lgb.Dataset(X_train_resampled, label=y_train_resampled)
    test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

    # Train the LightGBM model
    num_round = 100
    bst = lgb.train(params, train_data, num_round, valid_sets=[test_data], early_stopping_rounds=10, verbose_eval=False)

    # Make predictions
    y_pred = bst.predict(X_test, num_iteration=bst.best_iteration)

    # Calculate ROC AUC score
    auc = roc_auc_score(y_test, y_pred)
    auc_scores.append(auc)



[LightGBM] [Info] Number of positive: 3608, number of negative: 3608
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1289
[LightGBM] [Info] Number of data points in the train set: 7216, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 3608, number of negative: 3608
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1293
[LightGBM] [Info] Number of data points in the train set: 7216, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 3608, number of negative: 3608
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1294
[LightGBM] [Info] Number of data points in the train set: 7216, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightG



In [9]:
# Calculate the mean AUC score across cross-validation folds
mean_auc = np.mean(auc_scores)
print('Mean AUC: {}'.format(mean_auc))

Mean AUC: 0.9506602611480659
