# Import relevant modules and data

In [1]:
# Import relevant modules
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score
from imblearn.over_sampling import SMOTE
import optuna
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the cleaned and modified personal loan dataset from its pickle into a DataFrame
df_loan_data = pd.read_pickle(filepath_or_buffer='df_loan_data.pkl')

# Transform data to suit the SVC algorithm

SVC models require scaled data. The data used here is pre-scaled from the data_preparation notebook.

SVC models perform better with balanced classes, and this data is fairly strongly imbalanced. The classes are re-balanced with SMOTE on the training portion of each cross-validation fold during model training.

# Optimize the SVC model, following the workflow constructed in LGBM.ipynb

In [3]:
# Name of the column the model is trained to predict
target_variable: str = 'personal_loan'

In [4]:
# Split the frame into the feature matrix (every column except the target)
# and the target vector
X = df_loan_data.drop(labels=[target_variable], axis=1)
y = df_loan_data[target_variable]

In [5]:
# Stratified, shuffled K-fold splitter with a fixed seed so fold membership is repeatable
n_splits = 5
skf = StratifiedKFold(
    n_splits=n_splits,
    shuffle=True,
    random_state=42,
)

In [6]:
# Define the objective function for Optuna
def objective(trial):
    """Optuna objective: mean cross-validated ROC AUC of an SVC trained on SMOTE-balanced folds.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``C`` (log scale, 0.01 to 10), ``kernel`` and,
        for the polynomial kernel only, ``degree`` (2 to 5).

    Returns
    -------
    float
        Mean ROC AUC over the folds that the notebook-level ``skf`` splitter
        produces for the notebook-level ``X`` and ``y``.
    """
    C = trial.suggest_float('C', 0.01, 10, log=True)  # Regularization parameter
    kernel = trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf', 'sigmoid'])  # Kernel function
    # Only search over the degree when the polynomial kernel is selected; for the
    # other kernels pass scikit-learn's default of 3 (SVC ignores it for them),
    # which avoids adding a meaningless search dimension.
    degree = trial.suggest_int('degree', 2, 5) if kernel == 'poly' else 3

    auc_scores = []

    # Perform cross-validation
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Apply SMOTE for class balancing (training portion only, so the
        # held-out fold keeps its original class distribution)
        smote = SMOTE(sampling_strategy='auto', random_state=42)
        X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

        # Create and train the SVM model.
        # probability=True fits an internal cross-validated calibration that
        # shuffles the data; fixing random_state makes the trial score repeatable.
        model = SVC(C=C, kernel=kernel, degree=degree, probability=True, random_state=42)
        model.fit(X_train_resampled, y_train_resampled)

        # Predicted probability of the positive class (second column)
        y_pred = model.predict_proba(X_test)[:, 1]

        # Calculate ROC AUC score
        auc = roc_auc_score(y_test, y_pred)
        auc_scores.append(auc)

    # Calculate the mean AUC score across cross-validation folds
    mean_auc = np.mean(auc_scores)
    return mean_auc

In [7]:
# Create an Optuna study and optimize the objective function.
# The sampler is seeded so the sequence of suggested hyperparameters is the
# same on every run (TPE is also what Optuna uses by default for a
# single-objective study, so the search algorithm itself is unchanged).
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=20)

[I 2023-10-13 23:40:18,103] A new study created in memory with name: no-name-c9eec864-374b-4f2c-bf33-4c9d9482d846
[I 2023-10-13 23:40:46,042] Trial 0 finished with value: 0.7819413648681942 and parameters: {'C': 2.7360599238526806, 'kernel': 'sigmoid'}. Best is trial 0 with value: 0.7819413648681942.
[I 2023-10-13 23:40:57,036] Trial 1 finished with value: 0.9389769647696478 and parameters: {'C': 0.010334206334181634, 'kernel': 'linear'}. Best is trial 1 with value: 0.9389769647696478.
[I 2023-10-13 23:41:05,865] Trial 2 finished with value: 0.9362010347376201 and parameters: {'C': 5.459382766819612, 'kernel': 'rbf'}. Best is trial 1 with value: 0.9389769647696478.
[I 2023-10-13 23:41:33,854] Trial 3 finished with value: 0.7815779748706578 and parameters: {'C': 0.6092043375709799, 'kernel': 'sigmoid'}. Best is trial 1 with value: 0.9389769647696478.
[I 2023-10-13 23:42:01,955] Trial 4 finished with value: 0.7808518107908353 and parameters: {'C': 0.2670626890491237, 'kernel': 'sigmoid'}

In [8]:
# Pull the winning hyperparameter set out of the finished study and report it
best_params = study.best_params
print(f'Best Hyperparameters: {best_params}')

Best Hyperparameters: {'C': 0.12547980708984272, 'kernel': 'linear'}


In [9]:
# Persist the best hyperparameters to disk as a pickle
with open('SVC_best_params.pkl', mode='wb') as file:
    pickle.dump(best_params, file)

# Train and evaluate a model using the best hyperparameters

In [10]:
# Empty container for the per-fold ROC AUC scores of the best model
auc_scores = list()

In [11]:
# Perform cross-validation
# Start from an empty list so that re-running this cell does not keep
# appending to scores collected by a previous execution.
auc_scores = []
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Apply SMOTE to oversample the minority class (training portion only)
    smote = SMOTE(sampling_strategy='auto', random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

    # Create and train the SVM model with the best hyperparameters.
    # 'degree' only appears in best_params when the tuned kernel is 'poly';
    # without forwarding it, a tuned polynomial degree would silently fall
    # back to 3. random_state fixes the shuffling used by the internal
    # probability calibration so the reported AUC is repeatable.
    model = SVC(
        C=best_params['C'],
        kernel=best_params['kernel'],
        degree=best_params.get('degree', 3),
        probability=True,
        random_state=42,
    )
    model.fit(X_train_resampled, y_train_resampled)

    # Predicted probability of the positive class (second column)
    y_pred = model.predict_proba(X_test)[:, 1]

    # Calculate ROC AUC score
    auc = roc_auc_score(y_test, y_pred)
    auc_scores.append(auc)

In [12]:
# Average the per-fold ROC AUC scores and report the result
mean_auc = np.mean(auc_scores)
print(f'Mean AUC: {mean_auc}')

Mean AUC: 0.9448903670854889
