In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import optuna
import time
from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.metrics import confusion_matrix

from credit_g_dataset import get_preprocessed_credit_g_dataset

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 6)

# Load and pre-process version 1 of the dataset credit-g

In [2]:
%%capture
X_train, X_validation, X_test, y_train, y_validation, y_test = get_preprocessed_credit_g_dataset()

## Train the model using the training set and adjust hyperparameters with Optuna using the validation set

In [3]:
random_state=0

In [4]:
def objective(trial):
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvalid = xgb.DMatrix(X_validation, label=y_validation)

    param = {
        "verbosity": 0,
        "objective": "binary:logistic",
        # use exact for small dataset.
        "tree_method": "exact",
        # defines booster, gblinear for linear functions.
        "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
        # L2 regularization weight.
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        # L1 regularization weight.
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        # sampling ratio for training data.
        "subsample": trial.suggest_float("subsample", 0.2, 1.0),
        # sampling according to each tree.
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
    }

    if param["booster"] in ["gbtree", "dart"]:
        # maximum depth of the tree, signifies complexity of the tree.
        param["max_depth"] = trial.suggest_int("max_depth", 1, 15, step=2)
        # minimum child weight, larger the term more conservative the tree.
        param["min_child_weight"] = trial.suggest_int("min_child_weight", 2, 10)
        param["eta"] = trial.suggest_float("eta", 1e-8, 1.0, log=True)
        # defines how selective algorithm is.
        param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
        param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])

    if param["booster"] == "dart":
        param["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
        param["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        param["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
        param["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)

    bst = xgb.train(param, dtrain)
    preds = bst.predict(dvalid)
    pred_labels = np.rint(preds)

    (tn, fp, fn, tp) = confusion_matrix(y_validation, pred_labels).ravel()
    accuracy = float(tp+tn)/float(tn+fp+fn+tp)
    
    return accuracy

In [5]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=150, timeout=600)

print("Number of finished trials: ", len(study.trials))
print("Best trial:")

[I 2023-12-20 22:18:09,035] A new study created in memory with name: no-name-ec5d9b2a-4462-430c-bf60-280332d4b09c
[I 2023-12-20 22:18:09,077] Trial 0 finished with value: 0.6925925925925925 and parameters: {'booster': 'gbtree', 'lambda': 0.0006820624877381682, 'alpha': 0.20176618401791643, 'subsample': 0.5582198127608542, 'colsample_bytree': 0.9571138017972052, 'max_depth': 3, 'min_child_weight': 4, 'eta': 0.007978388825007366, 'gamma': 8.615911406479546e-07, 'grow_policy': 'depthwise'}. Best is trial 0 with value: 0.6925925925925925.
[I 2023-12-20 22:18:09,083] Trial 1 finished with value: 0.762962962962963 and parameters: {'booster': 'gblinear', 'lambda': 1.1618877014630359e-06, 'alpha': 0.00760552374879346, 'subsample': 0.8840914612142043, 'colsample_bytree': 0.8614524380524793}. Best is trial 1 with value: 0.762962962962963.
[I 2023-12-20 22:18:09,097] Trial 2 finished with value: 0.6925925925925925 and parameters: {'booster': 'dart', 'lambda': 0.00012073776682102024, 'alpha': 0.00

Number of finished trials:  150
Best trial:


In [6]:
best_trial = study.best_trial

print("  Value: {}".format(best_trial.value))
print("  Params: ")
for key, value in best_trial.params.items():
    print("    {}: {}".format(key, value))

  Value: 0.7925925925925926
  Params: 
    booster: gblinear
    lambda: 0.0005248316472612652
    alpha: 0.00015178511992675392
    subsample: 0.9205283712298872
    colsample_bytree: 0.5100480460825894


## Run on the test set and use training and validation sets for training

In [7]:
X_train_valid = np.concatenate((X_train, X_validation))
y_train_valid = np.concatenate((y_train, y_validation))
dtrain_valid = xgb.DMatrix(X_train_valid, label=y_train_valid)
dtest = xgb.DMatrix(X_test, label=y_test)

bst = xgb.train(best_trial.params, dtrain_valid)
preds = bst.predict(dtest)
pred_labels = np.rint(preds)

(tn, fp, fn, tp) = confusion_matrix(y_test, pred_labels).ravel()

precision_val = float(tp)/float(tp+fp)
recall_val = float(tp)/float(tp+fn)
specificity_val = float(tn)/float(tn+fp)
accuracy_val = float(tp+tn)/float(tn+fp+fn+tp)
f1_val = (2*tp)/(2*tp+fp+fn)
mcc_val = float((tp*tn)-(fp*fn))/np.sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn))


print("\n")
print("Test set precision_val =", precision_val)
print("Test set recall_val =", recall_val)
print("Test set sspecificity =", specificity_val)
print("Test set mcc_val =", mcc_val)
print("Test set accuracy_val =", accuracy_val)
print("Test set f1_val =", f1_val)



Test set precision_val = 0.8405797101449275
Test set recall_val = 0.7733333333333333
Test set sspecificity = 0.56
Test set mcc_val = 0.3120857859471409
Test set accuracy_val = 0.72
Test set f1_val = 0.8055555555555556
