In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import optuna
import time
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix

from credit_g_dataset import get_preprocessed_credit_g_dataset

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 6)

# Load and pre-process version 1 of the dataset credit-g

In [2]:
%%capture
X_train, X_validation, X_test, y_train, y_validation, y_test = get_preprocessed_credit_g_dataset()

## Train the model using the training set and adjust hyperparameters with Optuna using the validation set

In [3]:
random_state=0

In [4]:
def objective(trial, X_train, X_validation, y_train, y_validation):
    """
    An objective function to tune hyperparameters of Gradient Boosting Classifier.
    Args:
    trial: an Optuna trial
    X: DataFrame object, features
    y: Series object, Labels
    cv: k folds to cross-validate
    scoring: String, evaluation metric
    Return:
    Mean test accuracy
    """
    
    params = {
    "n_estimators": trial.suggest_int("n_estimators", 100, 5000, step = 100),
    "learning_rate": trial.suggest_float("learning_rate", 1e-4, 0.3, log = True),
    "max_depth": trial.suggest_int("max_depth", 3, 9),
    "subsample": trial.suggest_float("subsample", 0.5, 0.9, step = 0.1),
    "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2"]),
    "random_state": 42,
    }
    # Perform cross validation
    model = GradientBoostingClassifier(**params).fit(X_train, y_train)
    
    y_predicted = model.predict(X_validation)
    (tn, fp, fn, tp) = confusion_matrix(y_validation, y_predicted).ravel()
    accuracy_val = float(tp+tn)/float(tn+fp+fn+tp)
    
    return accuracy_val

In [5]:
start_time = time.time()

study = optuna.create_study(direction = "maximize")
func = lambda trial: objective(trial, X_train, X_validation, y_train, y_validation)

# Start optimizing with 100 trials
study.optimize(func, n_trials = 100)

total_time = time.time()-start_time
print(f"total_time={total_time}")
print(study.best_params)
print(study.best_value)

[I 2023-12-20 22:13:58,853] A new study created in memory with name: no-name-46851ec5-f518-436e-b4c9-3a7fdace6580
[I 2023-12-20 22:14:02,869] Trial 0 finished with value: 0.774074074074074 and parameters: {'n_estimators': 2100, 'learning_rate': 0.05605345200320826, 'max_depth': 8, 'subsample': 0.9, 'max_features': 'log2'}. Best is trial 0 with value: 0.774074074074074.
[I 2023-12-20 22:14:04,438] Trial 1 finished with value: 0.7814814814814814 and parameters: {'n_estimators': 1200, 'learning_rate': 0.005852742812457334, 'max_depth': 5, 'subsample': 0.5, 'max_features': 'sqrt'}. Best is trial 1 with value: 0.7814814814814814.
[I 2023-12-20 22:14:11,167] Trial 2 finished with value: 0.7666666666666667 and parameters: {'n_estimators': 5000, 'learning_rate': 0.00554202539364183, 'max_depth': 5, 'subsample': 0.7, 'max_features': 'sqrt'}. Best is trial 1 with value: 0.7814814814814814.
[I 2023-12-20 22:14:13,048] Trial 3 finished with value: 0.6925925925925925 and parameters: {'n_estimators'

total_time=244.3916847705841
{'n_estimators': 2100, 'learning_rate': 0.006215888122200246, 'max_depth': 3, 'subsample': 0.7, 'max_features': 'log2'}
0.8037037037037037


In [6]:
print("Number of finished trials: ", len(study.trials))
print("Best trial:")

trial = study.best_trial

print("  Value: {}".format(trial.value))
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

Number of finished trials:  100
Best trial:
  Value: 0.8037037037037037
  Params: 
    n_estimators: 2100
    learning_rate: 0.006215888122200246
    max_depth: 3
    subsample: 0.7
    max_features: log2


## Run on the test set and use training and validation sets for training

In [7]:
X_train_valid = np.concatenate((X_train, X_validation))
y_train_valid = np.concatenate((y_train, y_validation))

model = GradientBoostingClassifier(**trial.params).fit(X_train_valid, y_train_valid)

y_predicted = model.predict(X_test)

(tn, fp, fn, tp) = confusion_matrix(y_test, y_predicted).ravel()
precision_val = float(tp)/float(tp+fp)
recall_val = float(tp)/float(tp+fn)
specificity_val = float(tn)/float(tn+fp)
accuracy_val = float(tp+tn)/float(tn+fp+fn+tp)
f1_val = (2*tp)/(2*tp+fp+fn)
mcc_val = float((tp*tn)-(fp*fn))/np.sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn))


print("\n")
print("Test set precision_val =", precision_val)
print("Test set recall_val =", recall_val)
print("Test set sspecificity =", specificity_val)
print("Test set mcc_val =", mcc_val)
print("Test set accuracy_val =", accuracy_val)
print("Test set f1_val =", f1_val)



Test set precision_val = 0.8289473684210527
Test set recall_val = 0.84
Test set sspecificity = 0.48
Test set mcc_val = 0.3244428422615251
Test set accuracy_val = 0.75
Test set f1_val = 0.8344370860927153


In [8]:
confusion_matrix(y_test, y_predicted)

array([[12, 13],
       [12, 63]], dtype=int64)

In [9]:
y_test

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1])

In [10]:
importances = model.feature_importances_
print(importances)
# feature_names_in_ only defined if the X input to fit() is a dataframe and the column names are all strings
#features_names = model.feature_names_in_
#print(features_names)

[0.09034174 0.10224359 0.0548507  0.02320042 0.00985598 0.00881832
 0.01939404 0.09436897 0.07246609 0.03127701 0.02819849 0.02323307
 0.01463425 0.02103446 0.01644675 0.01462979 0.01525176 0.02716214
 0.02317408 0.01867683 0.01200677 0.00765213 0.01602446 0.01120504
 0.01343616 0.00835592 0.0075843  0.00870021 0.01290091 0.0042285
 0.00671562 0.00595453 0.00715594 0.00726762 0.00944689 0.00541606
 0.01266026 0.00983511 0.00937029 0.00867489 0.00974883 0.00279088
 0.00575191 0.00377655 0.00678567 0.00578846 0.00755123 0.00494917
 0.00561395 0.00520032 0.00588173 0.00702145 0.00470298 0.00247348
 0.0041605  0.00479377 0.00522176 0.00340791 0.01052936]


In [11]:
print(max(importances))
print(min(importances))
print(sum(importances))

0.10224359141057382
0.0024734837147451358
0.9999999999999999
