In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import optuna
import time
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

from credit_g_dataset import get_preprocessed_credit_g_dataset

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 6)

# Load and pre-process version 1 of the dataset credit-g

In [2]:
%%capture
X_train, X_validation, X_test, y_train, y_validation, y_test = get_preprocessed_credit_g_dataset()

## Train the model using the training set and adjust hyperparameters with Optuna using the validation set

In [3]:
random_state=0

In [4]:
def objective(trial):
    params = {
    "n_estimators": trial.suggest_int("n_estimators", 10, 250, step = 10),
    "max_depth": trial.suggest_int("max_depth", 1, 15),
    "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
    "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
    "criterion": trial.suggest_categorical("criterion", ["gini", "entropy"]),
    "random_state": random_state,
    }
    
    model = RandomForestClassifier(**params).fit(X_train, y_train)

    y_predicted = model.predict(X_validation)
    (tn, fp, fn, tp) = confusion_matrix(y_validation, y_predicted).ravel()
    f1_val = (2*tp)/(2*tp+fp+fn)
    
    return f1_val


In [5]:
start_time = time.time()

study = optuna.create_study(direction = "maximize")
study.optimize(objective, n_trials=100)

total_time = time.time()-start_time
print(f"total_time={total_time}")
print(study.best_params)
print(study.best_value)

[I 2023-12-20 19:05:45,906] A new study created in memory with name: no-name-f878a89b-e07d-475b-ad77-1b2955912410
[I 2023-12-20 19:05:45,996] Trial 0 finished with value: 0.8141592920353983 and parameters: {'n_estimators': 60, 'max_depth': 4, 'min_samples_split': 10, 'min_samples_leaf': 6, 'criterion': 'gini'}. Best is trial 0 with value: 0.8141592920353983.
[I 2023-12-20 19:05:46,306] Trial 1 finished with value: 0.8348623853211009 and parameters: {'n_estimators': 210, 'max_depth': 6, 'min_samples_split': 8, 'min_samples_leaf': 9, 'criterion': 'entropy'}. Best is trial 1 with value: 0.8348623853211009.
[I 2023-12-20 19:05:46,569] Trial 2 finished with value: 0.8484848484848485 and parameters: {'n_estimators': 160, 'max_depth': 9, 'min_samples_split': 3, 'min_samples_leaf': 4, 'criterion': 'entropy'}. Best is trial 2 with value: 0.8484848484848485.
[I 2023-12-20 19:05:46,720] Trial 3 finished with value: 0.8476190476190476 and parameters: {'n_estimators': 80, 'max_depth': 10, 'min_samp

total_time=28.642510890960693
{'n_estimators': 200, 'max_depth': 13, 'min_samples_split': 4, 'min_samples_leaf': 1, 'criterion': 'gini'}
0.8647342995169082


In [6]:
best_n_estimators = study.best_params["n_estimators"]
best_criterion = study.best_params["criterion"]
best_max_depth = study.best_params["max_depth"]
best_min_split = study.best_params["min_samples_split"]
best_min_leaf = study.best_params["min_samples_leaf"]

In [7]:
model = RandomForestClassifier(n_estimators=best_n_estimators, criterion=best_criterion, 
                               max_depth=best_max_depth, min_samples_split=best_min_split,
                               min_samples_leaf=best_min_leaf,random_state=random_state)

## Run on the test set and use training and validation sets for training

In [8]:
X_train_valid = np.concatenate((X_train, X_validation))
y_train_valid = np.concatenate((y_train, y_validation))

model.fit(X_train_valid, y_train_valid)
print(model.score(X_test, y_test))
y_predicted = model.predict(X_test)

(tn, fp, fn, tp) = confusion_matrix(y_test, y_predicted).ravel()
precision_val = float(tp)/float(tp+fp)
recall_val = float(tp)/float(tp+fn)
specificity_val = float(tn)/float(tn+fp)
accuracy_val = float(tp+tn)/float(tn+fp+fn+tp)
f1_val = (2*tp)/(2*tp+fp+fn)
mcc_val = float((tp*tn)-(fp*fn))/np.sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn))


print("\n")
print("Test set precision_val =", precision_val)
print("Test set recall_val =", recall_val)
print("Test set sspecificity =", specificity_val)
print("Test set mcc_val =", mcc_val)
print("Test set accuracy_val =", accuracy_val)
print("Test set f1_val =", f1_val)

0.76


Test set precision_val = 0.8227848101265823
Test set recall_val = 0.8666666666666667
Test set sspecificity = 0.44
Test set mcc_val = 0.3260198292646311
Test set accuracy_val = 0.76
Test set f1_val = 0.8441558441558441


In [9]:
confusion_matrix(y_test, y_predicted)

array([[11, 14],
       [10, 65]], dtype=int64)

In [10]:
y_test

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1])

In [11]:
importances = model.feature_importances_
print(importances)
# feature_names_in_ only defined if the X input to fit() is a dataframe and the column names are all strings
#features_names = model.feature_names_in_
#print(features_names)

[0.07601217 0.10436859 0.07565007 0.02871623 0.01613555 0.0107621
 0.02952602 0.06364013 0.0516244  0.02405506 0.02202449 0.01794548
 0.01279682 0.01299266 0.01255219 0.01262002 0.01563978 0.02078075
 0.0115551  0.01505732 0.01223638 0.0113642  0.01539154 0.01737713
 0.01557286 0.00645667 0.00855814 0.0070481  0.01052539 0.00650479
 0.00367163 0.01296232 0.01123474 0.0066706  0.00835885 0.01124538
 0.00899356 0.01235043 0.00808348 0.00795282 0.01622753 0.00102721
 0.01151787 0.00175818 0.01399564 0.00856466 0.00936383 0.00793481
 0.0117456  0.00509248 0.01110122 0.01049978 0.01244507 0.00190548
 0.00751489 0.01059937 0.00959901 0.00243431 0.00965914]


In [12]:
print(max(importances))
print(min(importances))
print(sum(importances))

0.1043685910494857
0.0010272105806774057
1.0
