In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import optuna
import time
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

from credit_g_dataset import get_preprocessed_credit_g_dataset

# Show every column of a wide frame, but truncate long frames to six rows.
pd.options.display.max_columns = None
pd.options.display.max_rows = 6

# Load and pre-process version 1 of the credit-g dataset

In [2]:
%%capture
# The cell magic above must stay on the first line; it silences anything this cell prints.
# The loader returns six objects, unpacked here as features (X_*) and labels (y_*)
# for the train, validation and test splits.
# NOTE(review): array types and split sizes are decided inside credit_g_dataset — confirm there.
X_train, X_validation, X_test, y_train, y_validation, y_test = get_preprocessed_credit_g_dataset()

## Train the model on the training set and tune its hyperparameters with Optuna on the validation set

In [3]:
# Seed handed to the random forest built inside objective().
random_state = 0

In [4]:
def objective(trial, X_train, X_validation, y_train, y_validation):
    """Optuna objective: validation F1 of a random forest built from trial-suggested settings.

    Parameters
    ----------
    trial
        Optuna trial used to sample the hyperparameters.
    X_train, y_train
        Features and labels the forest is fitted on.
    X_validation, y_validation
        Features and labels the fitted forest is scored on.

    Returns
    -------
    float
        F1 score for label 1 on the validation data, or 0.0 when there are no
        true positives, false positives or false negatives at all.
    """
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 10, 250, step=10),
        "max_depth": trial.suggest_int("max_depth", 1, 15),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
        "criterion": trial.suggest_categorical("criterion", ["gini", "entropy"]),
        # Fixed (not tuned) so that trials differ only in the sampled hyperparameters.
        "random_state": random_state,
    }

    model = RandomForestClassifier(**params).fit(X_train, y_train)
    y_predicted = model.predict(X_validation)

    # Pinning the label order keeps the matrix 2x2 even when one class is missing
    # from both the labels and the predictions; without it ravel() would yield a
    # single value and the unpacking below would fail.
    # Assumes the binary labels are encoded as 0/1 — TODO confirm against the loader.
    tn, fp, fn, tp = confusion_matrix(y_validation, y_predicted, labels=[0, 1]).ravel()

    denominator = 2 * tp + fp + fn
    if denominator == 0:
        # F1 is undefined here; report the worst score instead of NaN so the
        # trial is still comparable with the others.
        return 0.0

    return float(2 * tp / denominator)


In [5]:
# Wall-clock start of the whole hyperparameter search.
start_time = time.time()

optuna.logging.set_verbosity(optuna.logging.WARNING)

# Seed the sampler so the search proposes the same sequence of trials on every
# run; seeding only the forest is not enough to make the study reproducible.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=random_state),
)
# Bind the data splits so Optuna can call the objective with just the trial.
func = lambda trial: objective(trial, X_train, X_validation, y_train, y_validation)

# Start optimizing with 100 trials
study.optimize(func, n_trials=100)

total_time = time.time() - start_time
print(f"total_time={total_time}")
print(study.best_params)
print(study.best_value)

total_time=31.311628103256226
{'n_estimators': 160, 'max_depth': 15, 'min_samples_split': 8, 'min_samples_leaf': 2, 'criterion': 'gini'}
0.8598574821852731


In [6]:
best_trial = study.best_trial

# Headline numbers for the winning trial.
print(f"  Value: {best_trial.value}")
print("Number of finished trials: ", len(study.trials))
print(f"Best trial: {best_trial.number}")

# One line per tuned hyperparameter.
print("  Params: ")
for param_name, param_value in best_trial.params.items():
    print(f"    {param_name}: {param_value}")

  Value: 0.8598574821852731
Number of finished trials:  100
Best trial: 47
  Params: 
    n_estimators: 160
    max_depth: 15
    min_samples_split: 8
    min_samples_leaf: 2
    criterion: gini


## Retrain on the combined training and validation sets and evaluate on the test set

In [7]:
# Refit on training + validation data so the final model sees everything except the test set.
X_train_valid = np.concatenate((X_train, X_validation))
y_train_valid = np.concatenate((y_train, y_validation))

# best_trial.params holds only the values Optuna suggested. random_state was
# added separately inside objective(), so it must be passed again here;
# otherwise the final forest is unseeded and the test metrics change on every run.
model = RandomForestClassifier(
    **best_trial.params, random_state=random_state
).fit(X_train_valid, y_train_valid)

y_predicted = model.predict(X_test)

# labels=[0, 1] keeps the matrix 2x2 (layout [[tn, fp], [fn, tp]]) even if a class is absent.
# Assumes the binary labels are encoded as 0/1 — TODO confirm against the loader.
# Converting to plain Python ints avoids fixed-width integer overflow in the MCC product.
tn, fp, fn, tp = (
    int(count)
    for count in confusion_matrix(y_test, y_predicted, labels=[0, 1]).ravel()
)

precision_val = tp / (tp + fp)
recall_val = tp / (tp + fn)
specificity_val = tn / (tn + fp)
accuracy_val = (tp + tn) / (tn + fp + fn + tp)
f1_val = (2 * tp) / (2 * tp + fp + fn)
# Matthews correlation coefficient.
mcc_val = float((tp * tn) - (fp * fn)) / np.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))


print("\n")
print("Test set precision_val =", precision_val)
print("Test set recall_val =", recall_val)
print("Test set specificity_val =", specificity_val)
print("Test set mcc_val =", mcc_val)
print("Test set accuracy_val =", accuracy_val)
print("Test set f1_val =", f1_val)



Test set precision_val = 0.825
Test set recall_val = 0.88
Test set sspecificity = 0.44
Test set mcc_val = 0.34641016151377546
Test set accuracy_val = 0.77
Test set f1_val = 0.8516129032258064


In [8]:
# Display the raw test-set confusion matrix. The metrics cell unpacks its
# ravel() as (tn, fp, fn, tp), i.e. it is read as [[tn, fp], [fn, tp]].
confusion_matrix(y_test, y_predicted)

array([[11, 14],
       [ 9, 66]], dtype=int64)

In [9]:
# Display the ground-truth labels of the test split.
y_test

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1])

In [10]:
# Feature importances reported by the forest fitted on train + validation data.
importances = model.feature_importances_
print(importances)
# feature_names_in_ is only defined when the X passed to fit() is a DataFrame whose
# column names are all strings. Here X comes from np.concatenate, so the lookup
# below stays disabled.
#features_names = model.feature_names_in_
#print(features_names)

[0.0826634  0.10426007 0.07044745 0.02916108 0.01369419 0.00744812
 0.02975434 0.08183255 0.06280758 0.02388608 0.02611465 0.01723954
 0.01548264 0.01401793 0.01323187 0.01541288 0.01719687 0.02196203
 0.01150203 0.01537395 0.01178876 0.01240205 0.01361116 0.01780674
 0.01710724 0.00573186 0.00652636 0.00608717 0.00933562 0.00497323
 0.00176972 0.01363576 0.01231011 0.00521005 0.00684029 0.01149616
 0.00830788 0.01276723 0.00847192 0.00454698 0.01365851 0.00026428
 0.01182965 0.00103605 0.01206642 0.00593408 0.00763527 0.00737089
 0.01032068 0.00167019 0.01148888 0.00765465 0.01110692 0.00114883
 0.00546713 0.00973726 0.00896931 0.00123037 0.00719507]


In [11]:
# Sanity check: print the largest, the smallest and the total importance, in that order.
# The Python built-ins are used on purpose so the summation order is unchanged.
for summarize in (max, min, sum):
    print(summarize(importances))

0.10426006811434832
0.0002642811112449653
0.9999999999999999
