In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import optuna
import time
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

from credit_g_dataset import get_preprocessed_credit_g_dataset

# Notebook-wide pandas display settings: no limit on the number of columns
# shown, and at most 6 rows printed per frame.
pd.options.display.max_columns = None
pd.options.display.max_rows = 6

# Load and pre-process version 1 of the credit-g dataset

In [None]:
%%capture
X_train, X_validation, X_test, y_train, y_validation, y_test = get_preprocessed_credit_g_dataset()

## Train the model on the training set and tune hyperparameters with Optuna against the validation set

In [None]:
def objective(trial):
    """Optuna objective: score one random-forest configuration on the validation split.

    Samples a hyperparameter set from ``trial``, fits a
    ``RandomForestClassifier`` on the notebook-level ``X_train`` / ``y_train``
    and evaluates it on ``X_validation`` / ``y_validation``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The value of ``model.score(X_validation, y_validation)`` for the
        fitted forest.
    """
    # Keep the sampling calls in this order: the search space is defined by
    # the sequence of suggest_* calls made on the trial.
    n_estimators = trial.suggest_int("n_estimators", 10, 250, step=10)
    max_depth = trial.suggest_int("max_depth", 1, 15)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 10)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 10)
    criterion = trial.suggest_categorical("criterion", ["gini", "entropy"])

    forest = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        criterion=criterion,
        random_state=50,  # fixed seed so the forest itself is deterministic per configuration
    )
    forest.fit(X_train, y_train)

    return forest.score(X_validation, y_validation)


In [None]:
# Run the hyperparameter search, then report its wall-clock duration and the
# best trial found.
# perf_counter() is monotonic, so the measured interval cannot be distorted by
# system clock adjustments the way time.time() differences can.
start_time = time.perf_counter()

# Seed the sampler explicitly: with an unseeded sampler the suggested
# hyperparameters (and therefore best_params and every downstream result)
# change on each run, even though the forest itself uses a fixed random_state.
# TPESampler is what create_study uses by default, so only the seed changes.
# Reproducibility holds for sequential optimisation (no n_jobs parallelism).
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=50),
)
study.optimize(objective, n_trials=100)

total_time = time.perf_counter() - start_time
print(f"total_time={total_time}")
print(study.best_params)
print(study.best_value)

In [None]:
# Same seed value as the one hard-coded inside objective().
random_state = 50

# Pull the winning hyperparameters out of the finished study, one name each.
(
    best_n_estimators,
    best_criterion,
    best_max_depth,
    best_min_split,
    best_min_leaf,
) = (
    study.best_params[param_name]
    for param_name in (
        "n_estimators",
        "criterion",
        "max_depth",
        "min_samples_split",
        "min_samples_leaf",
    )
)

In [None]:
# Unfitted forest configured with the best hyperparameters found by the study.
model = RandomForestClassifier(
    n_estimators=best_n_estimators,
    criterion=best_criterion,
    max_depth=best_max_depth,
    min_samples_split=best_min_split,
    min_samples_leaf=best_min_leaf,
    random_state=random_state,
)

## Retrain on the combined training and validation sets, then evaluate on the test set

In [None]:
# Stack the training and validation splits so the final model is fitted on
# both (features first, then labels, each concatenated along the first axis).
X_train_valid, y_train_valid = (
    np.concatenate(split_pair)
    for split_pair in ((X_train, X_validation), (y_train, y_validation))
)

# Refit the tuned model on the merged data and evaluate it on the held-out test split.
model.fit(X_train_valid, y_train_valid)
print(model.score(X_test, y_test))
y_predicted = model.predict(X_test)

def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator`` as a float, or 0.0 when the denominator is zero.

    A zero denominator occurs when the model never predicts a class (or the
    test set lacks one); the metric is then undefined and is reported as 0.0
    instead of raising ZeroDivisionError or silently producing NaN.
    """
    return float(numerator / denominator) if denominator else 0.0


# For a binary problem the flattened confusion matrix is (tn, fp, fn, tp).
# Convert the NumPy integers to Python ints so the four-way product in the MCC
# denominator cannot overflow a fixed-width integer type.
tn, fp, fn, tp = (int(count) for count in confusion_matrix(y_test, y_predicted).ravel())

precision_val = _safe_ratio(tp, tp + fp)
recall_val = _safe_ratio(tp, tp + fn)
specificity_val = _safe_ratio(tn, tn + fp)
accuracy_val = _safe_ratio(tp + tn, tn + fp + fn + tp)
f1_val = _safe_ratio(2 * tp, 2 * tp + fp + fn)
# Matthews correlation coefficient; its denominator is zero whenever any row
# or column of the confusion matrix sums to zero.
mcc_val = _safe_ratio(
    tp * tn - fp * fn,
    np.sqrt(float((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))),
)


print("\n")
print("Test set precision_val =", precision_val)
print("Test set recall_val =", recall_val)
print("Test set specificity_val =", specificity_val)
print("Test set mcc_val =", mcc_val)
print("Test set accuracy_val =", accuracy_val)
print("Test set f1_val =", f1_val)

In [None]:
# Raw confusion matrix of the test-set predictions (true labels vs. predicted labels).
confusion_matrix(y_true=y_test, y_pred=y_predicted)

In [None]:
# Display the test-set labels that the score and confusion matrix above were computed against.
y_test

In [None]:
# Per-feature importance scores of the fitted forest, in input-column order.
importances = model.feature_importances_
print(importances)
# feature_names_in_ is only defined if the X passed to fit() is a DataFrame
# whose column names are all strings. The final model is fitted on the output
# of np.concatenate, which presumably carries no column names, so the lookup
# below is left disabled.
#features_names = model.feature_names_in_
#print(features_names)

In [None]:
# Quick sanity check of the importance vector: largest, smallest and total, in that order.
for summarize in (max, min, sum):
    print(summarize(importances))