# Classify people as good or bad credit risks.
Uses the German Credit dataset
Dataset Link: https://www.openml.org/search?type=data&sort=runs&id=31&status=active

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import optuna
import time
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

from credit_g_dataset import get_preprocessed_credit_g_dataset

# Notebook display settings: no limit on the number of columns shown,
# and at most 6 rows before pandas truncates the output.
pd.options.display.max_columns = None
pd.options.display.max_rows = 6

# Load and pre-process version 1 of the dataset credit-g

In [2]:
%%capture
# %%capture keeps anything the loader prints out of the notebook output.
# The six values returned by the project loader are unpacked into feature (X_*)
# and label (y_*) sets for the training, validation and test splits.
X_train, X_validation, X_test, y_train, y_validation, y_test = get_preprocessed_credit_g_dataset()

## Train the model using the training set and adjust hyperparameters with Optuna using the validation set

In [3]:
# Seed value intended for reproducibility.
# NOTE(review): the Optuna objective defined below pins its own random_state
# of 42 for the forest - confirm which seed is meant to drive the models.
random_state = 0

In [4]:
def objective(trial, X_train, X_validation, y_train, y_validation):
    """Optuna objective: fit a random forest and score it on the validation set.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.
    X_train, y_train
        Features and labels the forest is fitted on.
    X_validation, y_validation
        Features and labels the fitted forest is scored on.

    Returns
    -------
    float
        F1 score of the positive class on the validation set, or 0.0 when the
        F1 denominator is zero (no true positives, false positives or false
        negatives at all).
    """
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 10, 250, step=10),
        "max_depth": trial.suggest_int("max_depth", 1, 15),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
        "criterion": trial.suggest_categorical("criterion", ["gini", "entropy"]),
        # Single-choice categorical: the seed is not tuned, but registering it
        # as a trial parameter makes it part of the trial's params, so a model
        # rebuilt from those params is given the same seed.
        "random_state": trial.suggest_categorical("random_state", [42]),
    }

    model = RandomForestClassifier(**params).fit(X_train, y_train)
    y_predicted = model.predict(X_validation)

    # Pin the label order so the matrix is always 2x2; without it, a validation
    # set and prediction vector containing a single class yield a 1x1 matrix and
    # the four-way unpacking below fails.
    # Assumes labels are encoded as 0 (negative) / 1 (positive) - TODO confirm
    # against the dataset loader.
    tn, fp, fn, tp = confusion_matrix(
        y_validation, y_predicted, labels=[0, 1]
    ).ravel()

    # F1 = 2TP / (2TP + FP + FN). A zero denominator would produce NaN, which
    # Optuna records as a failed trial; score such a trial as 0.0 instead.
    denominator = 2 * tp + fp + fn
    if denominator == 0:
        return 0.0
    return float(2 * tp / denominator)


In [5]:
# perf_counter() is a monotonic, high-resolution clock, so the measured duration
# is not affected by system clock adjustments.
start_time = time.perf_counter()

optuna.logging.set_verbosity(optuna.logging.WARNING)
# Seed the sampler so that re-running the notebook proposes the same sequence of
# hyperparameters; with an unseeded sampler the best trial changes on every run.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=random_state),
)
# Bind the data splits so Optuna only has to supply the trial.
func = lambda trial: objective(trial, X_train, X_validation, y_train, y_validation)

# Start optimizing with 100 trials
study.optimize(func, n_trials=100)

total_time = time.perf_counter() - start_time
print(f"total_time={total_time}")
print(study.best_params)
print(study.best_value)

total_time=18.17320418357849
{'n_estimators': 30, 'max_depth': 8, 'min_samples_split': 3, 'min_samples_leaf': 5, 'criterion': 'gini', 'random_state': 42}
0.8490566037735849


In [6]:
# Summarise the study: best score, number of trials run, and the winning
# hyperparameters. `best_trial` is reused later to rebuild the model.
best_trial = study.best_trial
n_finished_trials = len(study.trials)

print("  Value: {}".format(best_trial.value))
print("Number of finished trials: ", n_finished_trials)
print(f"Best trial: {best_trial.number}")
print("  Params: ")
for param_name, param_value in best_trial.params.items():
    print("    {}: {}".format(param_name, param_value))

  Value: 0.8490566037735849
Number of finished trials:  100
Best trial: 29
  Params: 
    n_estimators: 30
    max_depth: 8
    min_samples_split: 3
    min_samples_leaf: 5
    criterion: gini
    random_state: 42


## Retrain on the combined training and validation sets and evaluate on the test set

In [7]:
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


# Refit with the best hyperparameters on the training and validation data combined.
X_train_valid = np.concatenate((X_train, X_validation))
y_train_valid = np.concatenate((y_train, y_validation))

model = RandomForestClassifier(**best_trial.params).fit(X_train_valid, y_train_valid)

y_predicted = model.predict(X_test)

# labels=[0, 1] keeps the matrix 2x2 even if one class is missing from both the
# test labels and the predictions. Converting the counts to Python ints avoids
# fixed-width integer overflow in the four-way product used by the MCC.
# Assumes labels are encoded as 0 (negative) / 1 (positive) - TODO confirm
# against the dataset loader.
tn, fp, fn, tp = (
    int(count)
    for count in confusion_matrix(y_test, y_predicted, labels=[0, 1]).ravel()
)

# Every metric falls back to 0.0 when its denominator is zero (for example
# precision when nothing is predicted positive) instead of raising or giving NaN.
precision_val = _safe_ratio(tp, tp + fp)
recall_val = _safe_ratio(tp, tp + fn)
specificity_val = _safe_ratio(tn, tn + fp)
accuracy_val = _safe_ratio(tp + tn, tn + fp + fn + tp)
f1_val = _safe_ratio(2 * tp, 2 * tp + fp + fn)
mcc_val = _safe_ratio(
    float(tp * tn - fp * fn),
    np.sqrt(float((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))),
)


print("\n")
print("Test set precision_val =", precision_val)
print("Test set recall_val =", recall_val)
print("Test set specificity_val =", specificity_val)
print("Test set mcc_val =", mcc_val)
print("Test set accuracy_val =", accuracy_val)
print("Test set f1_val =", f1_val)



Test set precision_val = 0.7931034482758621
Test set recall_val = 0.92
Test set sspecificity = 0.28
Test set mcc_val = 0.2575131013123024
Test set accuracy_val = 0.76
Test set f1_val = 0.8518518518518519


In [8]:
# Display the raw confusion matrix for the test-set predictions.
confusion_matrix(y_test, y_predicted)

array([[ 7, 18],
       [ 6, 69]], dtype=int64)

In [9]:
# Show the true test-set labels for reference.
y_test

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1])

In [10]:
# Feature importances of the final fitted forest.
importances = model.feature_importances_
print(importances)
# feature_names_in_ is only defined when the X passed to fit() is a DataFrame
# whose column names are all strings. The model above was fitted on the output
# of np.concatenate, so the name lookup below is left disabled.
#features_names = model.feature_names_in_
#print(features_names)

[0.07118661 0.10154989 0.07549266 0.02700959 0.01159568 0.00544545
 0.0219488  0.12695823 0.11403072 0.01799799 0.0257877  0.01031955
 0.02548563 0.01549669 0.01118585 0.01825441 0.01928291 0.01963672
 0.01182622 0.01374541 0.00928153 0.00858128 0.01359762 0.02115434
 0.00874216 0.00445646 0.00271325 0.00412184 0.01219184 0.00178696
 0.00103263 0.00830365 0.01112536 0.00146893 0.         0.01108319
 0.00124114 0.01384378 0.00522821 0.0011967  0.01235016 0.0001762
 0.01204076 0.         0.01371029 0.00308029 0.0057011  0.00632551
 0.00374849 0.00053685 0.01042873 0.00708457 0.01031508 0.
 0.00991719 0.0089517  0.00458843 0.         0.00565707]


In [11]:
# Sanity-check the importances: largest value, smallest value, and their total.
for summarise in (max, min, sum):
    print(summarise(importances))

0.1269582277265507
0.0
1.0000000000000002
