# Classifiy people as good or bad credit risks.
Uses the German Credit dataset
Dataset Link: https://www.openml.org/search?type=data&sort=runs&id=31&status=active

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import optuna
import time
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix

from credit_g_dataset import get_preprocessed_credit_g_dataset

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 6)

# Load and pre-process version 1 of the dataset credit-g

In [2]:
%%capture
X_train, X_validation, X_test, y_train, y_validation, y_test = get_preprocessed_credit_g_dataset()

## Train the model using the training set and adjust hyperparameters with Optuna using the validation set

In [3]:
random_state=0

In [4]:
def objective(trial, X_train, X_valid, y_train, y_valid):
    
    params = {
    "objective":"binary:logistic",
    "max_depth": trial.suggest_int("max_depth", 1, 20, step=1),
    "n_estimators": trial.suggest_int("n_estimators", 10, 1000, step=10),
    "learning_rate": trial.suggest_float("learning_rate", 1e-3, 1, step=1e-6, log=False),
    "subsample": trial.suggest_float("subsample", 0.05, 1.0),
    "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1.0),
    "min_child_weight": trial.suggest_int("min_child_weight", 1, 20),
    }
   
    model = XGBClassifier(**params,random_state=0).fit(X_train, y_train)
    y_predicted = model.predict(X_valid)
    (tn, fp, fn, tp) = confusion_matrix(y_valid, y_predicted).ravel()
    accuracy = float(tp+tn)/float(tn+fp+fn+tp)
    
    return accuracy

In [5]:
start_time = time.time()

optuna.logging.set_verbosity(optuna.logging.WARNING)
study = optuna.create_study(direction = "maximize")
func = lambda trial: objective(trial, X_train, X_validation, y_train, y_validation)

# Start optimizing with 150 trials
study.optimize(func, n_trials = 200, timeout=600)

total_time = time.time()-start_time
print(f"total_time={total_time}")
print(study.best_params)
print(study.best_value)

total_time=26.296910524368286
{'max_depth': 12, 'n_estimators': 170, 'learning_rate': 0.052801, 'subsample': 0.42621926596843157, 'colsample_bytree': 0.5429604517097264, 'min_child_weight': 5}
0.7888888888888889


In [6]:
best_trial = study.best_trial
print("  Value: {}".format(best_trial.value))
print("Number of finished trials: ", len(study.trials))
print(f"Best trial: {best_trial.number}")
print("  Params: ")
for key, value in best_trial.params.items():
    print("    {}: {}".format(key, value))

  Value: 0.7888888888888889
Number of finished trials:  200
Best trial: 144
  Params: 
    max_depth: 12
    n_estimators: 170
    learning_rate: 0.052801
    subsample: 0.42621926596843157
    colsample_bytree: 0.5429604517097264
    min_child_weight: 5


## Run on the test set and use training and validation sets for training

In [7]:
X_train_valid = np.concatenate((X_train, X_validation))
y_train_valid = np.concatenate((y_train, y_validation))

model = XGBClassifier(**best_trial.params,random_state=0).fit(X_validation, y_validation)
y_predicted = model.predict(X_test)

print(np.unique(y_predicted))

(tn, fp, fn, tp) = confusion_matrix(y_test, y_predicted).ravel()
precision_val = float(tp)/float(tp+fp)
recall_val = float(tp)/float(tp+fn)
specificity_val = float(tn)/float(tn+fp)
accuracy_val = float(tp+tn)/float(tn+fp+fn+tp)
f1_val = (2*tp)/(2*tp+fp+fn)
mcc_val = float((tp*tn)-(fp*fn))/np.sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn))

precision_val = float(tp)/float(tp+fp)
recall_val = float(tp)/float(tp+fn)
specificity_val = float(tn)/float(tn+fp)
accuracy_val = float(tp+tn)/float(tn+fp+fn+tp)
f1_val = (2*tp)/(2*tp+fp+fn)
mcc_val = float((tp*tn)-(fp*fn))/np.sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn))


print("\n")
print("Test set precision_val =", precision_val)
print("Test set recall_val =", recall_val)
print("Test set sspecificity =", specificity_val)
print("Test set mcc_val =", mcc_val)
print("Test set accuracy_val =", accuracy_val)
print("Test set f1_val =", f1_val)

[0 1]


Test set precision_val = 0.8421052631578947
Test set recall_val = 0.8533333333333334
Test set sspecificity = 0.52
Test set mcc_val = 0.3785166493051126
Test set accuracy_val = 0.77
Test set f1_val = 0.847682119205298
