# Classify people as good or bad credit risks.
Uses the German Credit dataset (`credit-g` on OpenML).
Dataset Link: https://www.openml.org/search?type=data&sort=runs&id=31&status=active

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import optuna
import time
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import confusion_matrix

from credit_g_dataset import get_preprocessed_credit_g_dataset

# Notebook-wide pandas display settings: show every column, but at most 6 rows per frame.
for option_name, option_value in (
    ("display.max_columns", None),
    ("display.max_rows", 6),
):
    pd.set_option(option_name, option_value)

## Load and pre-process version 1 of the credit-g dataset

In [2]:
%%capture
# Fetch the six pre-processed splits (features and labels for train / validation / test).
(
    X_train,
    X_validation,
    X_test,
    y_train,
    y_validation,
    y_test,
) = get_preprocessed_credit_g_dataset()

## Train the model using the training set and adjust hyperparameters with Optuna using the validation set

In [3]:
def objective(trial, X_train, X_validation, y_train, y_validation):
    """Optuna objective: fit AdaBoost on the training split, score it on the validation split.

    This is a single hold-out evaluation (no cross-validation is performed).

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X_train: Features the candidate model is fitted on.
        X_validation: Features the candidate model is scored on.
        y_train: Labels matching ``X_train``.
        y_validation: Labels matching ``X_validation``.

    Returns:
        float: Validation accuracy, i.e. the fraction of validation samples
        whose predicted label equals the true label.
    """
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 300, step=10),
        "learning_rate": trial.suggest_float("learning_rate", 0.0001, 1, step=5e-6),
        # NOTE(review): "SAMME.R" is deprecated/removed in recent scikit-learn
        # releases - confirm it is still accepted by the installed version.
        "algorithm": trial.suggest_categorical("algorithm", ["SAMME", "SAMME.R"]),
        # Single-choice categorical: keeps the seed fixed while still recording it
        # in trial.params, so it travels with the params when they are reused.
        "random_state": trial.suggest_categorical("random_state", [47]),
    }

    # Fit on the training split only; the validation split is held out for scoring.
    model = AdaBoostClassifier(**params).fit(X_train, y_train)

    # Element-wise accuracy equals (tp + tn) / total, but unlike unpacking a
    # raveled confusion matrix it does not fail when only one class is present.
    y_true = np.asarray(y_validation).ravel()
    y_predicted = np.asarray(model.predict(X_validation)).ravel()
    accuracy_val = float(np.mean(y_predicted == y_true))

    return accuracy_val

In [4]:
start_time = time.time()

optuna.logging.set_verbosity(optuna.logging.WARNING)

# Seed the sampler so that the sequence of suggested hyperparameters (and hence the
# best trial) is reproducible on Restart & Run All; an unseeded study differs per run.
sampler = optuna.samplers.TPESampler(seed=47)
study = optuna.create_study(direction="maximize", sampler=sampler)


def func(trial):
    """Bind the notebook's train/validation splits to the Optuna objective."""
    return objective(trial, X_train, X_validation, y_train, y_validation)


# Start optimizing with 100 trials
study.optimize(func, n_trials=100)

# Report the wall-clock cost of the search together with its best result.
total_time = time.time() - start_time
print(f"total_time={total_time}")
print(study.best_params)
print(study.best_value)

total_time=27.48681664466858
{'n_estimators': 90, 'learning_rate': 0.9517700000000001, 'algorithm': 'SAMME', 'random_state': 47}
0.7814814814814814


In [5]:
# Summarise the search: best objective value, number of trials run, and the winning parameters.
best_trial = study.best_trial
print(f"  Value: {best_trial.value}")
print("Number of finished trials: ", len(study.trials))
print(f"Best trial: {best_trial.number}")
print("  Params: ")
for param_name, param_value in best_trial.params.items():
    print(f"    {param_name}: {param_value}")

  Value: 0.7814814814814814
Number of finished trials:  100
Best trial: 85
  Params: 
    n_estimators: 90
    learning_rate: 0.9517700000000001
    algorithm: SAMME
    random_state: 47


## Retrain on the combined training and validation sets and evaluate on the test set

In [6]:
# Final evaluation: refit the best hyperparameters on training + validation data,
# then score once on the held-out test set.
X_train_valid = np.concatenate((X_train, X_validation))
y_train_valid = np.concatenate((y_train, y_validation))

model = AdaBoostClassifier(**best_trial.params).fit(X_train_valid, y_train_valid)

y_predicted = model.predict(X_test)

# labels=[0, 1] pins the matrix to 2x2 even if a class is missing from y_test or
# y_predicted, so the four-way unpacking cannot fail (assumes 0/1-encoded labels).
# int() turns the NumPy counts into Python ints, so the four-factor product in the
# MCC denominator cannot silently overflow a fixed-width integer.
tn, fp, fn, tp = (
    int(count) for count in confusion_matrix(y_test, y_predicted, labels=[0, 1]).ravel()
)


def _ratio(numerator, denominator):
    """Return numerator / denominator as a float, or 0.0 when the denominator is zero."""
    return float(numerator) / float(denominator) if denominator else 0.0


precision_val = _ratio(tp, tp + fp)
recall_val = _ratio(tp, tp + fn)
specificity_val = _ratio(tn, tn + fp)
accuracy_val = _ratio(tp + tn, tn + fp + fn + tp)
f1_val = _ratio(2 * tp, 2 * tp + fp + fn)
# Matthews correlation coefficient; reported as 0.0 when any marginal total is zero.
mcc_val = _ratio(
    tp * tn - fp * fn,
    np.sqrt(float((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))),
)


print("\n")
print("Test set precision_val =", precision_val)
print("Test set recall_val =", recall_val)
print("Test set specificity_val =", specificity_val)
print("Test set mcc_val =", mcc_val)
print("Test set accuracy_val =", accuracy_val)
print("Test set f1_val =", f1_val)



Test set precision_val = 0.8378378378378378
Test set recall_val = 0.8266666666666667
Test set sspecificity = 0.52
Test set mcc_val = 0.34222378222022665
Test set accuracy_val = 0.75
Test set f1_val = 0.8322147651006712


In [7]:
# Display the raw test-set confusion matrix; flattened row by row its entries are
# (tn, fp, fn, tp), the order used when the test metrics are unpacked.
confusion_matrix(y_test, y_predicted)

array([[13, 12],
       [13, 62]], dtype=int64)

In [8]:
# Display the ground-truth labels of the test set.
y_test

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1])

In [9]:
# Feature importances of the final fitted AdaBoost model, printed as a raw array.
importances = model.feature_importances_
print(importances)
# model.feature_names_in_ is not printed alongside the importances: it is only
# defined if the X input to fit() is a DataFrame whose column names are all strings,
# whereas this model was fitted on the output of np.concatenate.

[0.11431642 0.27415753 0.02714977 0.01367474 0.         0.
 0.00680377 0.15155154 0.03853336 0.06648767 0.01585169 0.01408395
 0.02737214 0.01144528 0.         0.02028223 0.02188864 0.028387
 0.06471119 0.02202179 0.         0.         0.01319449 0.
 0.         0.         0.         0.         0.02117634 0.
 0.03238494 0.         0.         0.         0.         0.
 0.01452551 0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.        ]


In [10]:
# Sanity-check the importances: largest value, smallest value, and their total.
for reducer in (max, min, sum):
    print(reducer(importances))

0.2741575298319655
0.0
1.0
