In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix

from credit_g_dataset import get_preprocessed_credit_g_dataset

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 6)

# Load version 3 of the dataset credit-g

In [2]:
%%capture
X_train, X_validation, X_test, y_train, y_validation, y_test = get_preprocessed_credit_g_dataset()

## Train the model on the training set and adjust hyperparameters on the validation set

In [3]:
max_features = ["sqrt", "log2"]
criterion = ['gini', 'entropy', 'log_loss']
max_depth = [3,5,7,10,15]
min_samples_split = [2,5,10,15]
min_samples_leaf = [1,2,5,10]

random_state = 42

best_mcc = 0
best_acc = 0
best_recall = 0
best_precision = 0
best_specificity = 0

best_max_features = ""
best_criterion = ""
best_max_depth = 0
best_min_split = 0
best_min_leaf = 0

for max_feats in max_features:
    for c in criterion:
        for md in max_depth:
            for min_split in min_samples_split:
                for min_leaf in min_samples_leaf:

                    model = DecisionTreeClassifier(max_features=max_feats, criterion=c, max_depth=md, min_samples_split=min_split, min_samples_leaf=min_leaf, random_state=random_state)
                    model.fit(X_train, y_train)
                    
                    y_predicted = model.predict(X_validation)

                    (tn, fp, fn, tp) = confusion_matrix(y_validation, y_predicted).ravel()
                    precision_val = float(tp)/float(tp+fp)
                    recall_val = float(tp)/float(tp+fn)
                    specificity_val = float(tn)/float(tn+fp)
                    accuracy_val = float(tp+tn)/float(tn+fp+fn+tp)
                    f1_val = (2*tp)/(2*tp+fp+fn)
                    mcc_val = float((tp*tn)-(fp*fn))/np.sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn))

                    if mcc_val > best_mcc:
                        best_mcc = mcc_val
                        best_acc = accuracy_val
                        best_recall = recall_val
                        best_precision = precision_val
                        best_specificity = specificity_val

                        best_max_features = max_feats
                        best_criterion = c
                        best_max_depth = md
                        best_min_split = min_split
                        best_min_leaf = min_leaf
                    
                        print("mcc_val =", mcc_val)
                        print("accuracy_val =", accuracy_val)

mcc_val = 0.1939733759778879
accuracy_val = 0.6925925925925925
mcc_val = 0.28827082627183004
accuracy_val = 0.7111111111111111


In [4]:
print("\n")
print("best validation precision_val =", precision_val)
print("best validation recall_val =", recall_val)
print("best validation pecificity =", specificity_val)
print("best validation mcc_val =", mcc_val)
print("best validation accuracy_val =", accuracy_val)
print("best_max_features =", best_max_features)
print("best_criterion =", best_criterion)
print("best_max_depth =", best_max_depth)
print("best_min_split =", best_min_split)
print("best_min_leaf =", best_min_leaf)
print("\n\n") 



best validation precision_val = 0.7167381974248928
best validation recall_val = 0.907608695652174
best validation pecificity = 0.23255813953488372
best validation mcc_val = 0.1898998327053899
best validation accuracy_val = 0.6925925925925925
best_max_features = sqrt
best_criterion = gini
best_max_depth = 7
best_min_split = 2
best_min_leaf = 1





## Run on the test set and use training and validation sets for training

In [5]:
X_train_valid = np.concatenate((X_train, X_validation))
y_train_valid = np.concatenate((y_train, y_validation))

model = DecisionTreeClassifier(max_features=best_max_features, criterion=best_criterion, max_depth=best_max_depth, min_samples_split=best_min_split, min_samples_leaf=best_min_leaf, random_state=random_state)
model.fit(X_train_valid, y_train_valid)
y_predicted = model.predict(X_test)

(tn, fp, fn, tp) = confusion_matrix(y_test, y_predicted).ravel()
precision_val = float(tp)/float(tp+fp)
recall_val = float(tp)/float(tp+fn)
specificity_val = float(tn)/float(tn+fp)
accuracy_val = float(tp+tn)/float(tn+fp+fn+tp)
f1_val = (2*tp)/(2*tp+fp+fn)
mcc_val = float((tp*tn)-(fp*fn))/np.sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn))


print("\n")
print("Test set precision_val =", precision_val)
print("Test set recall_val =", recall_val)
print("Test set pecificity =", specificity_val)
print("Test set mcc_val =", mcc_val)
print("Test set accuracy_val =", accuracy_val)
print("Test set f1_val =", f1_val)



Test set precision_val = 0.7073170731707317
Test set recall_val = 0.8923076923076924
Test set pecificity = 0.3142857142857143
Test set mcc_val = 0.25648618548791075
Test set accuracy_val = 0.69
Test set f1_val = 0.7891156462585034


In [6]:
confusion_matrix(y_test, y_predicted)

array([[11, 24],
       [ 7, 58]], dtype=int64)

In [7]:
y_test

array([0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0,
       1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0,
       1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1,
       0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1])