In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix

from credit_g_dataset import get_preprocessed_credit_g_dataset

# Notebook-wide pandas display settings:
#   display.max_columns = None -> no limit on the number of columns shown
#   display.max_rows    = 6    -> longer frames are displayed truncated
for _option, _value in (("display.max_columns", None), ("display.max_rows", 6)):
    pd.set_option(_option, _value)

# Load version 3 of the credit-g dataset

In [2]:
%%capture
# The project helper returns six objects, unpacked here in order:
# features for train / validation / test, then labels for train / validation / test.
(
    X_train, X_validation, X_test,
    y_train, y_validation, y_test,
) = get_preprocessed_credit_g_dataset()

## Train the model on the training set and tune hyperparameters on the validation set

In [3]:
# Hyperparameter search: fit one LogisticRegression per solver on the training
# split and keep the solver that reaches the highest validation accuracy.
# Only the solver is varied; no `penalty` argument is passed, so every model
# uses the estimator's default penalty.
penalty_ls = ['l1', 'l2', 'elasticnet', None]  # candidate penalties -- not used by the search below
solver_ls = ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']

# Start below any attainable accuracy and pre-bind best_solver, so the summary
# print cannot raise a NameError when no solver scores above 0.
max_acc = -1.0
best_solver = None
for solver in solver_ls:
    model = LogisticRegression(random_state=0, max_iter=10000, n_jobs=-1, solver=solver).fit(X_train, y_train)
    y_predicted = model.predict(X_validation)

    # Validation accuracy is the only selection criterion.
    accuracy = accuracy_score(y_validation, y_predicted)

    # Strict '>' keeps the earliest solver in solver_ls when accuracies tie.
    if accuracy > max_acc:
        max_acc = accuracy
        best_solver = solver
        print("Solver =", solver)


print(f"Best Accuracy was {max_acc}, obtained with the solver {best_solver}")

Solver = lbfgs




Best Accuracy was 0.7111111111111111, obtained with the solver lbfgs


## Retrain on the combined training and validation sets and evaluate on the test set

In [4]:
# Final evaluation: refit the selected solver on training + validation data and
# score it on the test split.
# The combined arrays get their own names so X_train / y_train are left
# untouched; re-running this cell therefore does not append the validation
# split a second time.
X_train_full = np.concatenate((X_train, X_validation))
y_train_full = np.concatenate((y_train, y_validation))
model = LogisticRegression(random_state=0, max_iter=10000, n_jobs=-1, solver=best_solver).fit(X_train_full, y_train_full)
y_predicted = model.predict(X_test)

# Performance metrics as computed by scikit-learn
accuracy = accuracy_score(y_test, y_predicted)
recall = recall_score(y_test, y_predicted, average='binary')
precision = precision_score(y_test, y_predicted, average='binary')
f1 = f1_score(y_test, y_predicted, average='binary')

print("accuracy =", accuracy)
print("precision =", precision)
print("recall =", recall)
print("f1 =", f1)

# Cross-check: recompute the same metrics by hand from the confusion-matrix
# counts (flattened in the order tn, fp, fn, tp).
tn, fp, fn, tp = confusion_matrix(y_test, y_predicted).ravel()
accuracy_val = float(tp+tn)/float(tn+fp+fn+tp)
f1_val = 2*tp/(2*tp+fp+fn)
sensitivity_val = float(tp)/float(tp+fn)  # sensitivity is printed as "recall" below
specificity_val = float(tn)/float(tn+fp)  # computed but not printed
precision_val = float(tp)/float(tp+fp)

print("\n\nSelf calculated metrics")
print("accuracy=", accuracy_val)
print("precision =", precision_val)
print("recall =", sensitivity_val)
print("F1 =", f1_val)

accuracy = 0.68
precision = 0.6896551724137931
recall = 0.9230769230769231
f1 = 0.7894736842105263


Self calculated metrics
accuracy= 0.68
precision = 0.6896551724137931
recall = 0.9230769230769231
F1 = 0.7894736842105263


In [5]:
# Display the test-set confusion matrix. Flattened row by row it reads
# tn, fp, fn, tp -- the order in which the metrics cell unpacks it.
confusion_matrix(y_true=y_test, y_pred=y_predicted)

array([[ 8, 27],
       [ 5, 60]], dtype=int64)

In [6]:
# Show the raw test-set labels that were passed as the true labels to the
# metric calls above. Note: this displays the whole array, not a sample.
y_test

array([0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0,
       1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0,
       1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1,
       0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1])