In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.svm import SVC
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix

from credit_g_dataset import get_preprocessed_credit_g_dataset

# Notebook-wide pandas display settings: never truncate columns,
# but cap the number of rows shown for any frame at 6.
pd.options.display.max_columns = None
pd.options.display.max_rows = 6

# Load version 3 of the credit-g dataset

In [2]:
%%capture
# Load the dataset through the project helper and unpack its six return values
# into feature/label pairs for the training, validation and test splits.
# The %%capture magic above suppresses whatever the helper prints while loading.
# NOTE(review): split sizes and preprocessing steps are defined inside
# credit_g_dataset.get_preprocessed_credit_g_dataset — confirm there.
X_train, X_validation, X_test, y_train, y_validation, y_test = get_preprocessed_credit_g_dataset()

## Train the model on the training set and adjust hyperparameters on the validation set

In [3]:
kernel = 'rbf'
# Candidate values for the SVC regularisation parameter C: from 0.1 up to (but
# excluding) 1 in steps of 0.01. np.arange accumulates floating-point error,
# so candidates print as e.g. 0.15999999999999998 rather than 0.16.
C_ls = np.arange(0.1, 1, 0.01)

# Grid search over C: fit on the training split, score on the validation split.
# max_acc starts below any reachable accuracy so the first candidate is always
# accepted. The previous sentinels (max_acc = 0, best_C = 0) would have passed
# the invalid value C=0 to SVC if no candidate scored above zero.
max_acc = -np.inf
best_C = None
for C in C_ls:
    model = SVC(kernel=kernel, C=C, gamma='auto').fit(X_train, y_train)

    # Accuracy is the only selection criterion, so it is the only metric
    # computed per candidate (precision/recall/F1 were computed and discarded).
    accuracy = accuracy_score(y_validation, model.predict(X_validation))

    # Strict '>' keeps the smallest C among candidates with equal accuracy.
    if accuracy > max_acc:
        max_acc = accuracy
        best_C = C


# Refit with the selected C and report the full metric set on the validation split.
model = SVC(kernel=kernel, C=best_C, gamma='auto').fit(X_train, y_train)
y_predicted = model.predict(X_validation)
# Get performance metrics
accuracy = accuracy_score(y_validation, y_predicted)
recall = recall_score(y_validation, y_predicted,  average='binary')
precision = precision_score(y_validation, y_predicted, average='binary')
f1 = f1_score(y_validation, y_predicted, average='binary')

print("Validation Set performance")
print("Best C:", best_C)
print("accuracy =", accuracy)
print("precision =", precision)
print("recall =", recall)
print("f1 =", f1)

Validation Set performance
Best C: 0.15999999999999998
accuracy = 0.7037037037037037
precision = 0.708
recall = 0.9619565217391305
f1 = 0.8156682027649769


## Retrain on the combined training and validation sets, then evaluate on the test set

In [4]:
# Final fit: merge the training and validation splits now that C has been chosen.
# The merged arrays get their own names instead of overwriting X_train / y_train,
# so this cell is idempotent: re-running it no longer appends the validation rows
# a second time, and re-running the tuning cell afterwards still sees the
# original training split rather than data that already contains the validation set.
X_train_full = np.concatenate((X_train, X_validation))
y_train_full = np.concatenate((y_train, y_validation))
model = SVC(kernel=kernel, C=best_C, gamma='auto').fit(X_train_full, y_train_full)
y_predicted = model.predict(X_test)

# Get performance metrics on the held-out test split
accuracy = accuracy_score(y_test, y_predicted)
precision = precision_score(y_test, y_predicted, average='binary')
recall = recall_score(y_test, y_predicted,  average='binary')
f1 = f1_score(y_test, y_predicted, average='binary')

print("accuracy =", accuracy)
print("precision =", precision)
print("recall =", recall)
print("f1 =", f1)

accuracy = 0.68
precision = 0.6813186813186813
recall = 0.9538461538461539
f1 = 0.7948717948717949


In [5]:
# Confusion matrix for the test split: per scikit-learn's convention, rows are
# the true labels and columns are the predicted labels.
confusion_matrix(y_true=y_test, y_pred=y_predicted)

array([[ 6, 29],
       [ 3, 62]], dtype=int64)

In [6]:
# Display the raw test labels so the class balance can be compared with the
# confusion matrix.
y_test

array([0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0,
       1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0,
       1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1,
       0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1])