In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import optuna
import time
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from credit_g_dataset import get_preprocessed_credit_g_dataset

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 6)

# Load and pre-process version 1 of the dataset credit-g

In [2]:
%%capture
X_train, X_validation, X_test, y_train, y_validation, y_test = get_preprocessed_credit_g_dataset()

## Train the model using the training set and adjust hyperparameters with Optuna using the validation set

In [3]:
def objective(trial):
    params = {
    "n_neighbors": trial.suggest_int("n_neighbors", 10, 250, step = 10),
    "weights": trial.suggest_categorical("weights", ["uniform", "distance"]),
    "algorithm": trial.suggest_categorical("algorithm", ["ball_tree", "kd_tree", "brute"]),
    "metric": trial.suggest_categorical("metric", ["cityblock", "euclidean", "l1", "l2", "manhattan"]),
    "n_jobs": -1,
    }
    
    model = KNeighborsClassifier(**params)
    model.fit(X_train, y_train)
    score=model.score(X_validation, y_validation)
    
    return score


In [4]:
start_time = time.time()

study = optuna.create_study(direction = "maximize")
study.optimize(objective, n_trials=100)

total_time = time.time()-start_time
print(f"total_time={total_time}")
print(study.best_params)
print(study.best_value)

[I 2023-12-20 16:45:42,417] A new study created in memory with name: no-name-d6e55862-8f48-4466-9312-dbefe0379f86
[I 2023-12-20 16:45:42,560] Trial 0 finished with value: 0.6925925925925925 and parameters: {'n_neighbors': 220, 'weights': 'uniform', 'algorithm': 'brute', 'metric': 'l2'}. Best is trial 0 with value: 0.6925925925925925.
[I 2023-12-20 16:45:42,600] Trial 1 finished with value: 0.6925925925925925 and parameters: {'n_neighbors': 220, 'weights': 'uniform', 'algorithm': 'kd_tree', 'metric': 'euclidean'}. Best is trial 0 with value: 0.6925925925925925.
[I 2023-12-20 16:45:42,625] Trial 2 finished with value: 0.7 and parameters: {'n_neighbors': 90, 'weights': 'distance', 'algorithm': 'ball_tree', 'metric': 'euclidean'}. Best is trial 2 with value: 0.7.
[I 2023-12-20 16:45:42,657] Trial 3 finished with value: 0.6925925925925925 and parameters: {'n_neighbors': 200, 'weights': 'uniform', 'algorithm': 'ball_tree', 'metric': 'manhattan'}. Best is trial 2 with value: 0.7.
[I 2023-12-2

total_time=3.853107213973999
{'n_neighbors': 10, 'weights': 'uniform', 'algorithm': 'kd_tree', 'metric': 'manhattan'}
0.7703703703703704


In [5]:
random_state = 50
best_n_neighbors = study.best_params["n_neighbors"]
best_weights = study.best_params["weights"]
best_algorithm = study.best_params["algorithm"]
best_metric = study.best_params["metric"]

In [6]:
model = KNeighborsClassifier(n_neighbors=best_n_neighbors, weights=best_weights, algorithm=best_algorithm, metric=best_metric)

## Run on the test set and use training and validation sets for training

In [7]:
X_train_valid = np.concatenate((X_train, X_validation))
y_train_valid = np.concatenate((y_train, y_validation))

model.fit(X_train_valid, y_train_valid)
print(model.score(X_test, y_test))
y_predicted = model.predict(X_test)

(tn, fp, fn, tp) = confusion_matrix(y_test, y_predicted).ravel()
precision_val = float(tp)/float(tp+fp)
recall_val = float(tp)/float(tp+fn)
specificity_val = float(tn)/float(tn+fp)
accuracy_val = float(tp+tn)/float(tn+fp+fn+tp)
f1_val = (2*tp)/(2*tp+fp+fn)
mcc_val = float((tp*tn)-(fp*fn))/np.sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn))


print("\n")
print("Test set precision_val =", precision_val)
print("Test set recall_val =", recall_val)
print("Test set specificity =", specificity_val)
print("Test set mcc_val =", mcc_val)
print("Test set accuracy_val =", accuracy_val)
print("Test set f1_val =", f1_val)

0.71


Test set precision_val = 0.8026315789473685
Test set recall_val = 0.8133333333333334
Test set pecificity = 0.4
Test set mcc_val = 0.21629522817435004
Test set accuracy_val = 0.71
Test set f1_val = 0.8079470198675497


In [8]:
confusion_matrix(y_test, y_predicted)

array([[10, 15],
       [14, 61]], dtype=int64)

In [9]:
y_test

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1])