Generate each set

In [None]:
# import data.xgb_data_preprocessing
# from data.xgb_data_preprocessing import generate_dataset, aneurisk_aneurist, hug2016, hug2016snf

Run the following calls from the `data.xgb_data_preprocessing` module to generate each dataset (they are left commented out here):

In [None]:
# generate_dataset(which_df=aneurisk_aneurist, cut_type='ninja')
# generate_dataset(which_df=hug2016, cut_type='ninja')
# generate_dataset(which_df=hug2016snf, cut_type='ninja')

Read in train set

In [None]:
# Dataset variant tag; it is interpolated into the data, checkpoint and result file names below.
cut_type = 'ninja'

In [None]:
import numpy as np
import xgboost as xgb

In [None]:
# Locate the saved training arrays for the selected cut type, then load them.
X_train_path = f'./data/xgb/{cut_type}_X_train_ndarray.npy'
y_train_path = f'./data/xgb/{cut_type}_y_train_ndarray.npy'

X_train = np.load(file=X_train_path)
y_train = np.load(file=y_train_path)

In [None]:
# X_val = np.load(file=f'./data/xgb/{cut_type}_X_val_ndarray.npy')
# y_val = np.load(file=f'./data/xgb/{cut_type}_y_val_ndarray.npy')

In [None]:
# Display the dimensions of the training feature array as a quick sanity check.
X_train.shape

In [None]:
# Display the dimensions of the training label array as a quick sanity check.
y_train.shape

Construct model

In [None]:
# Turn on XGBoost's global `use_rmm` option and verify that the setting took effect
# before any model is built.
xgb.set_config(use_rmm=True)
current_cfg = xgb.get_config()
assert current_cfg['use_rmm'] is True

# Base estimator settings; the grid search later varies learning_rate, max_depth and subsample.
# Options tried previously and left out: 'num_parallel_tree': 8, 'n_jobs': -1, 'booster': 'gblinear'
# NOTE(review): 'gpu_hist' is reported as deprecated in newer XGBoost releases — confirm
# against the installed version before upgrading.
parameters = {
    'verbosity': 1,
    'random_state': 7,
    'eval_metric': 'auc',
    'tree_method': 'gpu_hist',
    'objective': 'binary:logistic',
}

model = xgb.XGBClassifier(**parameters)

Grid search over hyperparameters (5-fold cross-validation) -> find the best model

In [None]:
import time
from sklearn.model_selection import GridSearchCV
from joblib import dump, load

In [None]:
# Exhaustive grid search over learning rate, tree depth and row subsampling, scored by
# ROC AUC. `cv` is not passed, so scikit-learn's default splitter is used (the section
# heading states 5-fold). The refitted best estimator is saved to ./checkpoints.
import os  # cell-local import: needed to create the checkpoint directory below

# Use a wall-clock timer. time.process_time() only counts CPU time of the current
# process, so it would miss work done in joblib worker processes (n_jobs=-1) and any
# time spent waiting on the GPU.
tic = time.perf_counter()
# NOTE(review): n_jobs=-1 runs fits in parallel while the estimator uses a GPU tree
# method — confirm the parallel fits do not oversubscribe the device.
grid_search = GridSearchCV(estimator=model,
                       param_grid={'learning_rate': [0.025, 0.05, 0.075],
                                   'max_depth': [3, 5],
                                   'subsample': [0.8, 0.85, 0.9, 0.95]},
                       scoring='roc_auc', n_jobs=-1, verbose=1, error_score='raise')  # n_jobs, pre_dispatch, random_state?
print('********************** Fitting model on train set **********************')
grid_search.fit(X=X_train, y=y_train)
toc = time.perf_counter()
processed_time = toc - tic  # elapsed seconds, including the final refit on the full train set
print('Fitting time = ' + str(processed_time) + ' s.')
print('Note: The number of cross-validation splits: ' + str(grid_search.n_splits_))
print('Note: Seconds used for refitting the best model on the whole dataset: ' + str(grid_search.refit_time_))
print('Mean cross-validated AUC score of the best_estimator: ')
print(grid_search.best_score_)
print('Note: Parameter setting that gave the best results on the hold out data:')
print(grid_search.best_params_)
best_model = grid_search.best_estimator_
# Create the target directory first so dump() does not fail on a fresh checkout.
os.makedirs(name='./checkpoints', exist_ok=True)
dump(value=best_model, filename=f'./checkpoints/best_xgb_model_{cut_type}.joblib')

Test model

Read test set

In [None]:
# Locate the saved test arrays for the selected cut type, then load them.
X_test_path = f'./data/xgb/{cut_type}_X_test_ndarray.npy'
y_test_path = f'./data/xgb/{cut_type}_y_test_ndarray.npy'

X_test = np.load(file=X_test_path)
y_test = np.load(file=y_test_path)

In [None]:
# Reload the estimator saved by the grid-search cell, so testing can start from the checkpoint.
# joblib files are pickle-based: only load checkpoints you produced yourself.
best_model_path = f'./checkpoints/best_xgb_model_{cut_type}.joblib'
best_model = load(filename=best_model_path)

In [None]:
# Predict class labels for the test set and report how long inference took.
# A wall-clock timer is used because time.process_time() only counts CPU time of the
# current process and would under-report time spent waiting on the GPU.
tic = time.perf_counter()
y_hat = best_model.predict(X_test)
toc = time.perf_counter()
processed_time = toc - tic  # elapsed seconds
print('Predicting time = ' + str(processed_time) + ' s.')

Result

Classification Report

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Build the per-class precision / recall / F1 summary, then print it.
report = classification_report(y_true=y_test, y_pred=y_hat)
print(report)

Feature Importance

In [None]:
# Import from the public NumPy namespace. The previous imports reached into
# numpy.lib.function_base and numpy.core.fromnumeric, which are private module paths
# and not a stable API across NumPy releases.
from numpy import argsort, flip

# Indices of the ten features with the highest importance, most important first:
# argsort orders ascending, flip reverses that order, and the slice keeps the first ten.
top_10_features=flip(argsort(best_model.feature_importances_),axis=0)[:10]
print(top_10_features)

Figure

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score
import os
import pickle

In [None]:
# Predict probabilities of the positive class
# Probability assigned to the positive class (second column of predict_proba)
y_probs = best_model.predict_proba(X_test)[:, 1]

# Area under the ROC curve, and the curve points themselves
roc_auc = roc_auc_score(y_test, y_probs)
fpr, tpr, thresholds = roc_curve(y_test, y_probs)

# NOTE(review): roc_data is not used in this cell; the pickle below stores the bare tuple.
roc_data = [(fpr, tpr, roc_auc)]

# Save the curve so it can be re-plotted later without re-running the model
os.makedirs(name='./results', exist_ok=True)
with open(f'./results/xgb_roc_data_{cut_type}.pkl', 'wb') as f:
    pickle.dump((fpr, tpr, roc_auc), f)

In [None]:
# with open('results/xgb_roc_data_ninja.pkl', 'rb') as f:
#         fpr, tpr, roc_auc = pickle.load(f)
# Plot ROC curve
# Draw the ROC curve on an explicit Axes object
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (AUC = {:.2f})'.format(roc_auc))
# Diagonal from (0, 0) to (1, 1) as a visual reference line
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC)')
ax.legend(loc="lower right")

# Show the plot
plt.show()