In [None]:
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_predict
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import fbeta_score, confusion_matrix, roc_auc_score, f1_score, brier_score_loss

from imblearn.pipeline import Pipeline
from imblearn.combine import SMOTEENN

In [60]:
DATA_DIRECTORY = "/home/fehrdelt/data_ssd/data/clinical_data/Full/"

In [61]:
X = df = pd.read_csv(DATA_DIRECTORY+"combined_clinical_data_volumes_outcome_TTS_ANTS_hist_match.csv", usecols=range(2,31))
y = df = pd.read_csv(DATA_DIRECTORY+"combined_clinical_data_volumes_outcome_TTS_ANTS_hist_match.csv", usecols=[31])
#X.head()
#y.head()

In [62]:
nan_indexes = y.loc[pd.isna(y["outcome_neurochir_pic"]), :].index # indexes where there is a nan value.

y = y.dropna()
X_dropped = X.drop(nan_indexes)

y = y['outcome_neurochir_pic'].to_numpy()
y = [int(i) for i in y]

In [None]:
# train models with the best hyperparameters using cross validation
pipeline = Pipeline(steps=[('imputer', SimpleImputer(missing_values=np.nan, strategy='median')), 
                           #('feature_selection', SelectKBest(mutual_info_classif, k=8)), 
                           ('imbalance', SMOTEENN()), 
                           ('model', GradientBoostingClassifier(learning_rate=1.0023, max_depth=3, min_samples_leaf=3, min_samples_split=4, n_estimators=143))])


y_pred = cross_val_predict(pipeline, X_dropped, y, cv=20, method='predict_proba')[:,1]

y_pred_binary = [1 if i>=0.5 else 0 for i in y_pred]

#### Confidence Intervals: Bootstrapping the Test Set Predictions 

https://sebastianraschka.com/blog/2022/confidence-intervals-for-ml.html#method-3-bootstrapping-the-test-set-predictions

Here, we pick our lower and upper confidence bounds as follows:

$\text{Metric}_{lower}=\alpha_{1}\text{th}$ percentile of the $\text{Metric}_{boot}$ distribution; \
$\text{Metric}_{upper}=\alpha_{2}\text{th}$ percentile of the $\text{Metric}_{boot}$ distribution;


Where $\alpha_1 = \alpha$ and $\alpha_2 = 1-\alpha$ \
And $\alpha$ is our degree of confidence to compute the $100*(1−2\alpha)$ confidence interval. \
For instance, to compute a $95\%$ confidence interval, we pick $\alpha=0.025$ to obtain the 2.5th and 97.5th percentiles of the b bootstrap samples distribution as our upper and lower confidence bounds.

In [None]:
rng = np.random.RandomState(seed=12345)
idx = np.arange(len(y))

y_pred = np.asarray(y_pred)
y_pred_binary = np.asarray(y_pred_binary)
y = np.asarray(y)
X = np.asarray(X)
X_dropped = np.asarray(X_dropped)

test_roc_auc = []
test_f1 = []
test_ftwos = []
test_brier = []
test_false_neg = []
test_false_pos = []

for i in range(200): # bootstrap with 200 rounds: random sampling with replacement of the predictions

    pred_idx = rng.choice(idx, size=len(idx), replace=True)
    
    roc_auc_test_boot = roc_auc_score(y_score=y_pred[pred_idx], y_true=y[pred_idx])
    f1_test_boot = f1_score(y_pred=y_pred_binary[pred_idx], y_true=y[pred_idx])
    f2_test_boot = fbeta_score(y_pred=y_pred_binary[pred_idx], y_true=y[pred_idx], beta=2)
    brier_test_boot = brier_score_loss(y_proba=y_pred[pred_idx], y_true=y[pred_idx])
    false_neg_test_boot = confusion_matrix(y[pred_idx], y_pred_binary[pred_idx])[1,0]
    false_pos_test_boot = confusion_matrix(y[pred_idx], y_pred_binary[pred_idx])[0,1]
    
    test_roc_auc.append(roc_auc_test_boot)
    test_f1.append(f1_test_boot)
    test_ftwos.append(f2_test_boot)
    test_brier.append(brier_test_boot)
    test_false_neg.append(false_neg_test_boot/len(idx)*100)
    test_false_pos.append(false_pos_test_boot/len(idx)*100)


In [65]:
print("Classification performance\n")

bootstrap_roc_auc_test_mean = np.mean(test_roc_auc)
ci_lower = np.percentile(test_roc_auc, 2.5)     # 2.5 percentile (alpha=0.025)
ci_upper = np.percentile(test_roc_auc, 97.5)
print(f"ROC AUC:         {bootstrap_roc_auc_test_mean:.2f};   95% CI {ci_lower:.2f}-{ci_upper:.2f}")

bootstrap_f1_test_mean = np.mean(test_f1)
ci_lower = np.percentile(test_f1, 2.5)
ci_upper = np.percentile(test_f1, 97.5)
print(f"F1:              {bootstrap_f1_test_mean:.2f};   95% CI {ci_lower:.2f}-{ci_upper:.2f}")

bootstrap_f2_test_mean = np.mean(test_ftwos)
ci_lower = np.percentile(test_ftwos, 2.5)
ci_upper = np.percentile(test_ftwos, 97.5)
print(f"F2:              {bootstrap_f2_test_mean:.2f};   95% CI {ci_lower:.2f}-{ci_upper:.2f}")

bootstrap_brier_test_mean = np.mean(test_brier)
ci_lower = np.percentile(test_brier, 2.5)
ci_upper = np.percentile(test_brier, 97.5)
print(f"Brier loss:      {bootstrap_brier_test_mean:.2f};   95% CI {ci_lower:.2f}-{ci_upper:.2f}")

bootstrap_false_neg_test_mean = np.mean(test_false_neg)
ci_lower = np.percentile(test_false_neg, 2.5)
ci_upper = np.percentile(test_false_neg, 97.5)
print(f"False negatives: {bootstrap_false_neg_test_mean:.2f}%;  95% CI {ci_lower:.2f}-{ci_upper:.2f}")

bootstrap_false_pos_test_mean = np.mean(test_false_pos)
ci_lower = np.percentile(test_false_pos, 2.5)
ci_upper = np.percentile(test_false_pos, 97.5)
print(f"False negatives: {bootstrap_false_pos_test_mean:.2f}%; 95% CI {ci_lower:.2f}-{ci_upper:.2f}")

Classification performance

ROC AUC:         0.89;   95% CI 0.86-0.92
F1:              0.45;   95% CI 0.34-0.55
F2:              0.55;   95% CI 0.44-0.67
Brier loss:      0.13;   95% CI 0.10-0.15
False negatives: 3.06%;  95% CI 1.41-4.65
False negatives: 10.49%; 95% CI 7.88-13.33
