### Libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.impute import SimpleImputer
from sklearn.manifold import TSNE
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate, RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE, SMOTENC
from imblearn.under_sampling import RandomUnderSampler

%matplotlib inline

### Loading and preparing the dataset

### Utility

In [None]:
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.metrics import confusion_matrix

# F-beta scorer with beta=2 (recall counts more than precision); used both for
# the grid-search refit criterion and as a reported metric.
ftwo_scorer = make_scorer(fbeta_score, beta=2)

def confusion_matrix_scorer(clf, X, y):
    """Return the four confusion-matrix counts for a fitted binary classifier.

    Parameters
    ----------
    clf : fitted estimator exposing ``predict``.
    X : features of the evaluation samples.
    y : true labels, encoded as 0 (negative) and 1 (positive).

    Returns
    -------
    dict with keys ``'tn'``, ``'fp'``, ``'fn'`` and ``'tp'``.
    """
    y_pred = clf.predict(X)
    # labels=[0, 1] pins the 2x2 layout: without it a fold in which only one
    # class shows up yields a 1x1 matrix and the indexing below fails.
    cm = confusion_matrix(y, y_pred, labels=[0, 1])

    return {'tn': cm[0, 0], 'fp': cm[0, 1],
            'fn': cm[1, 0], 'tp': cm[1, 1]}

def false_neg_scorer(clf, X, y):
    """Count the false negatives (true label 1, predicted 0) of ``clf`` on ``X``.

    Parameters
    ----------
    clf : fitted estimator exposing ``predict``.
    X : features of the evaluation samples.
    y : true labels, encoded as 0 (negative) and 1 (positive).

    Returns
    -------
    The number of positive samples predicted as negative.
    """
    y_pred = clf.predict(X)
    # labels=[0, 1] keeps the matrix 2x2 even if one class is absent from the
    # fold, so cm[1, 0] is always the false-negative cell.
    cm = confusion_matrix(y, y_pred, labels=[0, 1])

    return cm[1, 0]

def false_pos_scorer(clf, X, y):
    """Count the false positives (true label 0, predicted 1) of ``clf`` on ``X``.

    Parameters
    ----------
    clf : fitted estimator exposing ``predict``.
    X : features of the evaluation samples.
    y : true labels, encoded as 0 (negative) and 1 (positive).

    Returns
    -------
    The number of negative samples predicted as positive.
    """
    y_pred = clf.predict(X)
    # labels=[0, 1] keeps the matrix 2x2 even if one class is absent from the
    # fold, so cm[0, 1] is always the false-positive cell.
    cm = confusion_matrix(y, y_pred, labels=[0, 1])

    return cm[0, 1]

In [None]:
# Number of folds used by the inner cross-validation; also used further down
# to estimate the size of one test fold when converting counts to percentages.
FOLDS = 5
# NOTE(review): N_REPEATS is not referenced by any cell visible in this
# notebook (the CV splitters hard-code their own n_repeats) - confirm it is needed.
N_REPEATS = 3
# NOTE(review): no earlier cell assigns `y`, so on a fresh kernel this line
# looks like it raises NameError; the value is also never refreshed after rows
# are dropped from y in later sections - confirm and move/recompute accordingly.
nb_total_samples = len(y)

In [None]:
# Prefix prepended to every CSV file name; the empty string means the files
# are read from the current working directory.
DATA_DIRECTORY = ""

In [None]:
# Whole cleaned table; the first line of the file is skipped so that the
# second line is used as the header.
cleaned_data_full = pd.read_csv(
    DATA_DIRECTORY + "cleaned_data_full_bis.csv",
    skiprows=1,
)
cleaned_data_full.head()

In [None]:
# Feature matrix: the columns at positions 71..78 of the cleaned table.
X_volumes = pd.read_csv(
    DATA_DIRECTORY + "cleaned_data_full_bis.csv",
    skiprows=1,
    usecols=range(71, 79),
)
X_volumes.head()

In [None]:
# Marshal preprocessing: turn the textual labels into integer grades.
marshal_grade_by_label = {
    "I diffuse injury (no visible pathology)": 1,
    "II diffuse injury (midline shift <5mm, basal cisterns visible,no high or mixed density lesion > 25 cm3)": 2,
    "III diffuse injury (swelling, midline shift of 0 to 5 mm, basal cisterns compressed or completely effaced, no high or mixed density lesions >25 cm3)": 3,
    "IV diffuse injury (midline shift >5 mm, no high or mixed density lesions >25 cm3)": 4,
    # Same label as above, spelled without the space after "IV".
    "IVdiffuse injury (midline shift >5 mm, no high or mixed density lesions >25 cm3)": 4,
    "V evacuated mass lesion (any lesion evacuated surgically)": 5,
    "VI non-evacuated mass lesion (high or mixed density lesions >25 cm3, not surgically evacuated)": 6,
}
# Grades that were stored as the digit strings "0".."6".
marshal_grade_by_label.update({str(grade): grade for grade in range(7)})

marshal_grades = X_volumes['Marshal'].replace(marshal_grade_by_label)

# Missing values and grade 0 are both replaced by grade 1.
X_volumes['Marshal'] = marshal_grades.apply(
    lambda grade: grade if not (pd.isna(grade) or grade == 0) else 1
)

print(X_volumes['Marshal'].value_counts())

In [None]:
# Outcome column: position 90 of the cleaned table.
y = pd.read_csv(
    DATA_DIRECTORY + "cleaned_data_full_bis.csv",
    usecols=[90],
    skiprows=1,
)
y.head()

In [None]:
# Replace every remaining missing feature value with 0.
# NOTE(review): this runs before the rows with a missing outcome are dropped
# from X_volumes in the "Mortalité J7" section, so the imputed frame may hold
# more rows than y when both are passed to cross_validate - confirm.
X_volumes_imputed = X_volumes.fillna(0)

### Mortalité J7

In [None]:
# Row labels whose day-7 outcome is missing (NaN) or recorded as "nd".
nan_and_nd_indexes = y.loc[(pd.isna(y["mortalité J7"])) | (y["mortalité J7"] == "nd"), :].index
print(nan_and_nd_indexes)

y = y.drop(nan_and_nd_indexes)
X_volumes = X_volumes.drop(nan_and_nd_indexes)

# Re-impute after filtering: the frame imputed earlier still contains the rows
# removed above, and the model needs exactly one feature row per label.
X_volumes_imputed = X_volumes.fillna(0)

In [None]:
# Cast the outcome to numbers; anything that cannot be parsed becomes NaN.
j7_numeric = pd.to_numeric(y["mortalité J7"], errors="coerce")
y["mortalité J7"] = j7_numeric

# Number of outcome events (rows equal to 1).
event_count = y.eq(1.00).sum()
print(f"outcome events : {event_count}")

In [None]:
# Flatten the outcome column into a plain list of Python ints.
y = [int(label) for label in y['mortalité J7'].to_numpy()]

#### Gradient boosting (HistGradientBoostingClassifier) with hyperparameter grid search

In [None]:
# SMOTE over-sampling -> random under-sampling -> gradient boosting.
# Every stochastic step is seeded so that a re-run reproduces the same scores.
pipeline_smote_under = Pipeline(steps=[
    ('over', SMOTE(random_state=42)),
    ('under', RandomUnderSampler(sampling_strategy=0.5, random_state=42)),
    ('model', HistGradientBoostingClassifier(random_state=42)),
])

# The features must line up row-for-row with the labels.
assert len(X_volumes_imputed) == len(y), "X_volumes_imputed and y have different numbers of rows"

# Inner loop: hyperparameter selection on F2.
inner_cv = RepeatedStratifiedKFold(n_splits=FOLDS, n_repeats=5, random_state=1)

p_grid = {
    "model__learning_rate": [0.01, 0.05, 0.08, 0.1, 0.2, 0.3, 0.5, 1],
    "over__sampling_strategy": [0.1, 0.2, 0.3],
    "over__k_neighbors": [3, 5, 8],
    "under__sampling_strategy": [0.3, 0.5, 0.7],
}
clf = GridSearchCV(estimator=pipeline_smote_under, param_grid=p_grid, scoring={'F2': ftwo_scorer}, refit='F2', cv=inner_cv)

# Outer loop: performance estimate of the tuned model.
outer_cv = RepeatedStratifiedKFold(n_splits=FOLDS, n_repeats=10, random_state=42)

nested_scores_smote_undersampling = cross_validate(
    clf, X_volumes_imputed, y,
    scoring={'F2': ftwo_scorer, 'ROC_AUC': 'roc_auc', 'Recall': 'recall_macro', 'F1': 'f1',
             'Brier': "neg_brier_score", 'False_neg_scorer': false_neg_scorer, 'False_pos_scorer': false_pos_scorer},
    cv=outer_cv, n_jobs=-1,
)

print("Prehospital data & segmentation volumes: HistGradientBoostingClassifier with hyperparameter gridsearch")

roc_auc_metric = np.mean(nested_scores_smote_undersampling["test_ROC_AUC"])
roc_auc_metric_std = np.std(nested_scores_smote_undersampling["test_ROC_AUC"])
print(f'AUC (max): {np.round(roc_auc_metric, 2)} +- {np.round(roc_auc_metric_std, 2)}')

f1_score = np.mean(nested_scores_smote_undersampling["test_F1"])
f1_score_std = np.std(nested_scores_smote_undersampling["test_F1"])
print(f'F1 Score (max): {np.round(f1_score, 2)} +- {np.round(f1_score_std, 2)}')

f2_score = np.mean(nested_scores_smote_undersampling["test_F2"])
f2_score_std = np.std(nested_scores_smote_undersampling["test_F2"])
print(f'F2 Score (max): {np.round(f2_score, 2)} +- {np.round(f2_score_std, 2)}')

# The scorer returns the negated Brier score, so only the mean is sign-flipped;
# a standard deviation is non-negative and must not be negated.
brier_score = -np.mean(nested_scores_smote_undersampling["test_Brier"])
brier_score_std = np.std(nested_scores_smote_undersampling["test_Brier"])
print(f'Brier Score (min): {np.round(brier_score, 2)} +- {np.round(brier_score_std, 2)}')

# The FN/FP scorers return counts per test fold; divide by the approximate size
# of one test fold of the data actually evaluated here to obtain a percentage.
test_fold_size = len(y) / FOLDS

false_neg_score = np.mean(nested_scores_smote_undersampling["test_False_neg_scorer"]) * 100 / test_fold_size
false_neg_score_std = np.std(nested_scores_smote_undersampling["test_False_neg_scorer"]) * 100 / test_fold_size
print(f'False negative: {int(np.round(false_neg_score, 0))}% +- {int(np.round(false_neg_score_std, 0))}')

false_pos_score = np.mean(nested_scores_smote_undersampling["test_False_pos_scorer"]) * 100 / test_fold_size
false_pos_score_std = np.std(nested_scores_smote_undersampling["test_False_pos_scorer"]) * 100 / test_fold_size
print(f'False positive: {int(np.round(false_pos_score, 0))}% +- {int(np.round(false_pos_score_std, 0))}')


### Mortalité J30

In [None]:
# Load the mortality columns.
# NOTE(review): this cell reads "cleaned_data_full.csv" at column positions
# 95-97, whereas the other sections read "cleaned_data_full_bis.csv" - confirm
# that both files list the patients in the same row order.
mortality_column = pd.read_csv(DATA_DIRECTORY + "cleaned_data_full.csv", skiprows=1, usecols=[95, 96, 97])

# A death recorded at day 7 also counts as a death at day 30.
# The comparison is numeric so that 1, 1.0, "1" and " 1 " are all recognised;
# unparseable entries (e.g. "nd") and missing values compare as False.
died_by_day_7 = pd.to_numeric(
    mortality_column.iloc[:, 0].astype(str).str.strip(), errors="coerce"
) == 1

j30_column = mortality_column.columns[1]
mortality_column[j30_column] = mortality_column[j30_column].mask(died_by_day_7, 1)

# Keep only the day-30 column; .copy() makes y an independent frame so the
# assignment below does not write into a slice of mortality_column.
y = mortality_column[[j30_column]].copy()

# Convert to numbers; anything that cannot be parsed becomes NaN.
y["mortalité J30"] = pd.to_numeric(y["mortalité J30"], errors="coerce")

# Number of outcome events (rows equal to 1).
event_count = (y == 1).sum()
print(f"outcome events : {event_count}")

In [None]:
# Row labels whose day-30 outcome is missing (NaN) or recorded as "nd".
nan_and_nd_indexes = y.loc[(pd.isna(y["mortalité J30"])) | (y["mortalité J30"] == "nd"), :].index
print(nan_and_nd_indexes)

y = y.drop(nan_and_nd_indexes)

# X_volumes is not reloaded in this section and already lost rows in the
# "Mortalité J7" section, so dropping the same labels again could raise
# KeyError. Keep the rows present on both sides instead, then rebuild the
# imputed features so they match y row-for-row.
# NOTE(review): patients removed in the J7 section stay excluded here; reload
# X_volumes before this cell if they should be part of the J30 analysis.
common_indexes = y.index.intersection(X_volumes.index)
y = y.loc[common_indexes]
X_volumes = X_volumes.loc[common_indexes]
X_volumes_imputed = X_volumes.fillna(0)

# Flatten the outcome column into a plain list of Python ints.
y = y['mortalité J30'].to_numpy()
y = [int(i) for i in y]

In [None]:
# Gradient boosting (HistGradientBoostingClassifier) behind SMOTE over-sampling
# and random under-sampling. Every stochastic step is seeded so that a re-run
# reproduces the same scores.
pipeline_smote_under = Pipeline(steps=[
    ('over', SMOTE(random_state=42)),
    ('under', RandomUnderSampler(sampling_strategy=0.5, random_state=42)),
    ('model', HistGradientBoostingClassifier(random_state=42)),
])

# The features must line up row-for-row with the labels.
assert len(X_volumes_imputed) == len(y), "X_volumes_imputed and y have different numbers of rows"

# Inner loop: hyperparameter selection on F2.
inner_cv = RepeatedStratifiedKFold(n_splits=FOLDS, n_repeats=5, random_state=1)

p_grid = {
    "model__learning_rate": [0.01, 0.05, 0.08, 0.1, 0.2, 0.3, 0.5, 1],
    "over__sampling_strategy": [0.1, 0.2, 0.3],
    "over__k_neighbors": [3, 5, 8],
    "under__sampling_strategy": [0.3, 0.5, 0.7],
}
clf = GridSearchCV(estimator=pipeline_smote_under, param_grid=p_grid, scoring={'F2': ftwo_scorer}, refit='F2', cv=inner_cv)

# Outer loop: performance estimate of the tuned model.
outer_cv = RepeatedStratifiedKFold(n_splits=FOLDS, n_repeats=10, random_state=42)

nested_scores_smote_undersampling = cross_validate(
    clf, X_volumes_imputed, y,
    scoring={'F2': ftwo_scorer, 'ROC_AUC': 'roc_auc', 'Recall': 'recall_macro', 'F1': 'f1',
             'Brier': "neg_brier_score", 'False_neg_scorer': false_neg_scorer, 'False_pos_scorer': false_pos_scorer},
    cv=outer_cv, n_jobs=-1,
)

print("Prehospital data & segmentation volumes: HistGradientBoostingClassifier with hyperparameter gridsearch")

roc_auc_metric = np.mean(nested_scores_smote_undersampling["test_ROC_AUC"])
roc_auc_metric_std = np.std(nested_scores_smote_undersampling["test_ROC_AUC"])
print(f'AUC (max): {np.round(roc_auc_metric, 2)} +- {np.round(roc_auc_metric_std, 2)}')

f1_score = np.mean(nested_scores_smote_undersampling["test_F1"])
f1_score_std = np.std(nested_scores_smote_undersampling["test_F1"])
print(f'F1 Score (max): {np.round(f1_score, 2)} +- {np.round(f1_score_std, 2)}')

f2_score = np.mean(nested_scores_smote_undersampling["test_F2"])
f2_score_std = np.std(nested_scores_smote_undersampling["test_F2"])
print(f'F2 Score (max): {np.round(f2_score, 2)} +- {np.round(f2_score_std, 2)}')

# The scorer returns the negated Brier score, so only the mean is sign-flipped;
# a standard deviation is non-negative and must not be negated.
brier_score = -np.mean(nested_scores_smote_undersampling["test_Brier"])
brier_score_std = np.std(nested_scores_smote_undersampling["test_Brier"])
print(f'Brier Score (min): {np.round(brier_score, 2)} +- {np.round(brier_score_std, 2)}')

# The FN/FP scorers return counts per test fold; divide by the approximate size
# of one test fold of the data actually evaluated here to obtain a percentage.
test_fold_size = len(y) / FOLDS

false_neg_score = np.mean(nested_scores_smote_undersampling["test_False_neg_scorer"]) * 100 / test_fold_size
false_neg_score_std = np.std(nested_scores_smote_undersampling["test_False_neg_scorer"]) * 100 / test_fold_size
print(f'False negative: {int(np.round(false_neg_score, 0))}% +- {int(np.round(false_neg_score_std, 0))}')

false_pos_score = np.mean(nested_scores_smote_undersampling["test_False_pos_scorer"]) * 100 / test_fold_size
false_pos_score_std = np.std(nested_scores_smote_undersampling["test_False_pos_scorer"]) * 100 / test_fold_size
print(f'False positive: {int(np.round(false_pos_score, 0))}% +- {int(np.round(false_pos_score_std, 0))}')

### Mortalité 6 mois

In [None]:
# Outcome column: position 92 of the cleaned table.
y = pd.read_csv(
    DATA_DIRECTORY + "cleaned_data_full_bis.csv",
    usecols=[92],
    skiprows=1,
)
y.head()

# Fresh copy of the feature columns (positions 71..78) for this section.
X_volumes = pd.read_csv(
    DATA_DIRECTORY + "cleaned_data_full_bis.csv",
    skiprows=1,
    usecols=range(71, 79),
)
X_volumes.head()

In [None]:
# Marshal preprocessing: turn the textual labels into integer grades.
marshal_grade_by_label = {
    "I diffuse injury (no visible pathology)": 1,
    "II diffuse injury (midline shift <5mm, basal cisterns visible,no high or mixed density lesion > 25 cm3)": 2,
    "III diffuse injury (swelling, midline shift of 0 to 5 mm, basal cisterns compressed or completely effaced, no high or mixed density lesions >25 cm3)": 3,
    "IV diffuse injury (midline shift >5 mm, no high or mixed density lesions >25 cm3)": 4,
    # Same label as above, spelled without the space after "IV".
    "IVdiffuse injury (midline shift >5 mm, no high or mixed density lesions >25 cm3)": 4,
    "V evacuated mass lesion (any lesion evacuated surgically)": 5,
    "VI non-evacuated mass lesion (high or mixed density lesions >25 cm3, not surgically evacuated)": 6,
}
# Grades that were stored as the digit strings "0".."6".
marshal_grade_by_label.update({str(grade): grade for grade in range(7)})

marshal_grades = X_volumes['Marshal'].replace(marshal_grade_by_label)

# Missing values and grade 0 are both replaced by grade 1.
X_volumes['Marshal'] = marshal_grades.apply(
    lambda grade: grade if not (pd.isna(grade) or grade == 0) else 1
)

print(X_volumes['Marshal'].value_counts())

In [None]:
# Row labels whose 6-month outcome is missing (NaN) or recorded as "nd".
outcome_6m = y["Mortalité 6 mois"]
outcome_unusable = outcome_6m.isna() | (outcome_6m == "nd")
nan_and_nd_indexes = y.index[outcome_unusable]
print(nan_and_nd_indexes)

# Remove those patients from both the features and the labels.
X_volumes = X_volumes.drop(nan_and_nd_indexes)
y = y.drop(nan_and_nd_indexes)

In [None]:
# Cast the outcome to numbers; anything that cannot be parsed becomes NaN.
six_month_numeric = pd.to_numeric(y["Mortalité 6 mois"], errors="coerce")
y["Mortalité 6 mois"] = six_month_numeric

# Number of outcome events (rows equal to 1).
event_count = y.eq(1.00).sum()
print(f"outcome events : {event_count}")

In [None]:
# Flatten the outcome column into a plain list of Python ints.
y = [int(label) for label in y['Mortalité 6 mois'].to_numpy()]

In [None]:
# Replace every remaining missing feature value with 0 (after the rows with a
# missing outcome have been dropped from X_volumes).
X_volumes_imputed = X_volumes.fillna(0)

In [None]:
# y has already been converted to a plain list of ints, so it can no longer be
# indexed by column name; np.unique lists the distinct labels instead.
print(np.unique(y))

In [None]:
# SMOTE over-sampling -> random under-sampling -> gradient boosting.
# Every stochastic step is seeded so that a re-run reproduces the same scores.
pipeline_smote_under = Pipeline(steps=[
    ('over', SMOTE(random_state=42)),
    ('under', RandomUnderSampler(sampling_strategy=0.5, random_state=42)),
    ('model', HistGradientBoostingClassifier(random_state=42)),
])

# The features must line up row-for-row with the labels.
assert len(X_volumes_imputed) == len(y), "X_volumes_imputed and y have different numbers of rows"

# Inner loop: hyperparameter selection on F2.
inner_cv = RepeatedStratifiedKFold(n_splits=FOLDS, n_repeats=5, random_state=1)

p_grid = {
    "model__learning_rate": [0.01, 0.05, 0.08, 0.1, 0.2, 0.3, 0.5, 1],
    "over__sampling_strategy": [0.1, 0.2, 0.3],
    "over__k_neighbors": [3, 5, 8],
    "under__sampling_strategy": [0.3, 0.5, 0.7],
}
clf = GridSearchCV(estimator=pipeline_smote_under, param_grid=p_grid, scoring={'F2': ftwo_scorer}, refit='F2', cv=inner_cv)

# Outer loop: performance estimate of the tuned model.
outer_cv = RepeatedStratifiedKFold(n_splits=FOLDS, n_repeats=10, random_state=42)

nested_scores_smote_undersampling = cross_validate(
    clf, X_volumes_imputed, y,
    scoring={'F2': ftwo_scorer, 'ROC_AUC': 'roc_auc', 'Recall': 'recall_macro', 'F1': 'f1',
             'Brier': "neg_brier_score", 'False_neg_scorer': false_neg_scorer, 'False_pos_scorer': false_pos_scorer},
    cv=outer_cv, n_jobs=-1,
)

print("segmentation volumes: HistGradientBoostingClassifier with hyperparameter gridsearch")

roc_auc_metric = np.mean(nested_scores_smote_undersampling["test_ROC_AUC"])
roc_auc_metric_std = np.std(nested_scores_smote_undersampling["test_ROC_AUC"])
print(f'AUC (max): {np.round(roc_auc_metric, 2)} +- {np.round(roc_auc_metric_std, 2)}')

f1_score = np.mean(nested_scores_smote_undersampling["test_F1"])
f1_score_std = np.std(nested_scores_smote_undersampling["test_F1"])
print(f'F1 Score (max): {np.round(f1_score, 2)} +- {np.round(f1_score_std, 2)}')

f2_score = np.mean(nested_scores_smote_undersampling["test_F2"])
f2_score_std = np.std(nested_scores_smote_undersampling["test_F2"])
print(f'F2 Score (max): {np.round(f2_score, 2)} +- {np.round(f2_score_std, 2)}')

# The scorer returns the negated Brier score, so only the mean is sign-flipped;
# a standard deviation is non-negative and must not be negated.
brier_score = -np.mean(nested_scores_smote_undersampling["test_Brier"])
brier_score_std = np.std(nested_scores_smote_undersampling["test_Brier"])
print(f'Brier Score (min): {np.round(brier_score, 2)} +- {np.round(brier_score_std, 2)}')

# The FN/FP scorers return counts per test fold; divide by the approximate size
# of one test fold of the data actually evaluated here to obtain a percentage.
test_fold_size = len(y) / FOLDS

false_neg_score = np.mean(nested_scores_smote_undersampling["test_False_neg_scorer"]) * 100 / test_fold_size
false_neg_score_std = np.std(nested_scores_smote_undersampling["test_False_neg_scorer"]) * 100 / test_fold_size
print(f'False negative: {int(np.round(false_neg_score, 0))}% +- {int(np.round(false_neg_score_std, 0))}')

false_pos_score = np.mean(nested_scores_smote_undersampling["test_False_pos_scorer"]) * 100 / test_fold_size
false_pos_score_std = np.std(nested_scores_smote_undersampling["test_False_pos_scorer"]) * 100 / test_fold_size
print(f'False positive: {int(np.round(false_pos_score, 0))}% +- {int(np.round(false_pos_score_std, 0))}')


### TILSUM

In [None]:
# Fresh copy of the feature columns (positions 71..78) for this section.
X_volumes = pd.read_csv(
    DATA_DIRECTORY + "cleaned_data_full_bis.csv",
    skiprows=1,
    usecols=range(71, 79),
)
X_volumes.head()

In [None]:
# Marshal preprocessing: turn the textual labels into integer grades.
marshal_grade_by_label = {
    "I diffuse injury (no visible pathology)": 1,
    "II diffuse injury (midline shift <5mm, basal cisterns visible,no high or mixed density lesion > 25 cm3)": 2,
    "III diffuse injury (swelling, midline shift of 0 to 5 mm, basal cisterns compressed or completely effaced, no high or mixed density lesions >25 cm3)": 3,
    "IV diffuse injury (midline shift >5 mm, no high or mixed density lesions >25 cm3)": 4,
    # Same label as above, spelled without the space after "IV".
    "IVdiffuse injury (midline shift >5 mm, no high or mixed density lesions >25 cm3)": 4,
    "V evacuated mass lesion (any lesion evacuated surgically)": 5,
    "VI non-evacuated mass lesion (high or mixed density lesions >25 cm3, not surgically evacuated)": 6,
}
# Grades that were stored as the digit strings "0".."6".
marshal_grade_by_label.update({str(grade): grade for grade in range(7)})

marshal_grades = X_volumes['Marshal'].replace(marshal_grade_by_label)

# Missing values and grade 0 are both replaced by grade 1.
X_volumes['Marshal'] = marshal_grades.apply(
    lambda grade: grade if not (pd.isna(grade) or grade == 0) else 1
)

print(X_volumes['Marshal'].value_counts())

In [None]:
# TIL columns: positions 66..70 of the cleaned table.
TIL = pd.read_csv(
    DATA_DIRECTORY + "cleaned_data_full_bis.csv",
    skiprows=1,
    usecols=range(66, 71),
)
TIL.head()

In [None]:
# y is 1 when any of the TIL columns at positions 2, 3 or 4 equals 1, else 0.
til_positive = TIL.iloc[:, 2:5].eq(1).any(axis=1)
y = pd.DataFrame({"y": til_positive.astype(int)})

# Quick look at the first rows of y.
print(y.head())

# Number of outcome events (rows equal to 1).
event_count = y.eq(1.00).sum()
print(f"outcome events : {event_count}")

In [None]:
# Build y from the TIL columns and propagate missing values.
# y is 1 when any of the TIL columns at positions 2, 3 or 4 equals 1, else 0.
til_positive = TIL.iloc[:, 2:5].eq(1).any(axis=1)

# Rows with a missing value in any of the five TIL columns get no label.
til_has_missing = TIL.iloc[:, :5].isnull().any(axis=1)

y = pd.DataFrame(index=TIL.index)  # Keep the same indexing as TIL
# Build the column as float from the start so that NaN fits without an
# implicit, pandas-version-dependent dtype change (mask() fills with NaN).
y["y"] = til_positive.astype(float).mask(til_has_missing)

# Verify the first few rows of y
print(y.head())

In [None]:
# Row labels for which y contains a missing value.
label_missing = y.isna().any(axis=1)
nan_and_nd_indexes = y.index[label_missing]
print(nan_and_nd_indexes)

# Remove those patients from both the features and the labels.
X_volumes = X_volumes.drop(nan_and_nd_indexes)
y = y.drop(nan_and_nd_indexes)

In [None]:
# Replace every remaining missing feature value with 0 (after the rows with a
# missing label have been dropped from X_volumes).
X_volumes_imputed = X_volumes.fillna(0)

In [None]:
# Convert y to a 1D integer array. The explicit cast guarantees a numeric 0/1
# target even if the column became float or object when the missing labels
# were inserted, which keeps the stratified splitters and scorers working.
y = y.to_numpy().ravel().astype(int)

In [None]:
# Sanity check on the label container before it is handed to cross_validate.
print(type(y), y.shape)  # Type should be numpy.ndarray and shape should be (n_samples,)

In [None]:
# SMOTE over-sampling -> random under-sampling -> gradient boosting.
# Every stochastic step is seeded so that a re-run reproduces the same scores.
pipeline_smote_under = Pipeline(steps=[
    ('over', SMOTE(random_state=42)),
    ('under', RandomUnderSampler(sampling_strategy=0.5, random_state=42)),
    ('model', HistGradientBoostingClassifier(random_state=42)),
])

# The features must line up row-for-row with the labels.
assert len(X_volumes_imputed) == len(y), "X_volumes_imputed and y have different numbers of rows"

# Inner loop: hyperparameter selection on F2.
inner_cv = RepeatedStratifiedKFold(n_splits=FOLDS, n_repeats=5, random_state=1)

p_grid = {
    "model__learning_rate": [0.01, 0.05, 0.08, 0.1, 0.2, 0.3, 0.5, 1],
    "over__sampling_strategy": [0.1, 0.2, 0.3],
    "over__k_neighbors": [3, 5, 8],
    "under__sampling_strategy": [0.3, 0.5, 0.7],
}
clf = GridSearchCV(estimator=pipeline_smote_under, param_grid=p_grid, scoring={'F2': ftwo_scorer}, refit='F2', cv=inner_cv)

# Outer loop: performance estimate of the tuned model.
outer_cv = RepeatedStratifiedKFold(n_splits=FOLDS, n_repeats=10, random_state=42)

nested_scores_smote_undersampling = cross_validate(
    clf, X_volumes_imputed, y,
    scoring={'F2': ftwo_scorer, 'ROC_AUC': 'roc_auc', 'Recall': 'recall_macro', 'F1': 'f1',
             'Brier': "neg_brier_score", 'False_neg_scorer': false_neg_scorer, 'False_pos_scorer': false_pos_scorer},
    cv=outer_cv, n_jobs=-1,
)

print("segmentation volumes: HistGradientBoostingClassifier with hyperparameter gridsearch")

roc_auc_metric = np.mean(nested_scores_smote_undersampling["test_ROC_AUC"])
roc_auc_metric_std = np.std(nested_scores_smote_undersampling["test_ROC_AUC"])
print(f'AUC (max): {np.round(roc_auc_metric, 2)} +- {np.round(roc_auc_metric_std, 2)}')

f1_score = np.mean(nested_scores_smote_undersampling["test_F1"])
f1_score_std = np.std(nested_scores_smote_undersampling["test_F1"])
print(f'F1 Score (max): {np.round(f1_score, 2)} +- {np.round(f1_score_std, 2)}')

f2_score = np.mean(nested_scores_smote_undersampling["test_F2"])
f2_score_std = np.std(nested_scores_smote_undersampling["test_F2"])
print(f'F2 Score (max): {np.round(f2_score, 2)} +- {np.round(f2_score_std, 2)}')

# The scorer returns the negated Brier score, so only the mean is sign-flipped;
# a standard deviation is non-negative and must not be negated.
brier_score = -np.mean(nested_scores_smote_undersampling["test_Brier"])
brier_score_std = np.std(nested_scores_smote_undersampling["test_Brier"])
print(f'Brier Score (min): {np.round(brier_score, 2)} +- {np.round(brier_score_std, 2)}')

# The FN/FP scorers return counts per test fold; divide by the approximate size
# of one test fold of the data actually evaluated here to obtain a percentage.
test_fold_size = len(y) / FOLDS

false_neg_score = np.mean(nested_scores_smote_undersampling["test_False_neg_scorer"]) * 100 / test_fold_size
false_neg_score_std = np.std(nested_scores_smote_undersampling["test_False_neg_scorer"]) * 100 / test_fold_size
print(f'False negative: {int(np.round(false_neg_score, 0))}% +- {int(np.round(false_neg_score_std, 0))}')

false_pos_score = np.mean(nested_scores_smote_undersampling["test_False_pos_scorer"]) * 100 / test_fold_size
false_pos_score_std = np.std(nested_scores_smote_undersampling["test_False_pos_scorer"]) * 100 / test_fold_size
print(f'False positive: {int(np.round(false_pos_score, 0))}% +- {int(np.round(false_pos_score_std, 0))}')


### TIER

In [None]:
# Fresh copy of the feature columns (positions 71..78) for this section.
X_volumes = pd.read_csv(
    DATA_DIRECTORY + "cleaned_data_full_bis.csv",
    skiprows=1,
    usecols=range(71, 79),
)
X_volumes.head()

In [None]:
# Marshal preprocessing: turn the textual labels into integer grades.
marshal_grade_by_label = {
    "I diffuse injury (no visible pathology)": 1,
    "II diffuse injury (midline shift <5mm, basal cisterns visible,no high or mixed density lesion > 25 cm3)": 2,
    "III diffuse injury (swelling, midline shift of 0 to 5 mm, basal cisterns compressed or completely effaced, no high or mixed density lesions >25 cm3)": 3,
    "IV diffuse injury (midline shift >5 mm, no high or mixed density lesions >25 cm3)": 4,
    # Same label as above, spelled without the space after "IV".
    "IVdiffuse injury (midline shift >5 mm, no high or mixed density lesions >25 cm3)": 4,
    "V evacuated mass lesion (any lesion evacuated surgically)": 5,
    "VI non-evacuated mass lesion (high or mixed density lesions >25 cm3, not surgically evacuated)": 6,
}
# Grades that were stored as the digit strings "0".."6".
marshal_grade_by_label.update({str(grade): grade for grade in range(7)})

marshal_grades = X_volumes['Marshal'].replace(marshal_grade_by_label)

# Missing values and grade 0 are both replaced by grade 1.
X_volumes['Marshal'] = marshal_grades.apply(
    lambda grade: grade if not (pd.isna(grade) or grade == 0) else 1
)

print(X_volumes['Marshal'].value_counts())

In [None]:
# TIER columns: positions 55..64 of the cleaned table.
TIER = pd.read_csv(
    DATA_DIRECTORY + "cleaned_data_full_bis.csv",
    skiprows=1,
    usecols=range(55, 65),
)
TIER.head()

In [None]:
# Build y from the TIER columns and propagate missing values.
# y is 1 when any of the TIER columns at positions 6, 7, 8 or 9 equals 1, else 0.
tier_positive = TIER.iloc[:, 6:10].eq(1).any(axis=1)

# Rows with a missing value in any of the ten TIER columns get no label.
tier_has_missing = TIER.iloc[:, :10].isnull().any(axis=1)

y = pd.DataFrame(index=TIER.index)  # Keep the same indexing as TIER
# Build the column as float from the start so that NaN fits without an
# implicit, pandas-version-dependent dtype change (mask() fills with NaN).
y["y"] = tier_positive.astype(float).mask(tier_has_missing)

# Verify the first few rows of y
print(y.head())

In [None]:
# Row labels for which y contains a missing value.
label_missing = y.isna().any(axis=1)
nan_and_nd_indexes = y.index[label_missing]
print(nan_and_nd_indexes)

# Remove those patients from both the features and the labels.
X_volumes = X_volumes.drop(nan_and_nd_indexes)
y = y.drop(nan_and_nd_indexes)

In [None]:
# Number of outcome events (rows equal to 1).
event_count = y.eq(1.00).sum()
print(f"outcome events : {event_count}")

In [None]:
# Replace every remaining missing feature value with 0 (after the rows with a
# missing label have been dropped from X_volumes).
X_volumes_imputed = X_volumes.fillna(0)

In [None]:
# Convert y to a 1D integer array. The explicit cast guarantees a numeric 0/1
# target even if the column became float or object when the missing labels
# were inserted, which keeps the stratified splitters and scorers working.
y = y.to_numpy().ravel().astype(int)

In [None]:
# Sanity check on the label container before it is handed to cross_validate.
print(type(y), y.shape)  # Type should be numpy.ndarray and shape should be (n_samples,)

In [None]:
# SMOTE over-sampling -> random under-sampling -> gradient boosting.
# Every stochastic step is seeded so that a re-run reproduces the same scores.
pipeline_smote_under = Pipeline(steps=[
    ('over', SMOTE(random_state=42)),
    ('under', RandomUnderSampler(sampling_strategy=0.5, random_state=42)),
    ('model', HistGradientBoostingClassifier(random_state=42)),
])

# The features must line up row-for-row with the labels.
assert len(X_volumes_imputed) == len(y), "X_volumes_imputed and y have different numbers of rows"

# Inner loop: hyperparameter selection on F2.
inner_cv = RepeatedStratifiedKFold(n_splits=FOLDS, n_repeats=5, random_state=1)

p_grid = {
    "model__learning_rate": [0.01, 0.05, 0.08, 0.1, 0.2, 0.3, 0.5, 1],
    "over__sampling_strategy": [0.1, 0.2, 0.3],
    "over__k_neighbors": [3, 5, 8],
    "under__sampling_strategy": [0.3, 0.5, 0.7],
}
clf = GridSearchCV(estimator=pipeline_smote_under, param_grid=p_grid, scoring={'F2': ftwo_scorer}, refit='F2', cv=inner_cv)

# Outer loop: performance estimate of the tuned model.
outer_cv = RepeatedStratifiedKFold(n_splits=FOLDS, n_repeats=10, random_state=42)

nested_scores_smote_undersampling = cross_validate(
    clf, X_volumes_imputed, y,
    scoring={'F2': ftwo_scorer, 'ROC_AUC': 'roc_auc', 'Recall': 'recall_macro', 'F1': 'f1',
             'Brier': "neg_brier_score", 'False_neg_scorer': false_neg_scorer, 'False_pos_scorer': false_pos_scorer},
    cv=outer_cv, n_jobs=-1,
)

print("segmentation volumes: HistGradientBoostingClassifier with hyperparameter gridsearch")

roc_auc_metric = np.mean(nested_scores_smote_undersampling["test_ROC_AUC"])
roc_auc_metric_std = np.std(nested_scores_smote_undersampling["test_ROC_AUC"])
print(f'AUC (max): {np.round(roc_auc_metric, 2)} +- {np.round(roc_auc_metric_std, 2)}')

f1_score = np.mean(nested_scores_smote_undersampling["test_F1"])
f1_score_std = np.std(nested_scores_smote_undersampling["test_F1"])
print(f'F1 Score (max): {np.round(f1_score, 2)} +- {np.round(f1_score_std, 2)}')

f2_score = np.mean(nested_scores_smote_undersampling["test_F2"])
f2_score_std = np.std(nested_scores_smote_undersampling["test_F2"])
print(f'F2 Score (max): {np.round(f2_score, 2)} +- {np.round(f2_score_std, 2)}')

# The scorer returns the negated Brier score, so only the mean is sign-flipped;
# a standard deviation is non-negative and must not be negated.
brier_score = -np.mean(nested_scores_smote_undersampling["test_Brier"])
brier_score_std = np.std(nested_scores_smote_undersampling["test_Brier"])
print(f'Brier Score (min): {np.round(brier_score, 2)} +- {np.round(brier_score_std, 2)}')

# The FN/FP scorers return counts per test fold; divide by the approximate size
# of one test fold of the data actually evaluated here to obtain a percentage.
test_fold_size = len(y) / FOLDS

false_neg_score = np.mean(nested_scores_smote_undersampling["test_False_neg_scorer"]) * 100 / test_fold_size
false_neg_score_std = np.std(nested_scores_smote_undersampling["test_False_neg_scorer"]) * 100 / test_fold_size
print(f'False negative: {int(np.round(false_neg_score, 0))}% +- {int(np.round(false_neg_score_std, 0))}')

false_pos_score = np.mean(nested_scores_smote_undersampling["test_False_pos_scorer"]) * 100 / test_fold_size
false_pos_score_std = np.std(nested_scores_smote_undersampling["test_False_pos_scorer"]) * 100 / test_fold_size
print(f'False positive: {int(np.round(false_pos_score, 0))}% +- {int(np.round(false_pos_score_std, 0))}')