### Libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.impute import SimpleImputer
from sklearn.manifold import TSNE
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate, RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE, SMOTENC
from imblearn.under_sampling import RandomUnderSampler

%matplotlib inline

### Utility

In [None]:
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.metrics import confusion_matrix

# Scorer wrapping fbeta_score with beta=2; used as the model-selection metric ('F2')
# in the grid searches and as one of the reported cross-validation metrics.
ftwo_scorer = make_scorer(fbeta_score, beta=2)

def confusion_matrix_scorer(clf, X, y):
    """Return the four binary confusion-matrix counts of ``clf`` on ``(X, y)``.

    Parameters
    ----------
    clf : fitted estimator exposing ``predict``.
    X : samples forwarded to ``clf.predict``.
    y : true labels, coded 0 (negative) / 1 (positive).

    Returns
    -------
    dict
        ``{'tn': ..., 'fp': ..., 'fn': ..., 'tp': ...}``.
    """
    y_pred = clf.predict(X)
    # Pin the label order: without ``labels`` the matrix shrinks to 1x1 when a single
    # class occurs in both y and y_pred, and the [1, .] lookups below raise IndexError.
    cm = confusion_matrix(y, y_pred, labels=[0, 1])

    return {'tn': cm[0, 0], 'fp': cm[0, 1],
            'fn': cm[1, 0], 'tp': cm[1, 1]}

def false_neg_scorer(clf, X, y):
    """Count the false negatives of ``clf`` on ``(X, y)``.

    Usable as a scikit-learn scoring callable with signature ``(estimator, X, y)``.

    Parameters
    ----------
    clf : fitted estimator exposing ``predict``.
    X : samples forwarded to ``clf.predict``.
    y : true labels, coded 0 (negative) / 1 (positive).

    Returns
    -------
    Number of samples whose true label is 1 and predicted label is 0.
    """
    y_pred = clf.predict(X)
    # Fixed 2x2 layout even when a class is absent from this subset; otherwise
    # cm[1, 0] raises IndexError on a 1x1 matrix.
    cm = confusion_matrix(y, y_pred, labels=[0, 1])

    return cm[1, 0]

def false_pos_scorer(clf, X, y):
    """Count the false positives of ``clf`` on ``(X, y)``.

    Usable as a scikit-learn scoring callable with signature ``(estimator, X, y)``.

    Parameters
    ----------
    clf : fitted estimator exposing ``predict``.
    X : samples forwarded to ``clf.predict``.
    y : true labels, coded 0 (negative) / 1 (positive).

    Returns
    -------
    Number of samples whose true label is 0 and predicted label is 1.
    """
    y_pred = clf.predict(X)
    # Fixed 2x2 layout even when a class is absent from this subset; otherwise
    # cm[0, 1] raises IndexError on a 1x1 matrix.
    cm = confusion_matrix(y, y_pred, labels=[0, 1])

    return cm[0, 1]

### Loading and preparing the dataset

In [None]:
# Folder holding the input CSV. The trailing slash matters: file paths are built
# by plain string concatenation (DATA_DIRECTORY + "<file name>").
DATA_DIRECTORY = "data/"

In [None]:
# Load the cohort table. skiprows=1 discards the first physical line of the file, so the
# column names are taken from the second line.
# Alternative input used previously: DATA_DIRECTORY+"clinical_data_anonymized.csv" (same skiprows).
cleaned_data_path = DATA_DIRECTORY + "cleaned_data_full.csv"
cleaned_data_full = pd.read_csv(cleaned_data_path, skiprows=1)
cleaned_data_full.head()

In [None]:
# Keep only the rows whose 'Exclusion' cell is empty, i.e. drop every row that carries
# an exclusion value. Row labels are preserved (the index is not reset).
not_excluded = cleaned_data_full['Exclusion'].isna()
cleaned_data_full = cleaned_data_full.loc[not_excluded]

In [None]:
# Select the 20 columns at zero-based positions 16 through 35 (the slice end, 36, is exclusive).
X_prehosp = cleaned_data_full.iloc[:, 16:36]
X_prehosp.head()

In [None]:
# Coerce every selected column to a numeric dtype; cells that cannot be parsed become NaN.
X_prehosp_numeric = X_prehosp.apply(pd.to_numeric, errors="coerce")

# Per-column diagnostics: missing-value count and observed range.
na_counts = X_prehosp_numeric.isna().sum()
min_values = X_prehosp_numeric.min()
max_values = X_prehosp_numeric.max()

# Distinct values per column, handy for spotting implausible entries by eye.
unique_values = {}
for col in X_prehosp_numeric.columns:
    unique_values[col] = X_prehosp_numeric[col].unique()

# One row per variable, combining the diagnostics computed above.
summary = pd.DataFrame({
    "Variable": X_prehosp_numeric.columns,
    "Missing Values": na_counts,
    "Unique Values": [list(values) for values in unique_values.values()],
    "Min Value": min_values,
    "Max Value": max_values
})

print(summary)

In [None]:
# Print the exact column labels. The dictionaries in the following cells are keyed by
# column name and use keys with trailing or doubled spaces (e.g. "GCS SMUR ", "PAS  SMUR "),
# so they have to match these labels character for character.
print(X_prehosp_numeric.columns.tolist())

In [None]:
# Upper bound for each column that is capped; larger values are replaced by the bound.
capping_limits = {
    "Shock Index SMUR": 3,
    "GCS SMUR ": 15,
    "GCS (M) SMUR ": 6,
    "Shock Index inversé": 3,
    "Shock index diastolique": 3,
    "Amputation": 1,
    "ACR SMUR": 1,
    "Hémorragie ext SMUR": 1,
    "Ischémie": 1,
    "Intubation prehosp": 1,
    "OsmoTH prehosp": 1,
    "Vasopresseur prehosp": 1
}

# Apply the caps. Only an upper bound is enforced (no lower clipping); names that are
# absent from the frame are reported and skipped.
for column, max_value in capping_limits.items():
    if column not in X_prehosp_numeric.columns:
        print(f"Warning: Column '{column}' not found in DataFrame.")
        continue
    capped = X_prehosp_numeric[column].clip(upper=max_value)
    X_prehosp_numeric[column] = capped

In [None]:
# Imputation rule per column: "median" fills gaps with the column median, any other
# value is used directly as the constant fill value (here 0).
imputation_strategies = {
    "PAS  SMUR ": "median",
    "PAD  SMUR ": "median",
    "FC SMUR ": "median",
    "Shock Index SMUR": "median",
    "GCS SMUR ": "median",
    "GCS (M) SMUR ": "median",
    "Shock Index inversé": "median",
    "Shock index diastolique": "median",
    "Anomalie pupille SMUR": 0,
    "Fracas bassin": 0,
    "Amputation": 0,
    "ACR SMUR": 0,
    "Hémorragie ext SMUR": 0,
    "Ischémie": 0,
    "Intubation prehosp": 0,
    "Expansion volémique": "median",
    "OsmoTH prehosp": 0,
    "Vasopresseur prehosp": 0
}

# Apply the imputation column by column; unknown column names are reported and skipped.
# NOTE(review): the medians are computed over all rows, before any cross-validation split,
# so held-out folds contribute to the imputed values — confirm this is acceptable.
for column, strategy in imputation_strategies.items():
    if column not in X_prehosp_numeric.columns:
        print(f"Warning: Column '{column}' not found in DataFrame.")
        continue
    fill_value = X_prehosp_numeric[column].median() if strategy == "median" else strategy
    X_prehosp_numeric[column] = X_prehosp_numeric[column].fillna(fill_value)

# Signal that the imputation pass has finished.
print("Imputation terminée.")


In [None]:
# Recount the missing values per column after imputation; columns that were not listed
# in imputation_strategies can still show a non-zero count here.
na_counts = X_prehosp_numeric.isna().sum()
print(na_counts)

In [None]:
# Columns to remove from the feature table.
columns_to_drop = ["FR SMUR", "Hémocue SMUR "]

# Drop only the ones actually present, so the cell does not fail on a missing name.
columns_existing = []
for col in columns_to_drop:
    if col in X_prehosp_numeric.columns:
        columns_existing.append(col)
X_prehosp_numeric = X_prehosp_numeric.drop(columns=columns_existing)

### TILSUM

In [None]:
# Load the five columns at file positions 71-75 that the outcome is derived from.
# skiprows=1 mirrors the way cleaned_data_full is read from the same file: the outcome is
# later joined to the features on the default row index, so both reads must skip the same
# leading line — otherwise every outcome is shifted by one row relative to its features
# and the real header line is parsed as data.
TIL = pd.read_csv(DATA_DIRECTORY+"cleaned_data_full.csv", usecols=range(71,76), skiprows=1)
TIL.head()

In [None]:
# Binary outcome: 1 when at least one of the TIL columns at positions 2, 3 or 4 equals 1,
# else 0. Missing cells compare unequal to 1 and therefore count as 0 in this version.
til_event = (TIL.iloc[:, [2, 3, 4]] == 1).any(axis=1)
y = pd.DataFrame({"y": til_event.astype(int)})

# Quick look at the first rows of the outcome.
print(y.head())

# Number of outcome events (rows where y equals 1).
event_count = y.eq(1.00).sum()
print(f"outcome events : {event_count}")

In [None]:
# Build the outcome with missing values propagated, keeping TIL's row index.
til_levels = TIL.iloc[:, [0, 1, 2, 3, 4]]

# Event = at least one of the columns at positions 2, 3, 4 equals 1 (vectorised; a
# row-wise apply is far slower and returns an empty frame instead of a Series on empty input).
has_event = (til_levels.iloc[:, [2, 3, 4]] == 1).any(axis=1)

# A row is unusable as soon as any of the five columns is missing.
incomplete = til_levels.isnull().any(axis=1)

# Store the outcome as float so NaN can mark unusable rows directly. Writing pd.NA into an
# integer column afterwards relies on a silent dtype upcast that pandas has deprecated.
y = pd.DataFrame(index=TIL.index)
y["y"] = has_event.astype(float).mask(incomplete)

# Verify the first few rows of y
print(y.head())

In [None]:
# Restrict features and outcome to the row labels they have in common.
X_prehosp_numeric, y = X_prehosp_numeric.align(y, join="inner", axis=0)

# Row labels whose outcome is missing.
rows_without_outcome = y.isna().any(axis=1)
nan_and_nd_indexes = y.index[rows_without_outcome.to_numpy()]

# Remove those rows from both tables so they stay in step.
y = y.drop(index=nan_and_nd_indexes)
X_prehosp_numeric = X_prehosp_numeric.drop(index=nan_and_nd_indexes)

# Sanity check: features and outcome must have the same number of rows.
assert len(X_prehosp_numeric) == len(y), "Number of rows in X and y do not match!"

print(f"Number of rows after cleaning: {X_prehosp_numeric.shape[0]}")


In [None]:
# Flatten the single-column outcome frame into a plain 1-D NumPy array for scikit-learn.
# After this cell y no longer carries row labels; it lines up with X_prehosp_numeric by position.
y = y.to_numpy().ravel()  # Convert y to a 1D array

In [None]:
# Sanity check on the conversion done in the previous cell.
print(type(y), y.shape)  # Type should be numpy.ndarray and shape should be (n_samples,)

In [None]:
# Cross-validation settings.
# FOLDS: number of splits of the inner stratified k-fold; also the divisor used below to
# approximate the test-fold size when converting error counts into percentages.
FOLDS = 5
# NOTE(review): N_REPEATS is not referenced by the CV objects in the cells below, which
# hard-code n_repeats=5 (inner) and n_repeats=10 (outer) — confirm which value is intended.
N_REPEATS = 3
# Total number of samples with a known outcome.
nb_total_samples = len(y)

In [None]:
# Nested cross-validation: SMOTE over-sampling -> random under-sampling -> gradient boosting.
# The inner loop (GridSearchCV) tunes the hyper-parameters on F2; the outer loop
# (cross_validate) estimates the performance of the whole tuning procedure.

# Seed the stochastic steps so that a fresh "Restart & Run All" reproduces the scores.
RANDOM_STATE = 42

pipeline_smote_under = Pipeline(steps=[
    ('over', SMOTE(random_state=RANDOM_STATE)),
    ('under', RandomUnderSampler(sampling_strategy=0.5, random_state=RANDOM_STATE)),
    ('model', HistGradientBoostingClassifier(random_state=RANDOM_STATE)),
])
#pipeline_smote_under = Pipeline(steps=[('over', SMOTENC(categorical_features=["fracas_du_bassin", "amputation"])), ('under', RandomUnderSampler(sampling_strategy=0.5)), ('model', HistGradientBoostingClassifier())])

inner_cv = RepeatedStratifiedKFold(n_splits=FOLDS, n_repeats=5, random_state=1)

p_grid = {
    "model__learning_rate": [0.01, 0.05, 0.08, 0.1, 0.2, 0.3, 0.5, 1],
    "over__sampling_strategy": [0.1, 0.2, 0.3],
    "over__k_neighbors": [3, 5, 8],
    "under__sampling_strategy": [0.3, 0.5, 0.7],
}
clf = GridSearchCV(estimator=pipeline_smote_under, param_grid=p_grid,
                   scoring={'F2': ftwo_scorer}, refit='F2', cv=inner_cv)

# The outer split count is tied to FOLDS because the percentages below assume that a test
# fold holds nb_total_samples / FOLDS samples.
outer_cv = RepeatedStratifiedKFold(n_splits=FOLDS, n_repeats=10, random_state=42)

nested_scores_smote_undersampling = cross_validate(
    clf, X_prehosp_numeric, y,
    scoring={'F2': ftwo_scorer, 'ROC_AUC': 'roc_auc', 'Recall': 'recall_macro', 'F1': 'f1',
             'Brier': "neg_brier_score", 'False_neg_scorer': false_neg_scorer,
             'False_pos_scorer': false_pos_scorer},
    cv=outer_cv, n_jobs=-1, return_estimator=True, return_indices=True)

# NOTE(review): the banner mentions "segmentation volumes" although the features used here
# are the pre-hospital columns — confirm the label before publishing the output.
print("segmentation volumes: HistGradientBoostingClassifier with hyperparameter gridsearch")

roc_auc_metric = np.mean(nested_scores_smote_undersampling["test_ROC_AUC"])
roc_auc_metric_std = np.std(nested_scores_smote_undersampling["test_ROC_AUC"])
print(f'AUC (max): {np.round(roc_auc_metric, 2)} +- {np.round(roc_auc_metric_std, 2)}')

f1_score = np.mean(nested_scores_smote_undersampling["test_F1"])
f1_score_std = np.std(nested_scores_smote_undersampling["test_F1"])
print(f'F1 Score (max): {np.round(f1_score, 2)} +- {np.round(f1_score_std, 2)}')

f2_score = np.mean(nested_scores_smote_undersampling["test_F2"])
f2_score_std = np.std(nested_scores_smote_undersampling["test_F2"])
print(f'F2 Score (max): {np.round(f2_score, 2)} +- {np.round(f2_score_std, 2)}')

# The scorer returns the *negated* Brier score, so the mean is flipped back to a positive
# loss. A standard deviation is sign-invariant and must not be negated.
brier_score = -np.mean(nested_scores_smote_undersampling["test_Brier"])
brier_score_std = np.std(nested_scores_smote_undersampling["test_Brier"])
print(f'Brier Score (min): {np.round(brier_score, 2)} +- {np.round(brier_score_std, 2)}')

# The count scorers return the number of errors in a test fold; dividing by the approximate
# test-fold size (nb_total_samples / FOLDS) and multiplying by 100 gives a percentage.
false_neg_score = np.mean(nested_scores_smote_undersampling["test_False_neg_scorer"])*100/(nb_total_samples/FOLDS)
false_neg_score_std = np.std(nested_scores_smote_undersampling["test_False_neg_scorer"])*100/(nb_total_samples/FOLDS)
print(f'False negative: {int(np.round(false_neg_score, 0))}% +- {int(np.round(false_neg_score_std, 0))}')

false_pos_score = np.mean(nested_scores_smote_undersampling["test_False_pos_scorer"])*100/(nb_total_samples/FOLDS)
false_pos_score_std = np.std(nested_scores_smote_undersampling["test_False_pos_scorer"])*100/(nb_total_samples/FOLDS)
print(f'False positive: {int(np.round(false_pos_score, 0))}% +- {int(np.round(false_pos_score_std, 0))}')


In [None]:
# Which outer cross-validation split to inspect in the cells below (0 = first split).
fold = 0

In [None]:
# Display the feature table that was passed to cross_validate (note its row labels).
X_prehosp_numeric

In [None]:
# Training-row indices that cross_validate recorded for the selected split.
nested_scores_smote_undersampling["indices"]["train"][fold]

In [None]:
# Row labels of X_prehosp_numeric that also occur among the recorded train indices of the
# selected split, followed by a display of the rows carrying those labels.
# NOTE(review): the indices recorded by cross_validate look positional (0..n-1), whereas
# X_prehosp_numeric still carries the row labels left over after the earlier filtering and
# drop steps. If any row was removed, intersecting labels with positions does not select
# the split's training rows — confirm, and consider X_prehosp_numeric.iloc[...] instead.
good_train_indices = X_prehosp_numeric.index.intersection(nested_scores_smote_undersampling["indices"]["train"][fold])

X_prehosp_numeric.loc[good_train_indices]

In [None]:
# Rebuild the train/test targets of the selected split and refit that split's estimator.
# The indices recorded by cross_validate are positions into the arrays it was given, so
# both X and y are addressed positionally (.iloc / array indexing). Selecting X by label
# while indexing y by position pairs features with the wrong outcomes as soon as the
# DataFrame index has gaps.
train_positions = nested_scores_smote_undersampling["indices"]["train"][fold]
test_positions = nested_scores_smote_undersampling["indices"]["test"][fold]

y_true_train = np.asarray(y)[train_positions]
y_true_test = np.asarray(y)[test_positions]

# Use the estimator belonging to the selected split rather than a hard-coded index.
# Refitting reruns the whole grid search on this split's training rows, which is slow.
fitted = nested_scores_smote_undersampling["estimator"][fold].fit(
    X_prehosp_numeric.iloc[train_positions], y_true_train
)


In [None]:
import time

# Measure single-sample prediction latency: every test row of the selected split is
# predicted on its own, 100 times over.
test_positions = nested_scores_smote_undersampling["indices"]["test"][fold]

times = []
for _ in range(100):
    for i in test_positions:
        # The recorded indices are positional, so use .iloc. Label-based .loc raised KeyError
        # for positions absent from the index, which the former bare `except: pass` hid.
        single_row = X_prehosp_numeric.iloc[[i]]
        # perf_counter is the monotonic high-resolution clock meant for short intervals;
        # only the predict call is timed, not the DataFrame slicing.
        start_time = time.perf_counter()
        new_pred_test = fitted.predict(single_row)
        end_time = time.perf_counter()
        times.append(end_time - start_time)

print(f"average time for predictions: ")
print(f"{np.mean(times)} seconds")
print(np.std(times))

### TIER

In [None]:
# Load the ten columns at file positions 60-69 into TIER. skiprows=1 matches the read of
# cleaned_data_full, so the default row index lines up with the feature table.
TIER = pd.read_csv(DATA_DIRECTORY+"cleaned_data_full.csv", usecols=range(60,70), skiprows=1)
TIER.head()

In [None]:
# Build the outcome with missing values propagated, keeping TIER's row index.
tier_levels = TIER.iloc[:, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]]

# Event = at least one of the columns at positions 6, 7, 8, 9 equals 1 (vectorised; a
# row-wise apply is far slower and returns an empty frame instead of a Series on empty input).
has_event = (tier_levels.iloc[:, [6, 7, 8, 9]] == 1).any(axis=1)

# A row is unusable as soon as any of the ten TIER columns is missing.
incomplete = tier_levels.isnull().any(axis=1)

# Store the outcome as float so NaN can mark unusable rows directly. Writing pd.NA into an
# integer column afterwards relies on a silent dtype upcast that pandas has deprecated.
y = pd.DataFrame(index=TIER.index)
y["y"] = has_event.astype(float).mask(incomplete)

# Verify the first few rows of y
print(y.head())

In [None]:
# Number of outcome events: per-column count of cells equal to 1 (missing cells do not match).
event_count = y.eq(1.00).sum()
print(f"outcome events : {event_count}")

In [None]:
# Restrict features and outcome to the row labels they have in common.
# NOTE(review): X_prehosp_numeric was already reduced by the earlier align/drop cell to the
# rows with a known TIL outcome, so this cohort is further limited to those rows —
# confirm that this is intended.
X_prehosp_numeric, y = X_prehosp_numeric.align(y, join="inner", axis=0)

# Row labels whose outcome is missing.
rows_without_outcome = y.isna().any(axis=1)
nan_and_nd_indexes = y.index[rows_without_outcome.to_numpy()]

# Remove those rows from both tables so they stay in step.
y = y.drop(index=nan_and_nd_indexes)
X_prehosp_numeric = X_prehosp_numeric.drop(index=nan_and_nd_indexes)

# Sanity check: features and outcome must have the same number of rows.
assert len(X_prehosp_numeric) == len(y), "Number of rows in X and y do not match!"

print(f"Number of rows after cleaning: {X_prehosp_numeric.shape[0]}")

# Flatten the single-column outcome frame into a 1-D array for scikit-learn.
y = np.ravel(y.to_numpy())

print(type(y), y.shape)  # Type should be numpy.ndarray and shape should be (n_samples,)

# Cross-validation settings, redefined for this section.
FOLDS = 5
N_REPEATS = 3
nb_total_samples = y.shape[0]

In [None]:
# Nested cross-validation: SMOTE over-sampling -> random under-sampling -> gradient boosting.
# The inner loop (GridSearchCV) tunes the hyper-parameters on F2; the outer loop
# (cross_validate) estimates the performance of the whole tuning procedure.

# Seed the stochastic steps so that a fresh "Restart & Run All" reproduces the scores.
RANDOM_STATE = 42

pipeline_smote_under = Pipeline(steps=[
    ('over', SMOTE(random_state=RANDOM_STATE)),
    ('under', RandomUnderSampler(sampling_strategy=0.5, random_state=RANDOM_STATE)),
    ('model', HistGradientBoostingClassifier(random_state=RANDOM_STATE)),
])
#pipeline_smote_under = Pipeline(steps=[('over', SMOTENC(categorical_features=["fracas_du_bassin", "amputation"])), ('under', RandomUnderSampler(sampling_strategy=0.5)), ('model', HistGradientBoostingClassifier())])

inner_cv = RepeatedStratifiedKFold(n_splits=FOLDS, n_repeats=5, random_state=1)

p_grid = {
    "model__learning_rate": [0.01, 0.05, 0.08, 0.1, 0.2, 0.3, 0.5, 1],
    "over__sampling_strategy": [0.1, 0.2, 0.3],
    "over__k_neighbors": [3, 5, 8],
    "under__sampling_strategy": [0.3, 0.5, 0.7],
}
clf = GridSearchCV(estimator=pipeline_smote_under, param_grid=p_grid,
                   scoring={'F2': ftwo_scorer}, refit='F2', cv=inner_cv)

# The outer split count is tied to FOLDS because the percentages below assume that a test
# fold holds nb_total_samples / FOLDS samples.
outer_cv = RepeatedStratifiedKFold(n_splits=FOLDS, n_repeats=10, random_state=42)

nested_scores_smote_undersampling = cross_validate(
    clf, X_prehosp_numeric, y,
    scoring={'F2': ftwo_scorer, 'ROC_AUC': 'roc_auc', 'Recall': 'recall_macro', 'F1': 'f1',
             'Brier': "neg_brier_score", 'False_neg_scorer': false_neg_scorer,
             'False_pos_scorer': false_pos_scorer},
    cv=outer_cv, n_jobs=-1)

# NOTE(review): the banner mentions "segmentation volumes" although the features used here
# are the pre-hospital columns — confirm the label before publishing the output.
print("segmentation volumes: HistGradientBoostingClassifier with hyperparameter gridsearch")

roc_auc_metric = np.mean(nested_scores_smote_undersampling["test_ROC_AUC"])
roc_auc_metric_std = np.std(nested_scores_smote_undersampling["test_ROC_AUC"])
print(f'AUC (max): {np.round(roc_auc_metric, 2)} +- {np.round(roc_auc_metric_std, 2)}')

f1_score = np.mean(nested_scores_smote_undersampling["test_F1"])
f1_score_std = np.std(nested_scores_smote_undersampling["test_F1"])
print(f'F1 Score (max): {np.round(f1_score, 2)} +- {np.round(f1_score_std, 2)}')

f2_score = np.mean(nested_scores_smote_undersampling["test_F2"])
f2_score_std = np.std(nested_scores_smote_undersampling["test_F2"])
print(f'F2 Score (max): {np.round(f2_score, 2)} +- {np.round(f2_score_std, 2)}')

# The scorer returns the *negated* Brier score, so the mean is flipped back to a positive
# loss. A standard deviation is sign-invariant and must not be negated.
brier_score = -np.mean(nested_scores_smote_undersampling["test_Brier"])
brier_score_std = np.std(nested_scores_smote_undersampling["test_Brier"])
print(f'Brier Score (min): {np.round(brier_score, 2)} +- {np.round(brier_score_std, 2)}')

# The count scorers return the number of errors in a test fold; dividing by the approximate
# test-fold size (nb_total_samples / FOLDS) and multiplying by 100 gives a percentage.
false_neg_score = np.mean(nested_scores_smote_undersampling["test_False_neg_scorer"])*100/(nb_total_samples/FOLDS)
false_neg_score_std = np.std(nested_scores_smote_undersampling["test_False_neg_scorer"])*100/(nb_total_samples/FOLDS)
print(f'False negative: {int(np.round(false_neg_score, 0))}% +- {int(np.round(false_neg_score_std, 0))}')

false_pos_score = np.mean(nested_scores_smote_undersampling["test_False_pos_scorer"])*100/(nb_total_samples/FOLDS)
false_pos_score_std = np.std(nested_scores_smote_undersampling["test_False_pos_scorer"])*100/(nb_total_samples/FOLDS)
print(f'False positive: {int(np.round(false_pos_score, 0))}% +- {int(np.round(false_pos_score_std, 0))}')