In [None]:
import polars as pl
import pandas as pd
import json
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.calibration import calibration_curve
from statsmodels.stats.outliers_influence import variance_inflation_factor
import matplotlib.pyplot as plt
import joblib

In [None]:
# Run configuration shared by the project: which dataset / pipeline version
# to use and where the data folder lives.
with open('../params.json', 'r') as file:
    params = json.load(file)

DATASET = params['dataset']
VERSION = params['version']
DATA_FOLD = params['data_folder']

In [None]:
# Cleaned static (per-encounter) table and temporal feature table, both read
# as polars DataFrames from the `2.clean_data` stage of the configured run.
static_path = f'{DATA_FOLD}/{VERSION}/2.clean_data/{DATASET}/static/clean_static_encounters.parquet'
temporal_path = f'{DATA_FOLD}/{VERSION}/2.clean_data/{DATASET}/temporal/all_features_with_delta.parquet'

static = pl.read_parquet(static_path)
temporal = pl.read_parquet(temporal_path)

In [None]:
# Peek at the encounter identifier column of the static table.
static.get_column('encounterNumber')

In [None]:
# Modelling table (pandas) read from `minimal_grouped_24h.parquet`.
grouped_24h_path = f'{DATA_FOLD}/{VERSION}/2.clean_data/{DATASET}/static/minimal_grouped_24h.parquet'
df = pd.read_parquet(grouped_24h_path)

In [None]:
# List the columns available in the table loaded from minimal_grouped_24h.parquet.
df.columns

In [None]:
# ----- PREPROCESSING -----
# Builds `df_prepared` from `df`: drops unused columns, binarises the
# transfusion columns, ordinal-encodes the categorical columns and
# standardises the numerical ones. `df` itself is NOT modified, so this cell
# can be re-run safely.
#
# NOTE(review): the encoder and scaler below are fitted on every row, i.e.
# before the train/validation/test split performed later in the notebook, so
# split statistics leak into training. `scaler` is also re-bound in the
# modelling cell, so the scaler fitted here is not the one saved to disk.
scaler = StandardScaler()
encoder = OrdinalEncoder()

categorical_data = [
    'mode_vent',
    'ecmo_type',
    'neuro_status',
    'gender',
    'admission_type',
]

# Each column appears exactly once ('poids_suivi_mean' used to be listed twice).
numerical_data = [
    'age', 'poids_suivi_mean', 'spo2_mean', 'temp_mean', 'pas_mean', 'pad_mean', 'pam_mean', 'fr_mean', 'heart_rate_mean',
    'dobu_dose_poids_mean', 'nad_dose_poids_mean', 'glyc_cap_mean', 'tp_mean', 'creat_mean', 'bili_tot_mean', 'num_plq_mean',
    'leucocytes_mean', 'lactate_mean', 'hemoglobine_mean',
    'fio2_corr_mean',
    'urine_rate_mean', 'spo2_std', 'temp_std', 'pas_std', 'pad_std', 'pam_std', 'fr_std', 'heart_rate_std',
    'dobu_dose_poids_std', 'nad_dose_poids_std', 'glyc_cap_std',
    'tp_std', 'creat_std', 'bili_tot_std', 'num_plq_std', 'leucocytes_std', 'lactate_std', 'hemoglobine_std',
    'fio2_corr_std', 'urine_rate_std', 'poids_suivi_std',
    'iv_input',
]
assert len(numerical_data) == len(set(numerical_data)), "duplicate column in numerical_data"

# Columns turned into 0/1 flags, with the value each must exceed to be 1.
binary_thresholds = {'cgr': 3, 'plq': 0, 'pfc': 0}

drop = ['is_ventilated_std', 'is_prone_std']

df_prepared = df.drop(columns=drop)

# Threshold the RAW values exactly once. Previously `df` was first overwritten
# with booleans and then thresholded again, so `cgr > 3` was evaluated on
# True/False and the `cgr` flag always came out as 0.
for col, threshold in binary_thresholds.items():
    df_prepared[col] = (df[col] > threshold).astype(int)

df_prepared['inf_j30'] = df_prepared['inf_j30'].astype(int)

# Both transformers work column by column, so one fit over all columns gives
# the same values as refitting per column, and keeps the fitted state
# (categories / means / variances) for every column.
df_prepared[categorical_data] = encoder.fit_transform(df_prepared[categorical_data])
df_prepared[numerical_data] = scaler.fit_transform(df_prepared[numerical_data])

In [None]:
# Inspect the column dtypes after encoding and scaling.
df_prepared.dtypes

In [None]:
# Class counts of the `inf_j30` column (used as the prediction target below).
df_prepared['inf_j30'].value_counts()

In [None]:
# Skip the first two columns by position and cast everything else to float.
# NOTE(review): presumably the two leading columns are encounter identifiers
# (hence the variable name) — confirm against the parquet schema.
without_leading_cols = df_prepared.iloc[:, 2:]
no_encounter = without_leading_cols.astype(float)

In [None]:
# Preview the first rows only. `no_encounter` was already cast to float when
# it was created, so casting (and copying) the whole frame again is unnecessary.
no_encounter.head()

In [None]:

# ----- FEATURES / TARGET -----
target_col = 'inf_j30'
# Predictors: everything except the target itself and `los`.
X = no_encounter.drop(columns=[target_col, 'los'])
# `no_encounter` was cast to float wholesale; turn the labels back into
# integers so the classifiers see integer classes instead of 0.0 / 1.0.
y = no_encounter[target_col].astype(int)

# ----- VIF REDUCTION -----
# Drop, in a single pass, every feature whose variance inflation factor is at
# or above the threshold.
# NOTE(review): the VIFs are computed on all rows, before the train/test split.
vif_threshold = 10

X = X.astype(float)  # Ensure all columns are float for VIF compatibility

# variance_inflation_factor regresses column i on the other columns of the
# matrix it receives and does not add an intercept itself. Without a constant
# column the R² is uncentred, which inflates the VIF of any non-centred
# feature (ordinal codes, 0/1 flags). Prepend a constant and skip it (index 0)
# when collecting the per-feature values.
design = np.column_stack([np.ones(len(X)), X.to_numpy()])
vif_values = [variance_inflation_factor(design, i) for i in range(1, design.shape[1])]

vif_data = pd.DataFrame({"feature": X.columns, "VIF": vif_values})
features_to_keep = vif_data.loc[vif_data["VIF"] < vif_threshold, "feature"]

if features_to_keep.empty:
    # Typically caused by NaNs in X (every VIF becomes NaN) or by a threshold
    # that is too strict; fail here rather than deep inside model fitting.
    raise ValueError(f"No feature has VIF < {vif_threshold}; check X for NaNs or relax the threshold.")

X = X[features_to_keep]

# ----- SPLIT DATA -----
# Two stratified splits: first hold out 20% as the test set, then take 25% of
# the remainder as validation (0.25 * 0.8 = 20% of all rows), leaving 60% for
# training.
split_seed = 42
X_temp, X_test, y_temp, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=split_seed,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=0.25,
    stratify=y_temp,
    random_state=split_seed,
)

# ----- SCALING -----
# Learn the scaling statistics on the training split only, then apply the
# same fitted transform to all three splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(part) for part in (X_train, X_val, X_test)
)

# ----- MODELS -----
# Negatives-to-positives ratio in the training split; XGBoost uses it to
# up-weight the positive class (the sklearn models use class_weight='balanced').
neg_pos_ratio = (y_train == 0).sum() / (y_train == 1).sum()

models = {
    'log_reg': LogisticRegression(class_weight='balanced', max_iter=1000),
    'random_forest': RandomForestClassifier(class_weight='balanced', random_state=42),
    # NOTE(review): `use_label_encoder` is deprecated in recent XGBoost
    # releases — confirm against the installed version before removing it.
    'xgboost': XGBClassifier(scale_pos_weight=neg_pos_ratio, use_label_encoder=False, eval_metric='logloss'),
}

# Hyper-parameter grids, keyed like `models`. Deliberately not named `params`:
# that name holds the configuration loaded from ../params.json at the top of
# the notebook, and re-binding it here silently replaced the configuration.
param_grids = {
    'log_reg': {'C': [0.01, 0.1, 1, 10]},
    'random_forest': {'n_estimators': [100, 200], 'max_depth': [5, 10]},
    'xgboost': {'n_estimators': [100, 200], 'max_depth': [3, 6]},
}

best_models = {}  # name -> refitted best estimator from the grid search
val_auc = {}      # name -> ROC AUC of that estimator on the validation split

# Same (unshuffled) stratified 5-fold splitter for every model.
cv = StratifiedKFold(n_splits=5)

for name, model in models.items():
    print(f"Training {name}...")
    grid = GridSearchCV(model, param_grids[name], cv=cv, scoring='roc_auc', n_jobs=-1)
    grid.fit(X_train_scaled, y_train)
    best_models[name] = grid.best_estimator_
    # Score on the held-out validation split, which the grid search never saw.
    val_auc[name] = roc_auc_score(y_val, grid.predict_proba(X_val_scaled)[:, 1])
    print(f"Best AUC on validation ({name}):", val_auc[name])
    joblib.dump(grid.best_estimator_, f"best_model_{name}.joblib")

# ----- HOSMER-LEMESHOW PLOT -----
def plot_hosmer_lemeshow(y_true, y_probs, n_bins=10):
    """Draw a calibration curve of observed versus predicted probability.

    Despite the name, no Hosmer-Lemeshow statistic is computed: the function
    only bins the predictions with ``calibration_curve`` and plots the result
    against the diagonal of perfect calibration.

    Args:
        y_true: True binary labels.
        y_probs: Predicted probabilities for the positive class.
        n_bins: Number of bins passed to ``calibration_curve``.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    observed, predicted = calibration_curve(y_true, y_probs, n_bins=n_bins)

    _, ax = plt.subplots()
    ax.plot(predicted, observed, marker='o')
    # Reference line: a perfectly calibrated model lies on y = x.
    ax.plot([0, 1], [0, 1], linestyle='--')
    ax.set_title('Hosmer-Lemeshow Calibration')
    ax.set_xlabel('Predicted probability')
    ax.set_ylabel('True probability')
    ax.grid()
    plt.show()

# Calibration plot for the tuned logistic regression, on the validation split.
tuned_log_reg = best_models['log_reg']
probs = tuned_log_reg.predict_proba(X_val_scaled)[:, 1]
plot_hosmer_lemeshow(y_val, probs)

# Persist the scaler alongside the models.
# NOTE(review): this scaler was fitted on features that the preprocessing
# cell had already encoded/standardised and that survived the VIF filter, so
# on its own it is not sufficient to transform raw inputs at inference time.
joblib.dump(scaler, "scaler.joblib")