In [None]:
import pandas as pd
import polars as pl
import numpy as np
import json as json

In [None]:
# Load the run configuration shared by the notebooks.
# The encoding is pinned to UTF-8 (the JSON interchange encoding) so that the
# file is decoded the same way regardless of the platform's locale default.
with open('../params.json', 'r', encoding='utf-8') as params_file:
    params = json.load(params_file)

# Dataset name, data version and root data folder used to build the paths below.
DATASET = params['dataset']
VERSION = params['version']
DATA_FOLD = params['data_folder']

print(f'Working on {DATASET} dataset {VERSION}')

In [None]:
# Locations of the cleaned temporal and static tables for this dataset/version.
temporal_path = f'{DATA_FOLD}/{VERSION}/2.clean_data/{DATASET}/temporal/treated_all_with_placeholder_values.parquet'
static_path = f'{DATA_FOLD}/{VERSION}/2.clean_data/{DATASET}/static/clean_static_encounters.parquet'

# Read both tables into pandas DataFrames.
temporal = pd.read_parquet(temporal_path)
static = pd.read_parquet(static_path)

In [None]:
# Display the column labels available in the static table.
static.columns

In [None]:
# Static columns kept for the analysis: the encounter key, two demographic
# fields, and the two columns used further down to compute 'imc'.
static_subset_cols = [
    'encounterId',
    'gender',
    'age',
    'poids_admission',
    'taille',
]


In [None]:
# Work on an explicit copy: the column assignments below must neither write
# through to `static` nor trigger pandas' SettingWithCopyWarning.
static_subset = static.loc[:, static_subset_cols].copy()

# 'imc' = poids_admission / (taille / 100) ** 2, rounded to 2 decimals.
# NOTE(review): presumably weight in kg and height in cm (hence the /100) — confirm.
static_subset['imc'] = (
    static_subset['poids_admission'] / (static_subset['taille'] / 100) ** 2
).round(2)

# Keep only rows whose 'imc' lies in [5, 100]; NaN and infinite values fail this test.
cond_1 = static_subset['imc'].between(5, 100)

# Encode gender as a boolean: True for 'Masculin', False for every other value
# (a missing gender therefore also becomes False).
static_subset['gender'] = static_subset['gender'] == 'Masculin'

# Apply the filter and drop the two columns that were only needed to compute 'imc'.
static_subset = static_subset.loc[cond_1, :].drop(columns=['taille', 'poids_admission'])
static_subset['encounterId'] = static_subset['encounterId'].astype('int32')
static_subset

In [None]:
# Outcome-related columns taken from the static table.
outcome_cols = [
    'los',
    'deces_datediff',
    'unitLabel',
]
outcomes_subset = static[outcome_cols]
# TODO: derive an 'alive_j28' outcome column (left unfinished in the original cell).

In [None]:
# Columns of the temporal table used downstream: the encounter key, the hour
# offset, and the measured variables.
temporal_cols = [
    'encounterId',
    'delta_hour',
    'fr',
    'pam',
    'heart_rate',
    'spo2',
    'temp',
    'nad_dose_poids',
    'is_ventilated',
    'iv_input',
    'fio2_corr',
    'urine_rate',
]
temporal_subset = temporal[temporal_cols]

In [None]:
# Collect the ids of every encounter that has at least one missing value in any
# of the selected temporal columns.
# The previous expression, pl.any_horizontal(temporal_cols).is_null(), OR-ed the
# raw column values and then null-tested that result, which is not the same as
# "any column is null". The mask is now built from per-column null tests.
rows_with_missing = temporal[temporal_cols].isna().any(axis=1)
encounter_null = temporal.loc[rows_with_missing, 'encounterId'].unique().tolist()

In [None]:
# Display the list of encounter ids stored in encounter_null.
encounter_null

In [None]:
# Remove every row belonging to an encounter listed in encounter_null.
is_excluded_encounter = temporal_subset['encounterId'].isin(encounter_null)
temporal_subset = temporal_subset[~is_excluded_encounter]

In [None]:
import pandas as pd
import numpy as np

# Variable columns = every column of temporal_subset except the key/time columns.
non_variable_cols = ['encounterId', 'delta_hour', 'last_timestamp']
var_cols = [name for name in temporal_subset.columns if name not in non_variable_cols]


# 1. Find the last valid timestamp of each encounter
def get_max_valid_timestamp(group):
    """Return the largest 'delta_hour' among rows holding at least one non-null variable.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single encounter; must contain 'delta_hour' and every
        column listed in the module-level ``var_cols``.

    Returns
    -------
    The maximum 'delta_hour' over the rows where at least one ``var_cols``
    value is not null (NaN when no such row exists).
    """
    informative_rows = group[var_cols].notna().any(axis=1)
    valid_hours = group.loc[informative_rows, 'delta_hour']
    return valid_hours.max()


# One row per encounter: the last delta_hour to keep for it.
per_encounter_rows = temporal_subset.groupby('encounterId')
max_timestamps = per_encounter_rows.apply(get_max_valid_timestamp).reset_index()
max_timestamps.columns = ['encounterId', 'max_valid_delta_hour']

# 2. Build one (encounterId, delta_hour) row for every hour from 0 up to each
#    encounter's last valid hour.
#    This is vectorised with NumPy instead of a nested Python loop over
#    iterrows(): it is much faster, keeps the original encounterId dtype
#    (iterrows() upcast it to float), and skips encounters whose last valid
#    hour is NaN instead of crashing on int(NaN).
valid_max = max_timestamps.dropna(subset=['max_valid_delta_hour'])

# Number of hourly rows per encounter: hours 0..max inclusive (never negative).
hours_per_encounter = (
    valid_max['max_valid_delta_hour'].astype(int).add(1).clip(lower=0).to_numpy()
)
# Position in the flattened output at which each encounter's run of hours starts.
block_starts = np.cumsum(hours_per_encounter) - hours_per_encounter

full_index = pd.DataFrame({
    'encounterId': np.repeat(valid_max['encounterId'].to_numpy(), hours_per_encounter),
    # Global position minus the start of the encounter's run = hour within the encounter.
    'delta_hour': np.arange(hours_per_encounter.sum()) - np.repeat(block_starts, hours_per_encounter),
})

# 3. Left-merge with the original data: hours that were not recorded become
#    rows whose variable columns are missing.
df_filled = pd.merge(full_index, temporal_subset, on=['encounterId', 'delta_hour'], how='left')

In [None]:
# Preview the first rows of df_filled.
df_filled.head()

In [None]:
# Per-encounter maximum of 'encounterId' and 'delta_hour'.
hours_by_encounter = df_filled[['encounterId', 'delta_hour']].groupby('encounterId')
max_delta = hours_by_encounter[['encounterId', 'delta_hour']].max()

# Ids of the encounters whose largest delta_hour is at least 120.
reaches_120h = max_delta['delta_hour'] >= 120
encounter_sup_120 = max_delta.loc[reaches_120h, 'encounterId'].to_list()

In [None]:
# Keep hours 0..120 (inclusive) of the encounters listed in encounter_sup_120.
within_first_120h = df_filled['delta_hour'].between(0, 120)
is_long_encounter = df_filled['encounterId'].isin(encounter_sup_120)
temporal_5_days = df_filled[within_first_120h & is_long_encounter]

In [None]:
# Columns imputed by linear interpolation, and the column imputed by
# forward-fill followed by backward-fill.
columns_to_interpolate = ['fr', 'pam', 'heart_rate', 'spo2', 'temp', 'nad_dose_poids', 'fio2_corr']
column_ffill_bfill = 'is_ventilated'


def expand_and_impute(group):
    """Expand one encounter to hours 0..120 and fill the resulting gaps.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single encounter. Must contain 'encounterId', 'delta_hour',
        every column of ``columns_to_interpolate`` and ``column_ffill_bfill``.

    Returns
    -------
    pandas.DataFrame
        One row per hour from 0 to 120 (in that order), with:
        - 'encounterId' propagated to the added rows,
        - ``columns_to_interpolate`` linearly interpolated in both directions,
        - ``column_ffill_bfill`` forward-filled, then backward-filled.
    """
    # Complete hourly grid, left-joined so that missing hours appear as empty rows.
    hourly_grid = pd.DataFrame({'delta_hour': np.arange(0, 121)})
    expanded = hourly_grid.merge(group, how='left', on='delta_hour')

    # Rows added by the join have no id yet: propagate it from the known rows.
    expanded['encounterId'] = expanded['encounterId'].ffill().bfill()

    # Numeric variables: linear interpolation, also filling leading/trailing gaps.
    interpolated = expanded[columns_to_interpolate].interpolate(method='linear', limit_direction='both')
    expanded[columns_to_interpolate] = interpolated

    # Ventilation flag: carry the last known value forward, then fill the start backward.
    expanded[column_ffill_bfill] = expanded[column_ffill_bfill].ffill().bfill()
    return expanded

# Run the expansion/imputation independently on each encounter.
per_encounter = temporal_5_days.groupby('encounterId', group_keys=False)
df_complete = per_encounter.apply(expand_and_impute)

# Keep (in this order) the key columns, the interpolated variables and the
# ventilation flag, then attach the static features of each encounter.
# The inner join drops encounters that are absent from static_subset.
cols = ['encounterId', 'delta_hour', *columns_to_interpolate, column_ffill_bfill]
df_complete = df_complete[cols].merge(static_subset, how='inner', on='encounterId')
df_complete

In [None]:
# Largest 'imc' value present in df_complete.
df_complete['imc'].max()

In [None]:
# Attach the outcome columns to every (encounterId, delta_hour) row of df_complete.
# The original cell indexed static_subset[''], which raises KeyError and stops a
# "Run All".
# NOTE(review): the intended right-hand table is inferred from outcome_cols — confirm.
#
# static_subset keeps the row index of `static`, so an index join pairs each
# retained encounterId with its outcome columns.
# NOTE(review): assumes the index of `static` is unique — confirm.
outcome_lookup = static_subset[['encounterId']].join(static[outcome_cols])

# The left join preserves the row order and row count of df_complete.
outcomes = df_complete[['encounterId', 'delta_hour']].merge(outcome_lookup, on='encounterId', how='left')

In [None]:
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Features fed to the clustering: continuous columns are standardised,
# binary flags are left as 0/1.
scaled_feature_cols = ['fr', 'pam', 'heart_rate', 'spo2', 'temp', 'nad_dose_poids', 'age', 'imc']
binary_feature_cols = ['gender', 'is_ventilated']
n_timesteps = 121  # hours 0..120 inclusive

# The reshape below is only meaningful if every encounter owns exactly
# n_timesteps rows stored contiguously; otherwise series of different
# encounters would be silently mixed. Fail loudly instead.
rows_per_encounter = df_complete.groupby('encounterId', sort=False).size()
if not rows_per_encounter.eq(n_timesteps).all():
    raise ValueError(
        f'every encounter must have exactly {n_timesteps} rows before reshaping'
    )
encounter_runs = df_complete['encounterId'].ne(df_complete['encounterId'].shift()).sum()
if encounter_runs != len(rows_per_encounter):
    raise ValueError('rows of each encounter must be contiguous before reshaping')

numpy_2d = df_complete[scaled_feature_cols + binary_feature_cols].astype('float').to_numpy()
n_scaled = len(scaled_feature_cols)

# Standardise the continuous columns (fit and transform on the same data).
scaler = StandardScaler()
scaler.fit(numpy_2d[:, :n_scaled])

numpy_2d[:, :n_scaled] = scaler.transform(numpy_2d[:, :n_scaled])
# Truncate the binary columns to whole numbers (stored back as floats).
numpy_2d[:, n_scaled:] = numpy_2d[:, n_scaled:].astype(int)

# Shape: (number of encounters, n_timesteps, number of features).
numpy_3d = numpy_2d.reshape(-1, n_timesteps, numpy_2d.shape[1])

In [None]:
from tslearn.utils import to_time_series_dataset

# Convert the 3-D array into tslearn's time-series dataset representation.
format_dataset = to_time_series_dataset(numpy_3d)

In [None]:
# Work on the series at positions 800..1199 only (all time steps, all features).
subset = format_dataset[800:1200]

In [None]:
from tslearn.clustering import TimeSeriesKMeans, silhouette_score

# Silhouette score of DTW k-means for k = 2..5 on the subset.
# The k-means initialisation is random; random_state is fixed (same seed as
# the k=3 fit later in the notebook) so that the scores are reproducible
# across runs.
silhouettes = []
for i in range(2, 6):  # start from 2 clusters
    km = TimeSeriesKMeans(n_clusters=i, metric="dtw", random_state=0)
    labels = km.fit_predict(subset)
    silhouette = silhouette_score(subset, labels, metric="dtw")
    print(silhouette)
    silhouettes.append(silhouette)

In [None]:
# Display the collected silhouette scores.
silhouettes

In [None]:
from sklearn.decomposition import PCA
from tslearn.clustering import TimeSeriesKMeans

import matplotlib.pyplot as plt

# Cluster the subset into three groups with DTW k-means (fixed seed).
n_clusters = 3
km = TimeSeriesKMeans(n_clusters=n_clusters, metric="dtw", random_state=0)
labels = km.fit_predict(subset)

# Collapse each series to the mean of its features over axis 1, so that every
# series becomes a single feature vector.
X_flat = subset.mean(axis=1)

# Project those vectors onto their first three principal components.
pca = PCA(n_components=3)
X_pca = pca.fit_transform(X_flat)
pc1, pc2, pc3 = X_pca[:, 0], X_pca[:, 1], X_pca[:, 2]

# 3-D scatter of the projected series, coloured by cluster label.
fig = plt.figure(figsize=(8,6))
ax = fig.add_subplot(111, projection='3d')
scatter = ax.scatter(pc1, pc2, pc3, c=labels, cmap='viridis', alpha=0.7)
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.set_zlabel('PC3')
ax.set_title('Visualisation 3D des clusters (k=3)')
plt.show()