# Imports

In [None]:
import polars as pl
import pandas as pd
import numpy as np
import time
import json

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from pypots.imputation import SAITS, BRITS, USGAN, GPVAE
from pygrinder import mcar
from pypots.utils.metrics import calc_mae, calc_rmse

In [None]:
import warnings

warnings.filterwarnings("ignore")

# Configuration

In [None]:
with open('../params.json', 'r') as file :
    params = json.load(file)

DATASET, VERSION = params['dataset'], params['version']
DATA_FOLD = params['data_folder']

In [None]:
print(DATASET)

In [None]:
fit_model = True

In [None]:
np.random.seed(42)

In [None]:
DATA_FOLDER = f'{DATA_FOLD}/{VERSION}/3.analysis/imputation_48/{DATASET}/'
DATA_FILE = 'first_48h_with_static.parquet'
MODEL_FOLDER = f'{DATA_FOLD}/{VERSION}/4.models/imputation/{DATASET}/'
OUTPUT_TABLE = f'{DATA_FOLD}/{VERSION}/3.analysis/imputation_48/{DATASET}/tables/'
OUTPUT_DATASET = f'{DATA_FOLD}/{VERSION}/3.analysis/imputation_48/{DATASET}/test_datasets/'

# Import des données

In [None]:
first_48h = pl.read_parquet(DATA_FOLDER + DATA_FILE).to_pandas().drop(columns=['total_missing', 'max_valid_interval'])

In [None]:
data = first_48h

In [None]:
data.head()

In [None]:
data.columns

# Création des datasets de train/test/validation

In [None]:
# Split encounter ids (not rows) into train/test/validation, so that every row of
# one encounter ends up in exactly one subset.
# 70% of the ids go to training; the remaining 30% is split evenly between test and validation.
patient_ids = data['encounterId'].unique()
train_ids, test_val_ids = train_test_split(patient_ids, test_size=0.3, random_state=42)
test_ids, val_ids = train_test_split(test_val_ids, test_size=0.5, random_state=42)

In [None]:
patient_ids.shape[0]

In [None]:
train_data = data[data['encounterId'].isin(train_ids)]
test_data = data[data['encounterId'].isin(test_ids)]
val_data = data[data['encounterId'].isin(val_ids)]

In [None]:
# Check that no test patient has a variable that is missing at every row.
# For each encounter, the lambda yields one boolean per remaining column (True when the
# column is entirely null); `.any(axis=1)` then flags encounters with at least one such column.

patients_with_missing_vars = (
    test_data.groupby("encounterId")
    .apply(lambda group: group.drop(columns=["encounterId", "intervalle"]).isnull().all(axis=0))
    .any(axis=1)
)

# Keep only the ids of the flagged patients
patients_with_missing_vars = patients_with_missing_vars[patients_with_missing_vars].index.tolist()

len(patients_with_missing_vars)

In [None]:
print("Train shape:", train_data.shape)
print("Test shape:", test_data.shape)
print("Validation shape:", val_data.shape)

# Création des scénarios de données manquantes

### Données manquantes aléatoires

In [None]:
def random_mask(df, rate=0.3, idx_features = 4):
    """Hide observed values of the leading features at random, in place.

    Only the first ``idx_features`` features (last axis) are masked; the other
    features are never touched. ``mcar`` is applied repeatedly until at most
    ``(1 - rate)`` of the initially observed values remain.

    Args:
        df (numpy.ndarray): 3D array indexed as ``df[:, :, feature]`` where NaN
            marks a missing value. Modified in place.
        rate (float): Fraction of the initially observed values to remove.
        idx_features (int): Number of leading features eligible for masking.

    Returns:
        numpy.ndarray: The same object as ``df``, with additional NaNs.
    """
    def _observed_count():
        return np.count_nonzero(~np.isnan(df[:, :, :idx_features]))

    remaining = _observed_count()
    target_size = remaining * (1 - rate)

    drop_prob = rate
    while remaining > target_size:
        df[:, :, :idx_features] = mcar(df[:, :, :idx_features], p=drop_prob)
        remaining = _observed_count()
        if remaining > target_size:
            # Aim the next pass at the remaining deficit only (assuming mcar drops
            # entries independently with probability p). Halving the rate instead
            # can overshoot the target by a wide margin on the second pass, and on
            # small arrays it can shrink towards zero before the target is reached,
            # so the loop would never end.
            drop_prob = 1 - target_size / remaining
    return df

### Toutes les données manquantes sur n timestamps consécutifs pour tout ou partie des variables

In [None]:
list(range(4))

In [None]:

def remove_timestamp(array : np.ndarray, rate : float=0.3, n_timestamp : int=1, n_features : list = list(range(4))) -> np.ndarray:
    n_remove = int((array.shape[0] * array.shape[1] * rate)/n_timestamp)


    for i in range(n_remove) :
        intervalle_index = np.random.randint(0, array.shape[1])
        encounter_index = np.random.randint(0, array.shape[0])
        max_intervalle = intervalle_index+n_timestamp
        if max_intervalle > array.shape[1]-1 :
            max_intervalle = array.shape[1]-1
            
        array[encounter_index,intervalle_index:max_intervalle, n_features ] = np.nan

    return array

# Préparation du jeu de données

In [None]:
def prepared_dataset(df, mask=None, rate : float=0.3, n_timestamp : int=1, n_features : list = None) :
    """Standardise a long-format frame, reshape it to 3D and optionally mask it.

    Steps:
    - drop the 'encounterId' and 'intervalle' columns,
    - standardise the remaining columns with a ``StandardScaler`` fitted on ``df`` itself,
    - reshape to ``(n_encounters, 48, n_columns)``,
    - apply the requested masking function, if any.

    The reshape relies on row order: it assumes the 48 rows of each encounter are
    contiguous and chronologically ordered (not checked here, verify upstream).

    Args:
        df (pd.DataFrame): Input rows with 'encounterId' and 'intervalle' columns.
        mask (function): ``random_mask``, ``remove_timestamp`` or ``None`` (no masking).
        rate (float): Masking rate forwarded to the masking function.
        n_timestamp (int): Window length, used by ``remove_timestamp`` only.
        n_features (list): Feature indices to mask, used by ``remove_timestamp`` only.
            Defaults to the first four features.

    Returns:
        numpy.ndarray: Standardised 3D array, masked when ``mask`` is given.

    Raises:
        ValueError: If ``df`` does not hold exactly 48 rows per encounter, or if
            ``mask`` is not one of the supported functions.
    """
    if n_features is None:
        n_features = list(range(4))

    n_samples = df['encounterId'].unique().shape[0]
    if len(df) != n_samples * 48:
        # Without this guard a wrong row count can still reshape "successfully"
        # into a wrong feature dimension.
        raise ValueError(
            f"Expected 48 rows per encounter ({n_samples * 48} rows), got {len(df)}."
        )

    dropped_df = df.drop(['encounterId','intervalle'], axis=1)
    standardized = StandardScaler().fit_transform(dropped_df.to_numpy())
    reshaped = standardized.reshape(n_samples, 48, -1)

    if mask is None:
        return reshaped
    if mask is random_mask:
        return random_mask(reshaped, rate)
    if mask is remove_timestamp:
        return remove_timestamp(reshaped, rate, n_timestamp, n_features)
    raise ValueError("Aucune correspondance concernant la fonction de masquage.")

In [None]:
# Build the arrays used for training and evaluation.
# Every call re-standardises the frame it receives, and the masking helpers draw from
# NumPy's global random state, so the results depend on the order of these calls.
Train = prepared_dataset(train_data)
Train_mcar = prepared_dataset(train_data, mask=random_mask, rate=0.3)
Val = prepared_dataset(val_data)
Val_mcar = prepared_dataset(val_data, mask=random_mask, rate=0.3)
# Test set: unmasked reference, then one array per missingness scenario
Test_ori = prepared_dataset(test_data, mask=None)
Test_mcar = prepared_dataset(test_data, mask=random_mask, rate=0.3)
Test_single_row = prepared_dataset(test_data, mask=remove_timestamp, rate=0.3) # windows of a single time step
Test_two_rows = prepared_dataset(test_data, mask=remove_timestamp, rate=0.3, n_timestamp=2)
Test_three_rows = prepared_dataset(test_data, mask=remove_timestamp, rate=0.3, n_timestamp=3)
# Windows of four time steps removed on a single feature index at a time
Test_hr = prepared_dataset(test_data, mask=remove_timestamp, rate=0.3, n_timestamp=4, n_features=[0])
Test_sp02 = prepared_dataset(test_data, mask=remove_timestamp, rate=0.3, n_timestamp=4, n_features=[1])
Test_fr = prepared_dataset(test_data, mask=remove_timestamp, rate=0.3, n_timestamp=4, n_features=[2])
Test_pa = prepared_dataset(test_data, mask=remove_timestamp, rate=0.3, n_timestamp=4, n_features=[3])


# Définition des méthodes d'imputation

## Pypots models

In [None]:
# Dictionary handed to the `fit` method of every PyPOTS model below.
# NOTE(review): despite their name, the "missing_mask" entries hold the randomly masked
# data arrays, not boolean masks. PyPOTS documents `fit(train_set, val_set=...)` with its
# own key names, so the "missing_mask" and nested "val_data" keys are presumably ignored
# and no validation is performed — verify against the installed PyPOTS version.
datasets = {
    "X": Train,
    "missing_mask": Train_mcar,
    "val_data" : {
        "X": Val,
        "missing_mask": Val_mcar
    }
}

In [None]:
n_steps = 48
n_features = 7
device = "cuda"
n_epochs = 50

In [None]:
Train.shape == Train_mcar.shape

### Entrainement SAITS

https://github.com/WenjieDu/SAITS

In [None]:
Train.shape

# Instantiate the SAITS imputer with the shared sequence length, feature count,
# epoch budget and device defined above.
saits = SAITS(
    n_steps=n_steps, n_features=n_features,
    n_layers=3, d_model=512, d_ffn=128, n_heads=8, d_k=64, d_v=64,
    dropout=0.1,
    epochs=n_epochs,
    device=device,
    saving_path= MODEL_FOLDER + 'saits/model.pth',  # where training artefacts are written — presumably treated as a directory by PyPOTS; TODO confirm
    diagonal_attention_mask = True
    )

In [None]:
# Same configuration as the preceding cell: running this cell replaces `saits`
# with a fresh, untrained instance.
saits = SAITS(
    n_steps=n_steps, n_features=n_features,
    n_layers=3, d_model=512, d_ffn=128, n_heads=8, d_k=64, d_v=64,
    dropout=0.1,
    epochs=n_epochs,
    device=device,
    saving_path= MODEL_FOLDER + 'saits/model.pth',  # where training artefacts are written — presumably treated as a directory by PyPOTS; TODO confirm
    diagonal_attention_mask = True
    )

In [None]:
params = [
    {
    'd_model' : [64,128,256,512], 
     'd_ffn' : [128,256,512,1024], 
     'n_heads' : [2,4,8],
     'd_k' : [32, 64, 128, 256],
     'd_v' : [32, 64, 128, 256],
    'dropout' : [0, 0.1, 0.2, 0.3, 0.4, 0.5]}
]

import torch
import torch.nn as nn

class SAITS(nn.Module):
    """Minimal Transformer-encoder network mapping a sequence to a same-shaped output.

    NOTE(review): this definition rebinds the name ``SAITS`` imported from
    ``pypots.imputation`` at the top of the file; any ``SAITS(...)`` call executed
    after this cell builds this class instead of the PyPOTS model.

    ``n_steps``, ``d_k``, ``d_v`` and ``diagonal_attention_mask`` are accepted for
    signature compatibility but are not used by this implementation.

    Args:
        n_steps: Unused.
        n_features (int): Size of the last axis of the input and of the output.
        device: Device the module is moved to.
        n_layers (int): Number of encoder layers.
        d_model (int): Width of the encoder.
        d_ffn (int): Width of the feed-forward sub-layer.
        n_heads (int): Number of attention heads.
        d_k: Unused.
        d_v: Unused.
        dropout (float): Dropout rate of the encoder layers.
        saving_path (str): Destination used by :meth:`save`.
        diagonal_attention_mask: Unused.
    """

    def __init__(self, n_steps, n_features, device, n_layers, d_model, d_ffn, n_heads, d_k, d_v, dropout, saving_path, diagonal_attention_mask):
        super().__init__()
        self.device = device
        self.saving_path = saving_path
        # Project the raw features to the encoder width; without this layer the
        # encoder (which expects d_model channels) rejects n_features-wide input.
        self.embedding = nn.Linear(n_features, d_model)
        self.encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=d_model,
                nhead=n_heads,
                dim_feedforward=d_ffn,
                dropout=dropout,
                batch_first=True,  # inputs are (batch, steps, features)
            ),
            num_layers=n_layers,
        )
        self.fc = nn.Linear(d_model, n_features)
        self.to(device)

    def forward(self, x):
        """Return a tensor shaped like ``x`` (last axis of size ``n_features``)."""
        x = self.embedding(x)
        x = self.encoder(x)
        return self.fc(x)

    def save(self):
        """Write the module's state dict to ``self.saving_path``."""
        torch.save(self.state_dict(), self.saving_path)


from sklearn.model_selection import ParameterGrid
import numpy as np
import torch
from sklearn.metrics import mean_squared_error

# Evaluation function
def evaluate_saits(params, train_loader, val_loader, device):
    """Train a SAITS model for two epochs with ``params`` and return its validation MSE.

    The loss is computed only where the original data is observed, and NaNs in the
    masked input are replaced by 0 before the forward pass; otherwise a single NaN
    would turn every loss (and therefore every score) into NaN.

    Args:
        params (dict): Must provide 'd_model', 'd_ffn', 'n_heads', 'd_v', 'dropout'
            and 'learning_rate'.
        train_loader: Re-iterable collection of ``(x, x_masked)`` pairs (original
            data, masked data). Items may be NumPy arrays or tensors, with or
            without a leading batch axis.
        val_loader: Same structure as ``train_loader``, used for scoring.
        device: Torch device used for the model and the data.

    Returns:
        float: Mean validation MSE over the items of ``val_loader``, or NaN when
        no item contains an observed value.

    Raises:
        ValueError: If ``train_loader`` is empty (the data shape cannot be inferred).
    """
    criterion = torch.nn.MSELoss()

    def _to_batch(sample):
        # Accept NumPy arrays as well as tensors, and add a batch axis to 2D samples.
        tensor = torch.as_tensor(sample, dtype=torch.float32).to(device)
        return tensor.unsqueeze(0) if tensor.dim() == 2 else tensor

    def _observed_mse(net, original, masked):
        original, masked = _to_batch(original), _to_batch(masked)
        observed = ~torch.isnan(original)
        if not observed.any():
            return None
        output = net(torch.nan_to_num(masked, nan=0.0))
        return criterion(output[observed], original[observed])

    try:
        first_x, _ = next(iter(train_loader))
    except StopIteration:
        raise ValueError("train_loader is empty; cannot infer the data shape.") from None
    # Take the dimensions from the data instead of hard-coding them.
    n_steps, n_features = torch.as_tensor(first_x).shape[-2:]

    model = SAITS(
        n_steps = n_steps,
        n_features = n_features,
        device = device,
        n_layers=3,
        d_model=params['d_model'],
        d_ffn=params['d_ffn'],
        n_heads=params['n_heads'],
        d_k=int(params['d_model']/params['n_heads']),
        d_v=params['d_v'],
        dropout=params['dropout'],
        saving_path= MODEL_FOLDER + 'saits/model.pth',
        diagonal_attention_mask = True
    )

    optimizer = torch.optim.Adam(model.parameters(), lr=params["learning_rate"])

    # Training
    for epoch in range(2):
        model.train()
        for x, x_masked in train_loader:  # x = original data, x_masked = masked data
            optimizer.zero_grad()
            loss = _observed_mse(model, x, x_masked)
            if loss is None:
                continue
            loss.backward()
            optimizer.step()

    # Scoring on the validation data
    model.eval()
    val_losses = []
    with torch.no_grad():
        for x_val, x_val_masked in val_loader:
            loss = _observed_mse(model, x_val, x_val_masked)
            if loss is not None:
                val_losses.append(loss.item())

    if not val_losses:
        return float("nan")
    return np.mean(val_losses)  # mean MSE

# Hyper-parameter grid explored exhaustively below.
# NOTE(review): evaluate_saits derives d_k from d_model and n_heads, so the 'd_k' axis
# only repeats otherwise identical configurations.
param_grid =  {
    'd_model' : [64,128,256,512], 
     'd_ffn' : [128,256,512,1024], 
     'n_heads' : [2,4,8],
     'd_k' : [32, 64, 128, 256],
     'd_v' : [32, 64, 128, 256],
    'dropout' : [0, 0.1, 0.2, 0.3, 0.4, 0.5],
    "learning_rate" : [0.001]
    }

# Rebinds the module-level `device` (previously the string "cuda") to a torch.device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# Pair each original sample with its masked counterpart (iteration over the first axis)
train_loader = list(zip(Train, Train_mcar))
val_loader = list(zip(Val, Val_mcar))

# Search for the best hyper-parameters (lowest validation MSE)
best_score = float('inf')
best_params = None

# NOTE(review): the loop variable reuses the name `params`, which held the JSON
# configuration loaded at the top of the file.
for params in ParameterGrid(param_grid):
    score = evaluate_saits(params, train_loader, val_loader, device)
    print(f"Params: {params} - Validation MSE: {score}")
    
    if score < best_score:
        best_score = score
        best_params = params

print("Meilleurs hyperparamètres:", best_params)


In [None]:
# Reload a previously saved SAITS model when available, then (re)train if requested.
model_path_saits = MODEL_FOLDER + "saits/saits_two_days_with_val.pypots"
try :
    saits.load(model_path_saits)
except AssertionError :
    # presumably raised by PyPOTS when the file does not exist — TODO confirm
    print('model not found')
except RuntimeError as err :
    # Report the failure instead of hiding it: with fit_model disabled, an
    # unloaded model would otherwise be used untrained without any notice.
    print(f'model could not be loaded: {err}')
if fit_model :
    saits.fit(datasets)

In [None]:
saits.save(model_path_saits, overwrite=True)

### BRITS

In [None]:
brits = BRITS(
    n_steps=n_steps, 
    n_features=n_features, 
    rnn_hidden_size=128, 
    epochs=n_epochs, 
    device=device,
    saving_path= MODEL_FOLDER + 'brits/model.pth'
    )

In [None]:
# Reload a previously saved BRITS model when available, then (re)train if requested.
model_path_brits = MODEL_FOLDER + "brits/brits_two_days_with_val.pypots"
try :
    brits.load(model_path_brits)
except AssertionError :
    # presumably raised by PyPOTS when the file does not exist — TODO confirm
    print('model not found')
except RuntimeError as err :
    # Report the failure instead of hiding it: with fit_model disabled, an
    # unloaded model would otherwise be used untrained without any notice.
    print(f'model could not be loaded: {err}')
if fit_model :
    brits.fit(datasets)

In [None]:
brits.save(model_path_brits, overwrite=True)

### USGAN

In [None]:
usgan = USGAN(
    n_steps=n_steps, 
    n_features=n_features, 
    epochs=n_epochs, 
    device=device, 
    rnn_hidden_size=128,
    saving_path= MODEL_FOLDER + 'usgan/model.pth'
    )

In [None]:
# Reload a previously saved USGAN model when available, then (re)train if requested.
model_path_usgan = MODEL_FOLDER + "usgan/usgan_two_days_with_val.pypots"
try :
    usgan.load(model_path_usgan)
except AssertionError :
    # presumably raised by PyPOTS when the file does not exist — TODO confirm
    print('model not found')
except RuntimeError as err :
    # Report the failure instead of hiding it: with fit_model disabled, an
    # unloaded model would otherwise be used untrained without any notice.
    print(f'model could not be loaded: {err}')
if fit_model :
    usgan.fit(datasets)

In [None]:
usgan.save(model_path_usgan, overwrite=True)

### GPVAE

In [None]:
gpvae = GPVAE(
    n_steps=n_steps, 
    n_features=n_features, 
    epochs=n_epochs, 
    device=device, 
    latent_size=64,
    saving_path= MODEL_FOLDER + 'gpvae/model.pth'
    )

In [None]:
# Reload a previously saved GPVAE model when available, then (re)train if requested.
model_path_gpvae = MODEL_FOLDER + "gpvae/gpvae_two_days_with_val.pypots"
try :
    gpvae.load(model_path_gpvae)
except AssertionError :
    # presumably raised by PyPOTS when the file does not exist — TODO confirm
    print('model not found')
except RuntimeError as err :
    # Report the failure instead of hiding it: with fit_model disabled, an
    # unloaded model would otherwise be used untrained without any notice.
    print(f'model could not be loaded: {err}')
if fit_model :
    gpvae.fit(datasets)

In [None]:
gpvae.save(model_path_gpvae, overwrite=True)

## Forward/Backward Fill

In [None]:
def fill_missing(df):
    """Forward-fill then backward-fill every (patient, feature) series along time.

    Args:
        df (numpy.ndarray): 3D array (patients, timestamps, features) containing NaN.

    Returns:
        numpy.ndarray: New array of the same shape with the gaps filled; the input
        is left untouched. A series without any observed value stays NaN.
    """
    filled_data = np.copy(df)
    n_patients, n_steps, n_feats = filled_data.shape

    # One column per (patient, feature) series, so pandas fills all of them in a
    # single pass down the time axis instead of one DataFrame per series.
    wide = pd.DataFrame(
        filled_data.transpose(1, 0, 2).reshape(n_steps, n_patients * n_feats)
    )
    wide = wide.ffill().bfill()

    filled_data[...] = (
        wide.to_numpy().reshape(n_steps, n_patients, n_feats).transpose(1, 0, 2)
    )
    return filled_data

## Interpolation linéaire

In [None]:
def lin_interpol(df):
    """Fill missing values by linear interpolation along time, series by series.

    For every (patient, feature) series:
    1. gaps are interpolated linearly with ``limit_direction='both'``;
    2. any NaN still present is forward-filled;
    3. any NaN still present is backward-filled.

    Args:
        df (numpy.ndarray): 3D array (patients, timestamps, features) containing NaN.

    Returns:
        numpy.ndarray: New array of the same shape with the gaps filled; the input
        is left untouched. A series without any observed value stays NaN.
    """
    filled_data = np.copy(df)
    n_patients, n_steps, n_feats = filled_data.shape

    # One column per (patient, feature) series so that all series are processed
    # in a single pandas call down the time axis.
    wide = pd.DataFrame(
        filled_data.transpose(1, 0, 2).reshape(n_steps, n_patients * n_feats)
    )
    wide = wide.interpolate(method='linear', limit_direction='both')
    # Safety net kept from the original procedure for anything left unfilled.
    wide = wide.ffill().bfill()

    filled_data[...] = (
        wide.to_numpy().reshape(n_steps, n_patients, n_feats).transpose(1, 0, 2)
    )
    return filled_data


## Imputation par la moyenne/médiane

In [None]:
def impute_with_statistic(df, method="mean"):
    """Fill missing values with the mean or median of each patient's own series.

    Every NaN of a (patient, feature) series is replaced by the statistic of the
    observed values of that same series. When a series has no observed value,
    the statistic of the feature over all patients is used instead.

    Args:
        df (numpy.ndarray): 3D array (patients, timestamps, features) containing NaN.
        method (str): Statistic to use, "mean" or "median".

    Returns:
        numpy.ndarray: New array of the same shape; the input is left untouched.
        Values stay NaN only where a feature has no observed value at all.

    Raises:
        ValueError: If ``method`` is neither "mean" nor "median".
    """
    if method == "mean":
        reducer = np.nanmean
    elif method == "median":
        reducer = np.nanmedian
    else:
        raise ValueError("Méthode non reconnue. Utilisez 'mean' ou 'median'.")

    filled_data = np.copy(df)
    missing = np.isnan(filled_data)
    if not missing.any():
        return filled_data

    # Statistic per (patient, feature) series, computed on the original values.
    per_series = reducer(filled_data, axis=1, keepdims=True)
    # Fallback per feature, for series that are entirely missing.
    per_feature = reducer(filled_data, axis=(0, 1), keepdims=True)
    fill_values = np.where(np.isnan(per_series), per_feature, per_series)

    # Write each patient's own value (the previous code wrote the first patient's
    # statistic into every patient's gaps).
    filled_data[missing] = np.broadcast_to(fill_values, filled_data.shape)[missing]
    return filled_data


https://stackoverflow.com/questions/58613108/imputing-missing-values-using-sklearn-iterativeimputer-class-for-mice  
https://github.com/wendyminai/APPROACHES-TO-MISSING-DATA-IN-TIME-SERIES-

## Imputations 2D

In [None]:
# Flatten training data
n_features = Train.shape[2]
n_timestamps = Train.shape[1]
train_samples = Train.shape[0]

train_flatten = Train.reshape(-1, n_features)

### Imputation par MICE

https://stackoverflow.com/questions/58613108/imputing-missing-values-using-sklearn-iterativeimputer-class-for-mice  
https://github.com/wendyminai/APPROACHES-TO-MISSING-DATA-IN-TIME-SERIES-

In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [None]:
# fit IterativeImputer

imputer_mice = IterativeImputer(max_iter=30, random_state=42)
imputer_mice.fit(train_flatten)


### Imputation par KNNimputer

In [None]:
from sklearn.impute import KNNImputer

In [None]:
imputer_knn = KNNImputer(n_neighbors=2)
imputer_knn.fit(train_flatten)

### Imputation par MissForest

In [None]:
from missforest import MissForest

In [None]:
imputer_mf = MissForest()
imputer_mf.fit(train_flatten)

### Impute 3darray with 2d model

In [None]:
def impute_with_2d_model(df, model):
    """Run a fitted 2D imputer on a 3D array and return the result in 3D.

    The array is flattened to one row per (sample, time step), wrapped in a
    DataFrame, passed to ``model.transform`` and reshaped back.

    Args:
        df (numpy.ndarray): 3D array whose last axis holds the features.
        model: Fitted object exposing ``transform``; it may return either a
            DataFrame or an array.

    Returns:
        numpy.ndarray: Imputed values with the same shape as ``df``.
    """
    sample_count, step_count, feature_count = df.shape[0], df.shape[1], df.shape[2]

    # One row per (sample, time step)
    stacked_rows = pd.DataFrame(df.reshape(-1, feature_count))
    imputed_rows = model.transform(stacked_rows)

    # Some imputers return a DataFrame, others a plain array
    imputed_array = (
        imputed_rows.to_numpy() if isinstance(imputed_rows, pd.DataFrame) else imputed_rows
    )

    return imputed_array.reshape(sample_count, step_count, feature_count)

# Script

### Conditions valeurs manquantes

In [None]:
conditions = [('Random' , Test_mcar), 
              ('Single_row' , Test_single_row), ('Two_rows', Test_two_rows), ('Three_rows', Test_three_rows), ('fr_only',Test_fr),
              ('hr_only',Test_hr), ('pa_only',Test_pa),('spO2_only',Test_sp02)]

In [None]:
condition_test = [('Random' , Test_mcar)]

## Boucles méthodes d'imputation et scénarios

In [None]:
results_mae = {method: {} for method in ['mean', 'median', 'imputation_average', 'fill', 'mice', 'mf', 'saits', 'brits', 'usgan', 'gpvae']}
results_rmse = {method: {} for method in ['mean', 'median', 'imputation_average', 'fill', 'mice', 'mf', 'saits', 'brits', 'usgan', 'gpvae']}

### Global

In [None]:
import time

# Benchmark in which SAITS is the only neural model.
# For every missingness scenario: time each imputation method, then score it
# (MAE / RMSE) on the values hidden by the scenario.
time_start = time.time()

for c in conditions:
    time_cond = time.time()
    print(f'----------------------------{c[0]}----------------------------')

    # SAITS
    imputation_saits = saits.impute({'X': c[1]})
    saits_time = time.time() - time_cond
    print(f'Saits time : {saits_time:.3f} sec')

    # MICE
    time_mice_start = time.time()
    imputation_mice = impute_with_2d_model(c[1], imputer_mice)
    mice_time = time.time() - time_mice_start
    print(f'Mice time : {mice_time:.3f} sec')

    # MF
    time_mf_start = time.time()
    imputation_mf = impute_with_2d_model(c[1], imputer_mf)
    mf_time = time.time() - time_mf_start
    print(f'MF time : {mf_time:.3f} sec')

    # Fill missing
    time_fill_start = time.time()
    imputation_fill = fill_missing(c[1])
    fill_time = time.time() - time_fill_start
    print(f'Fill time : {fill_time:.3f} sec')

    # Linear interpolation
    time_average_start = time.time()
    imputation_average_or_fill = lin_interpol(c[1])
    average_time = time.time() - time_average_start
    print(f'Lin interpol time : {average_time:.3f} sec')

    # Mean imputation
    time_mean_start = time.time()
    imputation_mean = impute_with_statistic(c[1])
    mean_time = time.time() - time_mean_start
    print(f'Mean time : {mean_time:.3f} sec')

    # Median imputation
    time_median_start = time.time()
    imputation_median = impute_with_statistic(c[1], method='median')
    median_time = time.time() - time_median_start
    print(f'Median time : {median_time:.3f} sec')

    # Total elapsed time since the start of the benchmark
    total_time = time.time() - time_start
    print(f'\nTemps total : {total_time:.3f} sec ({time.strftime("%M:%S", time.gmtime(total_time))})')

    # BRITS, USGAN and GPVAE are not evaluated in this cell.
    imputed_datasets = [
        ('fill', imputation_fill),
        ('mean', imputation_mean),
        ('median', imputation_median),
        ('imputation_average', imputation_average_or_fill),
        ('mice', imputation_mice),
        ('mf', imputation_mf),
        ('saits', imputation_saits),
    ]
    print('imputation done')
    full_imput_time = time.time() - time_cond
    print(f'Imputation time : {full_imput_time}')

    # Cells that differ in missingness between the scenario and the reference,
    # i.e. the values hidden by the scenario. Both arrays are the same for every
    # method, so they are computed once per scenario.
    indicating_mask_test = np.isnan(c[1]) ^ np.isnan(Test_ori)
    reference_values = np.nan_to_num(Test_ori)

    for i in imputed_datasets :
        print(f'---------{i[0]}---------')
        mae_test = calc_mae(i[1], reference_values, indicating_mask_test)
        rmse_test = calc_rmse(i[1], reference_values, indicating_mask_test)
        print(f'{c[0]} imputed with {i[0]} : MAE = {mae_test} / RMSE = {rmse_test}')
        results_mae[i[0]][c[0]] = mae_test
        results_rmse[i[0]][c[0]] = rmse_test
results_df_mae = pd.DataFrame(results_mae)
results_df_rmse = pd.DataFrame(results_rmse)

In [None]:
# Imputation time of every model, followed by MAE / RMSE for every scenario.

time_start = time.time()


for c in conditions :
    time_cond = time.time()
    print(f'----------------------------{c[0]}----------------------------')

    # Every duration is measured from its own start mark. Subtracting the
    # previous step's duration from the current clock value, as was done before,
    # yields a number close to the absolute clock value rather than a duration.
    step_start = time.time()
    imputation_saits = saits.impute({'X':c[1]})
    saits_time = time.time() - step_start
    print(f'Saits time : {saits_time}')

    step_start = time.time()
    imputation_brits = brits.impute({'X':c[1]})
    brits_time = time.time() - step_start
    print(f'Brits time : {brits_time}')

    step_start = time.time()
    imputation_usgan = usgan.impute({'X':c[1]})
    usgan_time = time.time() - step_start
    print(f'Usgan time : {usgan_time}')

    step_start = time.time()
    imputation_gpvae = gpvae.impute({'X':c[1]}).mean(axis=1)
    gpvae_time = time.time() - step_start
    print(f'Gpvae time : {gpvae_time}')

    step_start = time.time()
    imputation_mice = impute_with_2d_model(c[1], imputer_mice)
    mice_time = time.time() - step_start
    print(f'Mice time : {mice_time}')

    step_start = time.time()
    imputation_mf = impute_with_2d_model(c[1], imputer_mf)
    mf_time = time.time() - step_start
    print(f'Mf time : {mf_time}')

    step_start = time.time()
    imputation_fill = fill_missing(c[1])
    fill_time = time.time() - step_start
    print(f'Fill time : {fill_time}')

    step_start = time.time()
    imputation_average_or_fill = lin_interpol(c[1])
    average_time = time.time() - step_start
    print(f'Average time : {average_time}')

    step_start = time.time()
    imputation_mean = impute_with_statistic(c[1])
    mean_time = time.time() - step_start
    print(f'Mean time : {mean_time}')

    step_start = time.time()
    imputation_median = impute_with_statistic(c[1], method='median')
    median_time = time.time() - step_start
    print(f'Median time : {median_time}')

    imputed_datasets = [
        ('fill', imputation_fill),
        ('mean', imputation_mean),
        ('median', imputation_median),
        ('imputation_average', imputation_average_or_fill),
        ('mice', imputation_mice),
        ('mf', imputation_mf),
        ('saits', imputation_saits),
        ('brits', imputation_brits),
        ('usgan', imputation_usgan),
        ('gpvae', imputation_gpvae)
    ]
    print('imputation done')
    full_imput_time = time.time() - time_cond
    print(f'Imputation time : {full_imput_time}')

    # Cells that differ in missingness between the scenario and the reference,
    # i.e. the values hidden by the scenario. Both arrays are the same for every
    # method, so they are computed once per scenario.
    indicating_mask_test = np.isnan(c[1]) ^ np.isnan(Test_ori)
    reference_values = np.nan_to_num(Test_ori)

    for i in imputed_datasets :
        print(f'---------{i[0]}---------')
        mae_test = calc_mae(i[1], reference_values, indicating_mask_test)
        rmse_test = calc_rmse(i[1], reference_values, indicating_mask_test)
        print(f'{c[0]} imputed with {i[0]} : MAE = {mae_test} / RMSE = {rmse_test}')
        results_mae[i[0]][c[0]] = mae_test
        results_rmse[i[0]][c[0]] = rmse_test
results_df_mae = pd.DataFrame(results_mae)
results_df_rmse = pd.DataFrame(results_rmse)

In [None]:
round(results_df_mae.T, 3)

In [None]:
round(results_df_rmse.T, 3)

### Per Feature

In [None]:
results_fr_mae = {method: {} for method in ['mean', 'median', 'imputation_average', 'fill', 'mice', 'mf', 'saits', 'brits', 'usgan', 'gpvae']}
results_hr_mae = {method: {} for method in ['mean', 'median', 'imputation_average', 'fill', 'mice', 'mf', 'saits', 'brits', 'usgan', 'gpvae']}
results_pam_mae = {method: {} for method in ['mean', 'median', 'imputation_average', 'fill', 'mice', 'mf', 'saits', 'brits', 'usgan', 'gpvae']}
results_pad_mae = {method: {} for method in ['mean', 'median', 'imputation_average', 'fill', 'mice', 'mf', 'saits', 'brits', 'usgan', 'gpvae']}
results_pas_mae = {method: {} for method in ['mean', 'median', 'imputation_average', 'fill', 'mice', 'mf', 'saits', 'brits', 'usgan', 'gpvae']}
results_sp02_mae = {method: {} for method in ['mean', 'median', 'imputation_average', 'fill', 'mice', 'mf', 'saits', 'brits', 'usgan', 'gpvae']}

In [None]:
results_fr_rmse = {method: {} for method in ['mean', 'median', 'imputation_average', 'fill', 'mice', 'mf', 'saits', 'brits', 'usgan', 'gpvae']}
results_hr_rmse = {method: {} for method in ['mean', 'median', 'imputation_average', 'fill', 'mice', 'mf', 'saits', 'brits', 'usgan', 'gpvae']}
results_pam_rmse = {method: {} for method in ['mean', 'median', 'imputation_average', 'fill', 'mice', 'mf', 'saits', 'brits', 'usgan', 'gpvae']}
results_pad_rmse = {method: {} for method in ['mean', 'median', 'imputation_average', 'fill', 'mice', 'mf', 'saits', 'brits', 'usgan', 'gpvae']}
results_pas_rmse = {method: {} for method in ['mean', 'median', 'imputation_average', 'fill', 'mice', 'mf', 'saits', 'brits', 'usgan', 'gpvae']}
results_sp02_rmse = {method: {} for method in ['mean', 'median', 'imputation_average', 'fill', 'mice', 'mf', 'saits', 'brits', 'usgan', 'gpvae']}

In [None]:
df_features_mae = [results_hr_mae,results_sp02_mae, results_fr_mae,  results_pad_mae, results_pam_mae, results_pas_mae]

In [None]:
df_features_rmse = [results_hr_rmse,results_sp02_rmse, results_fr_rmse,  results_pad_rmse, results_pam_rmse, results_pas_rmse]

In [None]:

# Per-feature MAE / RMSE, computed in the original (unscaled) units.
unscaled_df = test_data.drop(['encounterId','intervalle'], axis=1).to_numpy()
scaler= StandardScaler().fit(unscaled_df)

# NOTE(review): the i-th entry of df_features_mae / df_features_rmse receives the
# metrics of column i of the scaled arrays; confirm that the six names used for
# those entries match the actual column order of the data.
for c in conditions :

    descaled_cond = scaler.inverse_transform(c[1].reshape(-1, n_features))

    print(f'----------------------------{c[0]}----------------------------')
    imputation_saits = saits.impute({'X':c[1]})
    imputation_brits = brits.impute({'X':c[1]})
    imputation_usgan = usgan.impute({'X':c[1]})
    imputation_gpvae = gpvae.impute({'X':c[1]}).mean(axis=1)
    imputation_mice = impute_with_2d_model(c[1], imputer_mice)
    imputation_mf = impute_with_2d_model(c[1], imputer_mf)
    imputation_fill = fill_missing(c[1])
    imputation_average_or_fill = lin_interpol(c[1])
    imputation_mean = impute_with_statistic(c[1])
    imputation_median = impute_with_statistic(c[1], method='median')

    imputed_datasets = [
        ('fill', imputation_fill),
        ('mean', imputation_mean),
        ('median', imputation_median),
        ('imputation_average', imputation_average_or_fill),
        ('mice', imputation_mice),
        ('mf', imputation_mf),
        ('saits', imputation_saits),
        ('brits', imputation_brits),
        ('usgan', imputation_usgan),
        ('gpvae', imputation_gpvae)
    ]
    print('imputation done')

    # Undo the scaling once per method; the result does not depend on the feature
    # being scored, so it is no longer recomputed inside the feature loop.
    descaled_imputations = [
        (name, scaler.inverse_transform(values.reshape(-1, n_features)))
        for name, values in imputed_datasets
    ]

    for idx, (feature_mae, feature_rmse) in enumerate(zip(df_features_mae, df_features_rmse)) :
        # Values of this column hidden by the scenario but present in the test data
        indicating_mask_test = np.isnan(descaled_cond[:,idx]) ^ np.isnan(unscaled_df[:,idx])
        reference_values = np.nan_to_num(unscaled_df[:,idx])

        for name, imputed_descaled in descaled_imputations :
            mae_test = calc_mae(imputed_descaled[:,idx], reference_values, indicating_mask_test)
            rmse_test = calc_rmse(imputed_descaled[:,idx], reference_values, indicating_mask_test)

            feature_mae[name][c[0]] = mae_test
            feature_rmse[name][c[0]] = rmse_test


### Save results

In [None]:
feature_index = ['heart_rate', 'spo2', 'fr', 'pad', 'pam', 'pas']
for idx, i in enumerate(feature_index) :
    print(i)
    pd.DataFrame(df_features_mae[idx]).to_excel(OUTPUT_TABLE + f'mae_per_feature/feature_{i}_mae.xlsx')
    pd.DataFrame(df_features_rmse[idx]).to_excel(OUTPUT_TABLE + f'rmse_per_feature/feature_{i}_rmse.xlsx')


In [None]:
results_df_mae.T.to_excel(OUTPUT_TABLE + 'results_global_imputation_mae.xlsx')
results_df_rmse.T.to_excel(OUTPUT_TABLE + 'results_global_imputation_rmse.xlsx')

# Impute Dataset (SAITS)

In [None]:
data_id_intervalle = data[['encounterId', 'intervalle']]
data_features = data.drop(columns=data_id_intervalle.columns)

In [None]:
data_features

In [None]:
def reshaped_inverse_scaler(original_dataset, imputed_dataset):
    """Map an imputed, standardised array back to the original units.

    A ``StandardScaler`` is fitted on the feature columns of ``original_dataset``
    and its inverse transform is applied to ``imputed_dataset`` flattened to 2D.

    Args:
        original_dataset (pd.DataFrame): Frame with 'encounterId', 'intervalle' and
            the feature columns, in the unscaled units.
        imputed_dataset (numpy.ndarray): Standardised array whose last axis holds
            the same features, in the same order.

    Returns:
        pd.DataFrame: One row per (sample, time step), one column per feature.
    """
    data_features = original_dataset.drop(columns=['encounterId', 'intervalle'])
    scaler = StandardScaler().fit(data_features.to_numpy())

    # Use the actual number of feature columns rather than a hard-coded width,
    # which breaks as soon as the feature set changes.
    flattened = imputed_dataset.reshape(-1, data_features.shape[1])
    data_imputed_reshaped = pd.DataFrame(scaler.inverse_transform(flattened), columns=data_features.columns)

    return data_imputed_reshaped

In [None]:
n_samples = int(data.shape[0]/48)
scaler = StandardScaler().fit(data_features.to_numpy())
data_transformed = scaler.transform(data_features)
data_reshaped = data_transformed.reshape(n_samples, 48, -1)

In [None]:
data_imputed = saits.impute({'X':data_reshaped})

In [None]:
data_imputed.shape

In [None]:
data_imputed_reshaped = pd.DataFrame(scaler.inverse_transform(data_imputed.reshape(-1,7)), columns=data_features.columns)

In [None]:
data_imputed_reshaped['encounterId'] = data_id_intervalle['encounterId']
data_imputed_reshaped['intervalle'] = data_id_intervalle['intervalle']

In [None]:
data_imputed_reshaped = data_imputed_reshaped[data.columns]

In [None]:
data_imputed_reshaped['gender'].value_counts()

In [None]:
data_imputed_reshaped[data_imputed_reshaped['intervalle'].isna()]

In [None]:
data_imputed_reshaped.to_parquet(OUTPUT_TABLE + 'first_48_with_static_imputed_saits.parquet')

# Analyse de la répétabilité

In [None]:
from pathlib import Path
from datetime import datetime

# Pre-processing: fit a scaler on the raw test features so that scenario arrays
# and imputations can be mapped back to original units.
unscaled_df = test_data.drop(['encounterId', 'intervalle'], axis=1).to_numpy()
scaler = StandardScaler().fit(unscaled_df)

test_data.to_parquet(OUTPUT_DATASET + 'original.parquet')

# Identifier columns are extracted as plain arrays so they are attached by
# POSITION. test_data is built by boolean filtering of `data`, so unless its
# index was reset it does not match the fresh 0..N-1 index of the frames built
# below; a label-aligned assignment would then fill the id columns with NaN or
# values from the wrong rows. A length mismatch now raises instead.
test_encounter_ids = test_data['encounterId'].to_numpy()
test_intervalles = test_data['intervalle'].to_numpy()

# Scenarios to evaluate: (label, array with artificially masked values).
# Presumably these arrays are standardised and shaped like the model input.
filtered_conditions = [
    ('Random', Test_mcar),
    ('Single_row', Test_single_row),
    ('Two_rows', Test_two_rows),
    ('Three_rows', Test_three_rows),
    ('pa_only', Test_pa)
]

datasets = {'lin_interpol': {}, 'saits': {}, 'mean': {}}
# (column index in the feature matrix, short name used for the output folder).
# NOTE(review): 'pam' is read from column 4 here, but the column list used for
# the parquet export below has 'pam' at position 3 and 'gender' at position 4.
# Confirm which one matches the real column order of test_data.
idx_list = [(0, 'hr'), (1, 'spo2'), (2, 'fr'), (4, 'pam')]

for scenario_name, scenario_data in filtered_conditions:
    print(f'Processing scenario: {scenario_name}')
    # Masked scenario mapped back to original units (NaN where values are hidden).
    descaled_cond = scaler.inverse_transform(scenario_data.reshape(-1, n_features))

    imputations = {
        'saits': saits.impute({'X': scenario_data}),
        'lin_interpol': lin_interpol(scenario_data),
        'mean': impute_with_statistic(scenario_data)
    }
    print('Imputation done')

    # Same inverse scaling for every method, preserving the method order.
    imputations_descaled = {
        method: scaler.inverse_transform(imputed.reshape(-1, n_features))
        for method, imputed in imputations.items()
    }

    # Export one imputed dataset per method, with the identifier columns re-attached.
    for method, descaled_values in imputations_descaled.items():
        df_parquet = pd.DataFrame(descaled_values, columns=['heart_rate', 'spo2', 'fr', 'pam',
       'gender', 'age', 'admission_type'])
        df_parquet['encounterId'] = test_encounter_ids
        df_parquet['intervalle'] = test_intervalles
        df_parquet.to_parquet(OUTPUT_DATASET + f'{scenario_name}_imputed_{method}.parquet')

    for idx, feature_name in idx_list:
        print(f'Processing feature: {feature_name}')
        # XOR of the two NaN masks keeps only the positions hidden by the
        # scenario, i.e. values that exist in the original data.
        indicating_mask_test = np.isnan(descaled_cond[:, idx]) ^ np.isnan(unscaled_df[:, idx])
        original_values = unscaled_df[:, idx][indicating_mask_test]

        # The output folder depends only on the feature: create it once per feature.
        output_dir = Path(OUTPUT_TABLE) / f'{feature_name}_comparaison'
        output_dir.mkdir(parents=True, exist_ok=True)

        for method, imputed_data in imputations_descaled.items():
            imputed_values = imputed_data[:, idx][indicating_mask_test]

            # Paired true / imputed values plus their mean and difference.
            df = pd.DataFrame({
                'masquées': original_values,
                'imputées': imputed_values
            })
            df['moyenne'] = (df['masquées'] + df['imputées']) / 2
            df['différence'] = df['imputées'] - df['masquées']

            file_name = f'{scenario_name}_{method}.xlsx'
            df.to_excel(output_dir / file_name, index=False)

print('Processing complete!')


In [None]:
from pathlib import Path
from datetime import datetime
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Pre-processing: fit a scaler on the raw test features so that scenario arrays
# and imputations can be mapped back to original units.
unscaled_df = test_data.drop(['encounterId', 'intervalle'], axis=1).to_numpy()
scaler = StandardScaler().fit(unscaled_df)

test_data.to_parquet(OUTPUT_DATASET + 'original.parquet')

# Identifier columns are extracted as plain arrays so they are attached by
# POSITION. test_data is built by boolean filtering of `data`, so unless its
# index was reset it does not match the fresh 0..N-1 index of the frames built
# below; a label-aligned assignment would then fill the id columns with NaN or
# values from the wrong rows. A length mismatch now raises instead.
test_encounter_ids = test_data['encounterId'].to_numpy()
test_intervalles = test_data['intervalle'].to_numpy()

# Scenarios to evaluate: (label, array with artificially masked values).
# Presumably these arrays are standardised and shaped like the model input.
filtered_conditions = [
    ('Random', Test_mcar),
    ('Single_row', Test_single_row),
    ('Two_rows', Test_two_rows),
    ('Three_rows', Test_three_rows),
    ('pa_only', Test_pa)
]

datasets = {'lin_interpol': {}, 'saits': {}, 'mean': {}}
# (column index in the feature matrix, short name used for the output folder).
# NOTE(review): 'pam' is read from column 4 here, but the column list used for
# the parquet export below has 'pam' at position 3 and 'gender' at position 4.
# Confirm which one matches the real column order of test_data.
idx_list = [(0, 'hr'), (1, 'spo2'), (2, 'fr'), (4, 'pam')]

# Outer progress bar: one step per scenario.
for scenario_name, scenario_data in tqdm(filtered_conditions, desc="Processing Scenarios"):
    tqdm.write(f'→ Scénario en cours : {scenario_name}')
    # Masked scenario mapped back to original units (NaN where values are hidden).
    descaled_cond = scaler.inverse_transform(scenario_data.reshape(-1, n_features))

    imputations = {
        'saits': saits.impute({'X': scenario_data}),
        'lin_interpol': lin_interpol(scenario_data),
        'mean': impute_with_statistic(scenario_data)
    }

    tqdm.write(f'✔ Imputation terminée pour {scenario_name}')

    # Same inverse scaling for every method, preserving the method order.
    imputations_descaled = {
        method: scaler.inverse_transform(imputed.reshape(-1, n_features))
        for method, imputed in imputations.items()
    }

    # Export one imputed dataset per method, with the identifier columns re-attached.
    for method, descaled_values in imputations_descaled.items():
        df_parquet = pd.DataFrame(descaled_values, columns=['heart_rate', 'spo2', 'fr', 'pam','gender', 'age', 'admission_type'])
        df_parquet['encounterId'] = test_encounter_ids
        df_parquet['intervalle'] = test_intervalles
        df_parquet.to_parquet(OUTPUT_DATASET + f'{scenario_name}_imputed_{method}.parquet')

    # Inner progress bar: one step per evaluated feature.
    for idx, feature_name in tqdm(idx_list, desc=f"Processing Features for {scenario_name}", leave=False):
        # XOR of the two NaN masks keeps only the positions hidden by the
        # scenario, i.e. values that exist in the original data.
        indicating_mask_test = np.isnan(descaled_cond[:, idx]) ^ np.isnan(unscaled_df[:, idx])
        original_values = unscaled_df[:, idx][indicating_mask_test]

        # The output folder depends only on the feature: create it once per feature.
        output_dir = Path(OUTPUT_TABLE) / f'{feature_name}_comparaison'
        output_dir.mkdir(parents=True, exist_ok=True)

        for method, imputed_data in imputations_descaled.items():
            imputed_values = imputed_data[:, idx][indicating_mask_test]

            # Paired true / imputed values plus their mean and difference.
            df = pd.DataFrame({
                'masquées': original_values,
                'imputées': imputed_values
            })
            df['moyenne'] = (df['masquées'] + df['imputées']) / 2
            df['différence'] = df['imputées'] - df['masquées']

            file_name = f'{scenario_name}_{method}.xlsx'
            df.to_excel(output_dir / file_name, index=False)

tqdm.write('✔ Traitement complet ! 🚀')
