# Imports

In [None]:
import polars as pl
import pandas as pd
import numpy as np
import time
import json

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from pypots.imputation import SAITS, BRITS, USGAN, GPVAE
from pygrinder import mcar
from pypots.utils.metrics import calc_mae, calc_rmse

In [None]:
import warnings

warnings.filterwarnings("ignore")

# Configuration

In [None]:
# Read the run parameters from the JSON file one directory up.
with open('../params.json', 'r') as file:
    params = json.load(file)

# Dataset name, pipeline version and root data directory used to build paths.
DATASET = params['dataset']
VERSION = params['version']
DATA_FOLD = params['data_folder']

In [None]:
print(DATASET)

In [None]:
fit_model = False

In [None]:
np.random.seed(42)

In [None]:
# Input/output locations, all rooted at <data_folder>/<version>/.
# Input table for this notebook.
DATA_FOLDER = f'{DATA_FOLD}/{VERSION}/3.analysis/imputation_48/{DATASET}/'
DATA_FILE = 'first_48h.parquet'
# Where the deep-learning imputation models are stored.
MODEL_FOLDER = f'{DATA_FOLD}/{VERSION}/4.models/imputation/{DATASET}/'
# Result tables (Excel / parquet) and exported test datasets.
OUTPUT_TABLE = f'{DATA_FOLD}/{VERSION}/3.analysis/imputation_48/{DATASET}/tables/'
OUTPUT_DATASET = f'{DATA_FOLD}/{VERSION}/3.analysis/imputation_48/{DATASET}/test_datasets/'

# Import des données

In [None]:
# Load the input table with polars, convert it to pandas and drop three
# columns that are not kept for the rest of this notebook
# ('__index_level_0__' is presumably a leftover pandas index — TODO confirm).
first_48h = pl.read_parquet(DATA_FOLDER + DATA_FILE).to_pandas().drop(columns=['total_missing', 'max_valid_interval', '__index_level_0__'])

In [None]:
data = first_48h

In [None]:
data.head()

# Création des datasets de train/test/validation

In [None]:
# Split patients (not rows) into train / test / validation so that all rows of
# one encounter end up in the same subset: 70% train, then the remaining 30%
# is halved into test and validation. Both splits use a fixed random_state.
patient_ids = data['encounterId'].unique()
train_ids, test_val_ids = train_test_split(patient_ids, test_size=0.3, random_state=42)
test_ids, val_ids = train_test_split(test_val_ids, test_size=0.5, random_state=42)

In [None]:
patient_ids.shape[0]

In [None]:
# Select the rows of each subset by encounter id. The resulting frames keep
# the row labels of `data`, so their index is not a contiguous 0..n-1 range.
train_data = data[data['encounterId'].isin(train_ids)]
test_data = data[data['encounterId'].isin(test_ids)]
val_data = data[data['encounterId'].isin(val_ids)]

In [None]:
# Sanity check: list the test patients for whom at least one variable is
# missing on every row.

# Build the NaN mask first and group it by patient afterwards. The previous
# groupby(...).apply(...) dropped "encounterId" inside the applied function,
# which raises a KeyError on pandas versions that no longer pass the grouping
# column to apply.
_value_is_missing = test_data.drop(columns=["encounterId", "intervalle"]).isnull()
patients_with_missing_vars = (
    _value_is_missing.groupby(test_data["encounterId"])
    .all()          # per patient and variable: is every row missing?
    .any(axis=1)    # per patient: does this hold for at least one variable?
)

# Keep only the identifiers of the affected patients.
patients_with_missing_vars = patients_with_missing_vars[patients_with_missing_vars].index.tolist()

len(patients_with_missing_vars)

In [None]:
print("Train shape:", train_data.shape)
print("Test shape:", test_data.shape)
print("Validation shape:", val_data.shape)

# Création des scénarios de données manquantes

### Données manquantes aléatoires

In [None]:
def random_mask(df, rate=0.3):
    """Mask observed values at random until at least ``rate`` of them are gone.

    ``mcar`` is applied repeatedly, halving its probability after each pass,
    until the number of non-NaN entries is at most ``(1 - rate)`` times the
    number that was observed on entry. The final share of masked values can
    therefore exceed ``rate``.

    Args:
        df (numpy.ndarray): Array that may already contain NaN values.
        rate (float): Minimum share of the initially observed values to mask.

    Returns:
        numpy.ndarray: The masked array. When no masking is needed the input
        object itself is returned.
    """
    # Counting the observed entries does not require copying the array.
    n_observed = np.count_nonzero(~np.isnan(df))
    target_size = n_observed * (1 - rate)
    while np.count_nonzero(~np.isnan(df)) > target_size:
        df = mcar(df, p=rate)
        # Smaller steps on later passes limit how far the target is overshot.
        rate = rate / 2
    return df

### Données manquantes sur n timestamps consécutifs, pour tout ou partie des variables

In [None]:

def remove_timestamp(array : np.ndarray, rate : float=0.3, n_timestamp : int=1, n_features : list = None) -> np.ndarray:
    """Blank out windows of consecutive time steps for randomly chosen patients.

    ``int(n_patients * n_steps * rate / n_timestamp)`` windows are drawn. Each
    window starts at a random time step of a random patient and sets the
    selected features to NaN for up to ``n_timestamp`` steps (fewer when the
    window reaches the end of the series). Windows may overlap.

    Args:
        array: 3D array indexed as (patient, time step, feature). It is
            modified IN PLACE.
        rate: Controls how many windows are drawn (see formula above).
        n_timestamp: Length of each window in time steps.
        n_features: Indices of the features to blank. Defaults to ``range(5)``.

    Returns:
        The same array object, with the windows set to NaN.
    """
    n_patients, n_steps = array.shape[0], array.shape[1]
    n_remove = int((n_patients * n_steps * rate) / n_timestamp)

    if n_features is None:
        # NOTE(review): range(5) leaves the feature at index 5 untouched when
        # the array has six features — confirm this is intended.
        n_features = range(5)

    for _ in range(n_remove):
        # Draw order (time step first, then patient) is kept so that a seeded
        # NumPy generator is consumed exactly as before.
        intervalle_index = np.random.randint(0, n_steps)
        encounter_index = np.random.randint(0, n_patients)
        # The slice end is exclusive, so it must be clamped to n_steps (not
        # n_steps - 1); otherwise the last time step can never be masked.
        max_intervalle = min(intervalle_index + n_timestamp, n_steps)

        array[encounter_index, intervalle_index:max_intervalle, n_features] = np.nan

    return array

# Préparation du jeu de données

In [None]:
def prepared_dataset(df, mask=None, rate : float=0.3, n_timestamp : int=1, n_features : list = None, n_steps : int = 48) :
    """Standardise a long-format table, reshape it to 3D and optionally mask it.

    Steps:
    - drop the 'encounterId' and 'intervalle' columns;
    - standardise the remaining columns with a StandardScaler fitted on ``df``
      itself (each call fits its own scaler);
    - reshape to (n_patients, n_steps, n_features), which relies on the rows
      being grouped by patient with exactly ``n_steps`` rows each;
    - apply the requested masking function.

    Args:
        df (pd.DataFrame): Input data with 'encounterId' and 'intervalle' columns.
        mask (function): ``random_mask``, ``remove_timestamp`` or None (no masking).
        rate (float): Passed to the masking function as its ``rate`` argument.
        n_timestamp (int): Window length, used only by ``remove_timestamp``.
        n_features (list): Feature indices to mask, used only by ``remove_timestamp``.
        n_steps (int): Number of rows (time steps) per patient. Defaults to 48.

    Returns:
        numpy.ndarray: Standardised, possibly masked, 3D array.

    Raises:
        ValueError: If ``mask`` is not one of the supported functions, or if
            the number of rows is not ``n_patients * n_steps``.
    """
    n_samples = df['encounterId'].unique().shape[0]
    dropped_df = df.drop(['encounterId','intervalle'], axis=1)

    # Without this check a wrong row count can either fail inside reshape or,
    # worse, silently produce an array with the wrong number of features.
    if dropped_df.shape[0] != n_samples * n_steps:
        raise ValueError(
            f"Expected {n_steps} rows for each of the {n_samples} patients, "
            f"got {dropped_df.shape[0]} rows."
        )

    standardized = StandardScaler().fit_transform(dropped_df.to_numpy())
    reshaped = standardized.reshape(n_samples, n_steps, -1)

    if mask is None:
        masked = reshaped
    elif mask is random_mask:
        masked = random_mask(reshaped, rate)
    elif mask is remove_timestamp:
        # remove_timestamp works in place; `reshaped` is a fresh array, so the
        # caller's data is not affected.
        masked = remove_timestamp(reshaped, rate, n_timestamp, n_features)
    else:
        raise ValueError("Aucune correspondance concernant la fonction de masquage.")

    return masked

In [None]:
# Build the 3D arrays used below. Every call standardises its input with a
# scaler fitted on that input, so the three test_data-based families share the
# same scaling, while train / validation / test are each scaled independently.
# Unmasked and randomly masked train / validation sets.
Train = prepared_dataset(train_data)
Train_mcar = prepared_dataset(train_data, mask=random_mask, rate=0.3)
Val = prepared_dataset(val_data)
Val_mcar = prepared_dataset(val_data, mask=random_mask, rate=0.3)
# Test reference (no artificial masking) and the missing-data scenarios.
Test_ori = prepared_dataset(test_data, mask=None)
Test_mcar = prepared_dataset(test_data, mask=random_mask, rate=0.3)
# Windows of 1, 2 or 3 consecutive intervals removed on the default feature set.
Test_single_row = prepared_dataset(test_data, mask=remove_timestamp, rate=0.3) # whole intervals removed
Test_two_rows = prepared_dataset(test_data, mask=remove_timestamp, rate=0.3, n_timestamp=2)
Test_three_rows = prepared_dataset(test_data, mask=remove_timestamp, rate=0.3, n_timestamp=3)
# Windows of 4 consecutive intervals removed on selected feature indices only.
Test_hr = prepared_dataset(test_data, mask=remove_timestamp, rate=0.3, n_timestamp=4, n_features=[0])
Test_sp02 = prepared_dataset(test_data, mask=remove_timestamp, rate=0.3, n_timestamp=4, n_features=[1])
Test_fr = prepared_dataset(test_data, mask=remove_timestamp, rate=0.3, n_timestamp=4, n_features=[2])
Test_pa = prepared_dataset(test_data, mask=remove_timestamp, rate=0.3, n_timestamp=4, n_features=[3,4,5])


In [None]:
Train.shape

# Définition des méthodes d'imputation

## Pypots models

In [None]:
# Training payload handed to the PyPOTS models' fit().
# NOTE(review): "missing_mask" holds the randomly masked arrays rather than a
# boolean mask, and the validation data is nested under "val_data"; confirm
# against the PyPOTS documentation that fit() reads these keys.
datasets = {
    "X": Train,
    "missing_mask": Train_mcar,
    "val_data" : {
        "X": Val,
        "missing_mask": Val_mcar
    }
}

In [None]:
# Settings shared by all PyPOTS models below.
n_steps = 48        # time steps per patient (matches the 3D reshape above)
n_features = 6      # number of variables per time step
device = "cuda"     # requires a GPU
n_epochs = 30

In [None]:
Train.shape == Train_mcar.shape

### Entrainement SAITS

https://github.com/WenjieDu/SAITS

In [None]:
Train.shape

In [None]:
# SAITS imputation model; the shared settings (n_steps, n_features, n_epochs,
# device) are defined earlier in the notebook.
saits = SAITS(
    n_steps=n_steps, n_features=n_features,
    n_layers=3, d_model=512, d_ffn=128, n_heads=8, d_k=64, d_v=64,
    dropout=0.1,
    epochs=n_epochs,
    device=device,
    saving_path= MODEL_FOLDER + 'saits/model.pth',  # location given to PyPOTS for its training outputs
    diagonal_attention_mask = True
    )

In [None]:
# Reuse previously saved weights when available; train only when fit_model is set.
model_path_saits = MODEL_FOLDER + "saits/saits_two_days_with_val.pypots"
try :
    saits.load(model_path_saits)
except AssertionError :
    # Presumably what PyPOTS raises when the file does not exist — TODO confirm.
    # The notebook then carries on with whatever weights the model currently has.
    print('model not found')
    pass
if fit_model :
    # NOTE(review): `datasets` nests the validation arrays under "val_data";
    # confirm that fit() actually reads them from a single dict.
    saits.fit(datasets)

In [None]:
# Persist only freshly trained weights. Saving unconditionally would write an
# untrained model to disk whenever loading failed and fit_model is False; the
# next run would then load that file as if it were a trained checkpoint.
if fit_model:
    saits.save(model_path_saits, overwrite=True)

### BRITS

In [None]:
# BRITS imputation model, using the shared settings defined earlier in the notebook.
brits = BRITS(
    n_steps=n_steps, 
    n_features=n_features, 
    rnn_hidden_size=128, 
    epochs=n_epochs, 
    device=device,
    saving_path= MODEL_FOLDER + 'brits/model.pth'
    )

In [None]:
# Reuse previously saved weights when available; train only when fit_model is set.
model_path_brits = MODEL_FOLDER + "brits/brits_two_days_with_val.pypots"
try :
    brits.load(model_path_brits)
except AssertionError :
    # Presumably what PyPOTS raises when the file does not exist — TODO confirm.
    # The notebook then carries on with whatever weights the model currently has.
    print('model not found')
    pass
if fit_model :
    # NOTE(review): confirm that fit() reads the nested "val_data" entry of `datasets`.
    brits.fit(datasets)

In [None]:
# Persist only freshly trained weights. Saving unconditionally would write an
# untrained model to disk whenever loading failed and fit_model is False; the
# next run would then load that file as if it were a trained checkpoint.
if fit_model:
    brits.save(model_path_brits, overwrite=True)

### USGAN

In [None]:
# USGAN imputation model, using the shared settings defined earlier in the notebook.
usgan = USGAN(
    n_steps=n_steps, 
    n_features=n_features, 
    epochs=n_epochs, 
    device=device, 
    rnn_hidden_size=128,
    saving_path= MODEL_FOLDER + 'usgan/model.pth'
    )

In [None]:
# Reuse previously saved weights when available; train only when fit_model is set.
model_path_usgan = MODEL_FOLDER + "usgan/usgan_two_days_with_val.pypots"
try :
    usgan.load(model_path_usgan)
except AssertionError :
    # Presumably what PyPOTS raises when the file does not exist — TODO confirm.
    # The notebook then carries on with whatever weights the model currently has.
    print('model not found')
    pass
if fit_model :
    # NOTE(review): confirm that fit() reads the nested "val_data" entry of `datasets`.
    usgan.fit(datasets)

In [None]:
# Persist only freshly trained weights. Saving unconditionally would write an
# untrained model to disk whenever loading failed and fit_model is False; the
# next run would then load that file as if it were a trained checkpoint.
if fit_model:
    usgan.save(model_path_usgan, overwrite=True)

### GPVAE

In [None]:
# GP-VAE imputation model, using the shared settings defined earlier in the notebook.
gpvae = GPVAE(
    n_steps=n_steps, 
    n_features=n_features, 
    epochs=n_epochs, 
    device=device, 
    latent_size=64,
    saving_path= MODEL_FOLDER + 'gpvae/model.pth'
    )

In [None]:
# Reuse previously saved weights when available; train only when fit_model is set.
model_path_gpvae = MODEL_FOLDER + "gpvae/gpvae_two_days_with_val.pypots"
try :
    gpvae.load(model_path_gpvae)
except AssertionError :
    # Presumably what PyPOTS raises when the file does not exist — TODO confirm.
    # The notebook then carries on with whatever weights the model currently has.
    print('model not found')
    pass
if fit_model :
    # NOTE(review): confirm that fit() reads the nested "val_data" entry of `datasets`.
    gpvae.fit(datasets)

In [None]:
# Persist only freshly trained weights. Saving unconditionally would write an
# untrained model to disk whenever loading failed and fit_model is False; the
# next run would then load that file as if it were a trained checkpoint.
if fit_model:
    gpvae.save(model_path_gpvae, overwrite=True)

## Forward/Backward Fill

In [None]:
def fill_missing(df):
    """Forward-fill then backward-fill every (patient, feature) series.

    Args:
        df (numpy.ndarray): 3D array (patients, timestamps, features) that may
            contain NaN values. It is not modified.

    Returns:
        numpy.ndarray: New array of the same shape. Gaps take the last
        observed value of the series; leading gaps take the first observed
        value. A series with no observed value stays entirely NaN.
    """
    values = np.copy(df)
    n_patients, n_steps, n_feats = values.shape

    # Lay the data out as one column per (patient, feature) series so that a
    # single DataFrame fill along the time axis handles all series at once,
    # instead of building one pandas object per series.
    columns = np.moveaxis(values, 1, 0).reshape(n_steps, n_patients * n_feats)
    filled_columns = pd.DataFrame(columns).ffill().bfill().to_numpy()

    # Restore the (patients, timestamps, features) layout as a contiguous array.
    filled_data = np.moveaxis(
        filled_columns.reshape(n_steps, n_patients, n_feats), 0, 1
    ).copy()
    return filled_data

## Interpolation linéaire

In [None]:
def lin_interpol(df):
    """Fill gaps with the average of the neighbouring values, then ffill/bfill.

    For every (patient, feature) series, missing positions are visited in
    increasing order:

    1. If a valid value exists both before and after the position, it is set
       to the average of the closest one on each side. Values filled earlier
       in the same pass count as valid, so inside a longer gap each position
       uses the value just computed on its left.
    2. Remaining trailing gaps are forward-filled.
    3. Remaining leading gaps are backward-filled.

    Args:
        df (numpy.ndarray): 3D array (patients, timestamps, features) that may
            contain NaN values. It is not modified.

    Returns:
        numpy.ndarray: Copy of the input with the missing values filled.
    """
    filled_data = np.copy(df)
    n_patients = filled_data.shape[0]
    n_feats = filled_data.shape[2]

    for patient in range(n_patients):
        for feature in range(n_feats):
            series = pd.Series(filled_data[patient, :, feature])

            # Positions that are missing before any filling takes place.
            missing_positions = np.flatnonzero(series.isna().to_numpy())

            for pos in missing_positions:
                before = series.iloc[:pos].last_valid_index()
                after = series.iloc[pos + 1:].first_valid_index()
                if before is None or after is None:
                    # Leading/trailing gap: left to the fills below.
                    continue
                series.iloc[pos] = (series.iloc[before] + series.iloc[after]) / 2

            # Trailing gaps first, then leading gaps.
            series = series.ffill().bfill()

            filled_data[patient, :, feature] = series.to_numpy()

    return filled_data


## Imputation par la moyenne/médiane

In [None]:
def impute_with_statistic(df, method="mean"):
    """Fill each patient's missing values with that patient's own mean or median.

    The statistic is computed per (patient, feature) series over its observed
    values. When a series has no observed value at all, the statistic of that
    feature over all patients is used instead; if that is undefined too, the
    values stay NaN.

    The previous implementation wrote the first patient's statistic into the
    missing values of every patient, because it assigned to the whole feature
    slice instead of the current patient's series.

    Args:
        df (numpy.ndarray): 3D array (patients, timestamps, features) that may
            contain NaN values. It is not modified.
        method (str): "mean" or "median".

    Returns:
        numpy.ndarray: Copy of the input with the missing values filled.

    Raises:
        ValueError: If ``method`` is neither "mean" nor "median".
    """
    if method == "mean":
        statistic = np.nanmean
    elif method == "median":
        statistic = np.nanmedian
    else:
        raise ValueError("Méthode non reconnue. Utilisez 'mean' ou 'median'.")

    filled_data = np.copy(df)
    missing = np.isnan(filled_data)
    if not missing.any():
        return filled_data

    with warnings.catch_warnings():
        # All-NaN series legitimately yield NaN here; the fallback below
        # handles them, so the "empty slice" warnings carry no information.
        warnings.simplefilter("ignore", category=RuntimeWarning)
        per_patient = statistic(filled_data, axis=1, keepdims=True)
        per_feature = statistic(filled_data, axis=(0, 1), keepdims=True)

    # Shape (patients, 1, features): own statistic, or the feature-wide one.
    fill_values = np.where(np.isnan(per_patient), per_feature, per_patient)
    filled_data[missing] = np.broadcast_to(fill_values, filled_data.shape)[missing]

    return filled_data


https://stackoverflow.com/questions/58613108/imputing-missing-values-using-sklearn-iterativeimputer-class-for-mice  
https://github.com/wendyminai/APPROACHES-TO-MISSING-DATA-IN-TIME-SERIES-

## Imputations 2D

In [None]:
# Flatten the training array to 2D (one row per patient and time step) for the
# tabular imputers below. Note that this rebinds the notebook-level
# `n_features` to the last dimension of Train.
n_features = Train.shape[2]
n_timestamps = Train.shape[1]
train_samples = Train.shape[0]

train_flatten = Train.reshape(-1, n_features)

### Imputation par MICE

https://stackoverflow.com/questions/58613108/imputing-missing-values-using-sklearn-iterativeimputer-class-for-mice  
https://github.com/wendyminai/APPROACHES-TO-MISSING-DATA-IN-TIME-SERIES-

In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [None]:
# Fit scikit-learn's IterativeImputer (MICE-style) on the flattened training
# rows; each row is a single time step, so no temporal context is used.

imputer_mice = IterativeImputer(max_iter=30, random_state=42)
imputer_mice.fit(train_flatten)


### Imputation par KNNImputer

In [None]:
from sklearn.impute import KNNImputer

In [None]:
# Fit a 2-nearest-neighbour imputer on the flattened training rows.
# NOTE(review): imputer_knn is not used in the evaluation loops visible in
# this notebook — confirm whether it should be added or removed.
imputer_knn = KNNImputer(n_neighbors=2)
imputer_knn.fit(train_flatten)

### Imputation par MissForest

In [None]:
from missforest import MissForest

In [None]:
# Fit a MissForest imputer with its default settings on the flattened training rows.
imputer_mf = MissForest()
imputer_mf.fit(train_flatten)

### Impute 3darray with 2d model

In [None]:
def impute_with_2d_model(df, model):
    """Impute a 3D array with a fitted tabular (2D) imputer.

    The array is flattened to one row per (sample, time step), wrapped in a
    DataFrame, passed to ``model.transform`` and reshaped back to 3D.

    Args:
        df (numpy.ndarray): 3D array (samples, timestamps, features).
        model: Fitted object exposing ``transform``; it may return either a
            DataFrame or a NumPy array.

    Returns:
        numpy.ndarray: Imputed array with the same shape as ``df``.
    """
    original_shape = (df.shape[0], df.shape[1], df.shape[2])
    feature_count = original_shape[2]

    # One row per (sample, time step).
    table = pd.DataFrame(df.reshape(-1, feature_count))

    imputed = model.transform(table)
    # Some imputers hand back a DataFrame; normalise to an array.
    imputed = imputed.to_numpy() if isinstance(imputed, pd.DataFrame) else imputed

    return imputed.reshape(*original_shape)

# Script

### Conditions valeurs manquantes

In [None]:
# (label, masked test array) pairs: one entry per missing-data scenario to evaluate.
conditions = [('Random' , Test_mcar), 
              ('Single_row' , Test_single_row), ('Two_rows', Test_two_rows), ('Three_rows', Test_three_rows), ('fr_only',Test_fr),
              ('hr_only',Test_hr), ('pa_only',Test_pa),('spO2_only',Test_sp02)]

In [None]:
condition_test = [('Random' , Test_mcar)]

## Boucles méthodes d'imputation et scénarios

In [None]:
# One independent, initially empty {scenario: score} mapping per imputation
# method, for each of the two metrics.
_IMPUTATION_METHODS = (
    'mean', 'median', 'imputation_average', 'fill', 'mice',
    'mf', 'saits', 'brits', 'usgan', 'gpvae',
)
results_mae = {name: {} for name in _IMPUTATION_METHODS}
results_rmse = {name: {} for name in _IMPUTATION_METHODS}

### Global

In [None]:
# Impute every missing-data scenario with every method, report how long each
# method takes, and score the imputations (MAE / RMSE) on the artificially
# masked entries only.

time_start = time.time()


def _timed(impute):
    """Call ``impute()`` and return ``(result, elapsed_seconds)``.

    Each duration is measured around its own call. The previous code
    subtracted the preceding method's duration from the current wall-clock
    time, which does not yield a duration.
    """
    started = time.time()
    result = impute()
    return result, time.time() - started


# Reference values are the same for every scenario and method. NaN (originally
# missing) entries are zeroed; they are excluded by the mask anyway.
_reference_values = np.nan_to_num(Test_ori)

for c in conditions :
    time_cond = time.time()
    print(f'----------------------------{c[0]}----------------------------')
    imputation_saits, saits_time = _timed(lambda: saits.impute({'X':c[1]}))
    print(f'Saits time : {saits_time}')
    imputation_brits, brits_time = _timed(lambda: brits.impute({'X':c[1]}))
    print(f'Brits time : {brits_time}')
    imputation_usgan, usgan_time = _timed(lambda: usgan.impute({'X':c[1]}))
    print(f'Usgan time : {usgan_time}')
    # The GP-VAE output is averaged over axis 1 before scoring.
    imputation_gpvae, gpvae_time = _timed(lambda: gpvae.impute({'X':c[1]}).mean(axis=1))
    print(f'Gpvae time : {gpvae_time}')
    imputation_mice, mice_time = _timed(lambda: impute_with_2d_model(c[1], imputer_mice))
    print(f'Mice time : {mice_time}')
    imputation_mf, mf_time = _timed(lambda: impute_with_2d_model(c[1], imputer_mf))
    print(f'Mf time : {mf_time}')
    imputation_fill, fill_time = _timed(lambda: fill_missing(c[1]))
    print(f'Fill time : {fill_time}')
    imputation_average_or_fill, average_time = _timed(lambda: lin_interpol(c[1]))
    print(f'Average time : {average_time}')
    imputation_mean, mean_time = _timed(lambda: impute_with_statistic(c[1]))
    print(f'Mean time : {mean_time}')
    imputation_median, median_time = _timed(lambda: impute_with_statistic(c[1], method='median'))
    print(f'Median time : {median_time}')
    imputed_datasets = [
        ('fill', imputation_fill),
        ('mean', imputation_mean),
        ('median', imputation_median),
        ('imputation_average', imputation_average_or_fill),
        ('mice', imputation_mice),
        ('mf', imputation_mf),
        ('saits', imputation_saits),
        ('brits', imputation_brits),
        ('usgan', imputation_usgan),
        ('gpvae', imputation_gpvae)
    ]
    print('imputation done')
    full_imput_time = time.time() - time_cond
    print(f'Imputation time : {full_imput_time}')

    # Entries that are NaN in the scenario but observed in the reference,
    # i.e. the artificially masked values. Identical for every method.
    indicating_mask_test = np.isnan(c[1]) ^ np.isnan(Test_ori)
    for i in imputed_datasets :
        print(f'---------{i[0]}---------')
        mae_test = calc_mae(i[1], _reference_values, indicating_mask_test)
        rmse_test = calc_rmse(i[1], _reference_values, indicating_mask_test)
        print(f'{c[0]} imputed with {i[0]} : MAE = {mae_test} / RMSE = {rmse_test}')
        results_mae[i[0]][c[0]] = mae_test
        results_rmse[i[0]][c[0]] = rmse_test
results_df_mae = pd.DataFrame(results_mae)
results_df_rmse = pd.DataFrame(results_rmse)

In [None]:
round(results_df_mae.T, 3)

In [None]:
round(results_df_rmse.T, 3)

### Per Feature

In [None]:
# Per-feature MAE containers: for each variable, one independent and initially
# empty {scenario: score} mapping per imputation method.
_MAE_METHODS = (
    'mean', 'median', 'imputation_average', 'fill', 'mice',
    'mf', 'saits', 'brits', 'usgan', 'gpvae',
)
results_fr_mae = {name: {} for name in _MAE_METHODS}
results_hr_mae = {name: {} for name in _MAE_METHODS}
results_pam_mae = {name: {} for name in _MAE_METHODS}
results_pad_mae = {name: {} for name in _MAE_METHODS}
results_pas_mae = {name: {} for name in _MAE_METHODS}
results_sp02_mae = {name: {} for name in _MAE_METHODS}

In [None]:
# Per-feature RMSE containers: for each variable, one independent and initially
# empty {scenario: score} mapping per imputation method.
_RMSE_METHODS = (
    'mean', 'median', 'imputation_average', 'fill', 'mice',
    'mf', 'saits', 'brits', 'usgan', 'gpvae',
)
results_fr_rmse = {name: {} for name in _RMSE_METHODS}
results_hr_rmse = {name: {} for name in _RMSE_METHODS}
results_pam_rmse = {name: {} for name in _RMSE_METHODS}
results_pad_rmse = {name: {} for name in _RMSE_METHODS}
results_pas_rmse = {name: {} for name in _RMSE_METHODS}
results_sp02_rmse = {name: {} for name in _RMSE_METHODS}

In [None]:
df_features_mae = [results_hr_mae,results_sp02_mae, results_fr_mae,  results_pad_mae, results_pam_mae, results_pas_mae]

In [None]:
df_features_rmse = [results_hr_rmse,results_sp02_rmse, results_fr_rmse,  results_pad_rmse, results_pam_rmse, results_pas_rmse]

In [None]:

# Same evaluation as the global one, but per variable and in original units:
# scenarios and imputations are mapped back with a scaler fitted on the
# unscaled test table before computing MAE / RMSE feature by feature.

time_start = time.time()

unscaled_df = test_data.drop(['encounterId','intervalle'], axis=1).to_numpy()
scaler = StandardScaler().fit(unscaled_df)


def _timed(impute):
    """Call ``impute()`` and return ``(result, elapsed_seconds)``.

    Each duration is measured around its own call. The previous code
    subtracted the preceding method's duration from the current wall-clock
    time, which does not yield a duration.
    """
    started = time.time()
    result = impute()
    return result, time.time() - started


for c in conditions :

    descaled_cond = scaler.inverse_transform(c[1].reshape(-1, n_features))

    time_cond = time.time()
    print(f'----------------------------{c[0]}----------------------------')
    imputation_saits, saits_time = _timed(lambda: saits.impute({'X':c[1]}))
    imputation_brits, brits_time = _timed(lambda: brits.impute({'X':c[1]}))
    imputation_usgan, usgan_time = _timed(lambda: usgan.impute({'X':c[1]}))
    # The GP-VAE output is averaged over axis 1 before scoring.
    imputation_gpvae, gpvae_time = _timed(lambda: gpvae.impute({'X':c[1]}).mean(axis=1))
    imputation_mice, mice_time = _timed(lambda: impute_with_2d_model(c[1], imputer_mice))
    imputation_mf, mf_time = _timed(lambda: impute_with_2d_model(c[1], imputer_mf))
    imputation_fill, fill_time = _timed(lambda: fill_missing(c[1]))
    imputation_average_or_fill, average_time = _timed(lambda: lin_interpol(c[1]))
    imputation_mean, mean_time = _timed(lambda: impute_with_statistic(c[1]))
    # The result must be assigned: it was previously discarded, so the
    # 'median' scores of every scenario were computed from a stale
    # `imputation_median` left over from an earlier cell.
    imputation_median, median_time = _timed(lambda: impute_with_statistic(c[1], method='median'))

    imputed_datasets = [
        ('fill', imputation_fill),
        ('mean', imputation_mean),
        ('median', imputation_median),
        ('imputation_average', imputation_average_or_fill),
        ('mice', imputation_mice),
        ('mf', imputation_mf),
        ('saits', imputation_saits),
        ('brits', imputation_brits),
        ('usgan', imputation_usgan),
        ('gpvae', imputation_gpvae)
    ]
    print('imputation done')
    full_imput_time = time.time() - time_cond

    # Map each imputation back to original units once, not once per feature.
    descaled_imputations = [
        (name, scaler.inverse_transform(values.reshape(-1, n_features)))
        for name, values in imputed_datasets
    ]

    for idx, (feature_mae, feature_rmse) in enumerate(zip(df_features_mae, df_features_rmse)) :
        # Rows that are NaN in the scenario but observed in the test table,
        # i.e. the artificially masked values of this feature.
        indicating_mask_test = np.isnan(descaled_cond[:,idx]) ^ np.isnan(unscaled_df[:,idx])
        reference_values = np.nan_to_num(unscaled_df[:,idx])

        for name, imputed_descaled in descaled_imputations :
            mae_test = calc_mae(imputed_descaled[:,idx], reference_values, indicating_mask_test)
            rmse_test = calc_rmse(imputed_descaled[:,idx], reference_values, indicating_mask_test)

            feature_mae[name][c[0]] = mae_test
            feature_rmse[name][c[0]] = rmse_test


In [None]:
data

In [None]:
df_features_rmse[4]['saits']

### Save results

In [None]:
df_features_mae[0]

In [None]:
# Write one MAE workbook and one RMSE workbook per variable. The position of a
# name in feature_index selects the matching per-feature result table.
feature_index = ['heart_rate', 'spo2', 'fr', 'pad', 'pam', 'pas']
for idx, i in enumerate(feature_index) :
    print(i)
    mae_table = pd.DataFrame(df_features_mae[idx])
    mae_table.to_excel(OUTPUT_TABLE + f'mae_per_feature/feature_{i}_mae.xlsx')
    rmse_table = pd.DataFrame(df_features_rmse[idx])
    rmse_table.to_excel(OUTPUT_TABLE + f'rmse_per_feature/feature_{i}_rmse.xlsx')


In [None]:
results_df_mae.T.to_excel(OUTPUT_TABLE + 'results_global_imputation_mae.xlsx')
results_df_rmse.T.to_excel(OUTPUT_TABLE + 'results_global_imputation_rmse.xlsx')

# Impute Dataset (SAITS)

In [None]:
# Separate the identifier columns from the six variables to impute.
data_id_intervalle = data[['encounterId', 'intervalle']]
# NOTE(review): this column order differs from the one used for the
# per-feature labels elsewhere in this notebook (heart_rate, spo2, fr, pad,
# pam, pas); confirm it matches the order the models were trained on.
data_features = data[['fr', 'heart_rate', 'pam', 'pad', 'pas','spo2']]

In [None]:
def reshaped_inverse_scaler(original_dataset, imputed_dataset):
    """Map a standardised, imputed array back to original units.

    A StandardScaler is fitted on the six feature columns of
    ``original_dataset`` and its inverse transform is applied to
    ``imputed_dataset`` after flattening it to one row per time step.

    Args:
        original_dataset (pd.DataFrame): Table containing the columns
            'fr', 'heart_rate', 'pam', 'pad', 'pas' and 'spo2'.
        imputed_dataset (numpy.ndarray): Standardised values whose last
            dimension holds the six features in that same order.

    Returns:
        pd.DataFrame: De-standardised values, one column per feature.
    """
    # Single source for the column names used both to fit and to label.
    feature_columns = ['fr', 'heart_rate', 'pam', 'pad', 'pas','spo2']

    scaler = StandardScaler().fit(original_dataset[feature_columns].to_numpy())
    flattened = imputed_dataset.reshape(-1, len(feature_columns))
    data_imputed_reshaped = pd.DataFrame(scaler.inverse_transform(flattened), columns=feature_columns)

    return data_imputed_reshaped

In [None]:
# Standardise the whole table (scaler fitted on all patients) and reshape it
# to (patients, 48, features). This assumes exactly 48 rows per patient,
# stored contiguously.
n_samples = int(data.shape[0]/48)
scaler = StandardScaler().fit(data_features.to_numpy())
data_transformed = scaler.transform(data_features)
data_reshaped = data_transformed.reshape(n_samples, 48, -1)

In [None]:
# The models are trained on arrays built by dropping the two identifier
# columns from the table, i.e. with the features in the table's own column
# order, whereas `data_features` (and therefore `data_reshaped`) uses a
# hand-written order. Feed SAITS the features in the table's order and put the
# result back in `data_features` order so the following cells keep working.
# When both orders already agree this is a no-op.
_model_order = [col for col in data.columns if col not in ('encounterId', 'intervalle')]
_to_model = [data_features.columns.get_loc(col) for col in _model_order]
_to_frame = np.argsort(_to_model)  # inverse permutation
data_imputed = saits.impute({'X': data_reshaped[:, :, _to_model]})[:, :, _to_frame]

In [None]:
data_imputed.shape

In [None]:
data_imputed_reshaped = pd.DataFrame(scaler.inverse_transform(data_imputed.reshape(-1,6)), columns=['fr', 'heart_rate', 'pam', 'pad', 'pas','spo2'])

In [None]:
data_imputed_reshaped.shape[0]/48

In [None]:
data_imputed_reshaped['encounterId'] = data_id_intervalle['encounterId']
data_imputed_reshaped['intervalle'] = data_id_intervalle['intervalle']

In [None]:
data_id_intervalle['encounterId'].value_counts(dropna=False)

In [None]:
data_imputed_reshaped.fr.shape[0]/48

In [None]:
data_imputed_reshaped = data_imputed_reshaped[['encounterId', 'intervalle', 'fr', 'heart_rate', 'spo2', 'pad', 'pam', 'pas']]

In [None]:
data_imputed_reshaped[data_imputed_reshaped['intervalle'].isna()]

In [None]:
data_imputed_reshaped.to_parquet(OUTPUT_TABLE + 'first_48_imputed_saits.parquet')

# Analyse de la répétabilité

In [None]:
# Export the untouched test table and fit the scaler used to map the
# standardised scenarios and imputations back to original units.
unscaled_df = test_data.drop(['encounterId', 'intervalle'], axis=1).to_numpy()
test_data.to_parquet(OUTPUT_DATASET + 'original.parquet')
scaler = StandardScaler().fit(unscaled_df)

# Step 1: keep only the scenarios relevant for this analysis.
filtered_conditions = [('pa_only',Test_pa)
]

# Step 2: containers for the comparison tables of each method, keyed by
# scenario name. NOTE: this rebinds `datasets`, which earlier held the
# training payload of the PyPOTS models.
datasets = {'lin_interpol': {}, 'saits': {}, 'mean' : {}}

In [None]:
df = pd.DataFrame({})

In [None]:
# For each retained scenario: impute with SAITS, the neighbour-average method
# and the mean, collect (true value, imputed value) pairs for the variable at
# index 4 ('pam') on the artificially masked rows, and export the masked and
# imputed tables in original units.

_feature_columns = ['hr', 'spo2', 'fr', 'pad', 'pam', 'pas']

# Identifier values as plain arrays. test_data keeps the row labels of the
# full table, so assigning its columns as Series to a freshly built frame
# (labelled 0..n-1) would align on those labels and produce NaN or misplaced
# identifiers. Positional assignment keeps every row with its own ids.
_encounter_ids = test_data['encounterId'].to_numpy()
_intervals = test_data['intervalle'].to_numpy()


def _save_scenario_table(values, file_name):
    """Write a (rows, features) array to parquet with the id columns attached."""
    table = pd.DataFrame(values, columns=_feature_columns)
    table['encounterId'] = _encounter_ids
    table['intervalle'] = _intervals
    table.to_parquet(OUTPUT_DATASET + file_name)
    return table


for scenario_name, scenario_data in filtered_conditions :
    print(scenario_name)
    descaled_cond = scaler.inverse_transform(scenario_data.reshape(-1, n_features))

    imputation_saits = saits.impute({'X':scenario_data})

    imputation_lin_interpol = lin_interpol(scenario_data)

    imputation_moy = impute_with_statistic(scenario_data)

    print('imputation done')

    idx_pam = 4

    # Rows that are NaN in the scenario but observed in the test table.
    indicating_mask_test = np.isnan(descaled_cond[:,idx_pam]) ^ np.isnan(unscaled_df[:,idx_pam])
    original_values = unscaled_df[:,idx_pam][indicating_mask_test]

    saits_descaled = scaler.inverse_transform(imputation_saits.reshape(-1, n_features))
    saits_pam = saits_descaled[:,idx_pam][indicating_mask_test]

    datasets['saits'][scenario_name] = pd.DataFrame({
        'masquées': original_values,  # true values that were hidden
        'imputées': saits_pam   # values imputed by SAITS
    })

    lin_interpol_descaled = scaler.inverse_transform(imputation_lin_interpol.reshape(-1, n_features))
    lin_interpol_pam = lin_interpol_descaled[:,idx_pam][indicating_mask_test]

    datasets['lin_interpol'][scenario_name] = pd.DataFrame({
        'masquées': original_values,  # true values that were hidden
        'imputées': lin_interpol_pam   # values imputed by lin_interpol
    })

    mean_descaled = scaler.inverse_transform(imputation_moy.reshape(-1, n_features))
    mean_pam = mean_descaled[:,idx_pam][indicating_mask_test]

    datasets['mean'][scenario_name] = pd.DataFrame({
        'masquées': original_values,  # true values that were hidden
        'imputées': mean_pam   # values imputed with the mean
    })

    descaled_cond = _save_scenario_table(descaled_cond, f'{scenario_name}_masked.parquet')
    imputation_saits = _save_scenario_table(saits_descaled, f'{scenario_name}_imputed_saits.parquet')
    imputation_lin_interpol = _save_scenario_table(lin_interpol_descaled, f'{scenario_name}_imputed_lin_interpol.parquet')
    imputation_moy = _save_scenario_table(mean_descaled, f'{scenario_name}_imputed_mean.parquet')

In [None]:
datasets

In [None]:
# Step 3: add the Bland–Altman quantities to every comparison table:
# 'moyenne' is the mean of the hidden and imputed value, 'différence' is
# imputed minus hidden.
for method, scenario_data in datasets.items():
    for scenario_name, df in scenario_data.items():
        df['moyenne'] = (df['masquées'] + df['imputées']) / 2
        df['différence'] = df['imputées'] - df['masquées']


In [None]:
# Export one Excel workbook per (scenario, method) pair.
output_dir = OUTPUT_TABLE + 'pam_comparaison/'

for method, scenario_data in datasets.items():
    for scenario_name, df in scenario_data.items():
        # Rows without a hidden reference value are left out of the export.
        filtered_df = df.dropna(subset=['masquées'])
        file_name = f"{scenario_name}_{method}.xlsx"
        filtered_df.to_excel(output_dir + file_name, index=False)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Draw the Bland–Altman plot and the imputed-vs-hidden scatter side by side
# under a common title.
def plot_combined_graphs(df, method, scenario_name):
    """Plot a Bland–Altman diagram and an imputed-vs-hidden scatter plot.

    Args:
        df (pd.DataFrame): Table with the columns 'masquées' (hidden true
            values), 'imputées' (imputed values) and 'différence'
            (imputed minus hidden).
        method (str): Imputation method name, used in the figure title.
        scenario_name (str): Scenario name, used in the figure title.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Two sub-plots side by side.
    fig, axes = plt.subplots(1, 2, figsize=(16, 6))

    # Common title for the figure.
    fig.suptitle(f'{method} - {scenario_name}', fontsize=16, y=1.02)

    # Sub-plot 1: Bland–Altman. The x-axis is the mean of each pair of values,
    # as the axis label states; it previously showed the hidden values alone.
    # Computed here so the function does not depend on a 'moyenne' column.
    pair_means = (df['masquées'] + df['imputées']) / 2
    axes[0].scatter(pair_means, df['différence'], alpha=0.5, label='Points')

    # Bias and 95% limits of agreement (mean ± 1.96 standard deviations).
    mean_diff = np.mean(df['différence'])
    std_diff = np.std(df['différence'])
    upper_limit = mean_diff + 1.96 * std_diff
    lower_limit = mean_diff - 1.96 * std_diff

    axes[0].axhline(mean_diff, color='red', linestyle='--', label=f'Moyenne des différences ({mean_diff:.2f})')
    axes[0].axhline(upper_limit, color='blue', linestyle='--', label=f'Limite supérieure ({upper_limit:.2f})')
    axes[0].axhline(lower_limit, color='blue', linestyle='--', label=f'Limite inférieure ({lower_limit:.2f})')

    axes[0].set_title('Bland et Altman')
    axes[0].set_xlabel('Moyenne des valeurs (masquées et imputées)')
    axes[0].set_ylabel('Différence (imputées - masquées)')
    axes[0].legend()
    axes[0].grid(alpha=0.3)

    # Sub-plot 2: imputed values against the hidden true values.
    axes[1].scatter(df['masquées'], df['imputées'], alpha=0.5, label='Points')

    axes[1].set_title('Valeurs imputées vs masquées')
    axes[1].set_xlabel('Valeurs masquées')
    axes[1].set_ylabel('Valeurs imputées')
    axes[1].legend()
    axes[1].grid(alpha=0.3)

    # Leave room for the common title.
    plt.tight_layout(rect=[0, 0, 1, 0.95])
    plt.show()


In [None]:
for method, scenario_data in datasets.items():
    for scenario_name, df in scenario_data.items():
        plot_combined_graphs(df, method, scenario_name)

In [None]:
print(DATASET)