In [None]:
import json
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import polars as pl
from pypots.imputation import SAITS
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

In [None]:
# Load the run configuration. The encoding is given explicitly so that decoding
# does not depend on the platform's locale default.
with open('../params.json', 'r', encoding='utf-8') as params_file:
    params = json.load(params_file)

# A missing key raises KeyError here, before any data is touched.
DATASET = params['dataset']
VERSION = params['version']
DATA_FOLD = params['data_folder']

print(f'Working on {DATASET} dataset {VERSION}')

In [None]:
# Input parquet files, both located under the run's imputation_48 analysis folder.
# Going by the file names, the first holds the SAITS-imputed table and the second
# the table before imputation — TODO confirm against the producing notebook.
IMPUTED_DATASET = f'{DATA_FOLD}/{VERSION}/3.analysis/imputation_48/{DATASET}/tables/first_48_imputed_saits.parquet'
ORIGINAL_DATASET = f'{DATA_FOLD}/{VERSION}/3.analysis/imputation_48/{DATASET}/first_48h.parquet'

In [None]:
# Load both tables as polars DataFrames.
# NOTE(review): later cells assume the two tables have the same rows in the same
# order (they are reshaped and compared element-wise) — confirm.
df_imputed = pl.read_parquet(IMPUTED_DATASET)
df_original = pl.read_parquet(ORIGINAL_DATASET)

In [None]:
# Shared scaler: fitted once below, then used by to_3d_array() and for the
# inverse transform before plotting.
scaler = StandardScaler()

In [None]:
# Fit on exactly the six feature columns that to_3d_array() transforms, selected
# by name and in the same order. Selecting by position ("everything from column 2
# onwards") only gives the same result while the table layout happens to match,
# and would silently mis-scale features if the column order ever changed.
scaler.fit(
    df_imputed
        .select(pl.col('fr', 'heart_rate', 'spo2', 'pad', 'pam', 'pas'))
        .to_numpy()
)

In [None]:
def to_3d_array(df):
    """Return the six feature columns of ``df`` as a scaled 3D array.

    The columns 'fr', 'heart_rate', 'spo2', 'pad', 'pam' and 'pas' are
    extracted in that order, standardised with the module-level ``scaler``
    (which must already be fitted) and reshaped to ``(-1, 48, 6)``.

    The reshape groups every 48 consecutive rows into one series, so the
    row count must be a multiple of 48.
    NOTE(review): presumably rows are sorted by patient, then by hour —
    confirm against the table that is passed in.
    """
    feature_cols = pl.col('fr', 'heart_rate', 'spo2', 'pad', 'pam', 'pas')
    raw_values = df.select(feature_cols).to_numpy()
    scaled_values = scaler.transform(raw_values)
    return scaled_values.reshape(-1, 48, 6)

In [None]:
# Scaled (series, 48, 6) arrays for both tables; despite the df_ prefix these are
# numpy arrays, not DataFrames.
df_imputed_3d = to_3d_array(df_imputed)
df_original_3d = to_3d_array(df_original)

In [None]:
# Element-wise XOR of the two NaN maps: True where exactly one of the two arrays
# is NaN, False where both are NaN or both hold a value.
# The leave-one-out cell below treats mask == 0 as "observed point".
# NOTE(review): presumably True marks values that were missing originally and
# filled by the imputation — this relies on NaNs surviving scaler.transform; confirm.
mask = np.isnan(df_original_3d) ^ np.isnan(df_imputed_3d)

In [None]:
# Sequence length and feature count come from the data (axes 1 and 2).
n_steps, n_features = df_original_3d.shape[1:3]

# NOTE(review): the architecture arguments presumably have to match the ones the
# checkpoint was trained with — confirm before changing any of them.
saits = SAITS(
    n_steps=n_steps,
    n_features=n_features,
    n_layers=3,
    d_model=512,
    d_ffn=128,
    n_heads=8,
    d_k=64,
    d_v=64,
)

# NOTE(review): the checkpoint path contains a literal 'mimic' rather than
# DATASET, so the same model is loaded whatever dataset is configured — confirm
# that this is intended.
saits.load(path=f'{DATA_FOLD}/{VERSION}/4.models/imputation/mimic/saits/saits_two_days_with_val.pypots')

In [None]:
# Display the array shape; given the reshape in to_3d_array() it is (n_series, 48, 6).
df_imputed_3d.shape

In [None]:
# Work on the first 1000 series only (all of them if there are fewer).
# mask is not sliced: the loop below indexes it with the same leading indices.
test_data = df_imputed_3d[:1000,:,:]

In [None]:
# Leave-one-out reconstruction error.
# For every observed point (t, f) of every series, the point is hidden, the
# series is re-imputed with SAITS, and the signed difference
# "value - re-imputed value" (in scaled units) is stored in errors[i, t, f].
# Entries that are not evaluated keep the initial value 0.
errors = np.zeros_like(test_data, dtype=float)

num_series, seq_length, num_features = test_data.shape

for i in tqdm(range(num_series), desc="Processing series"):
    series = test_data[i]

    # Only observed points are evaluated (mask == 0). Points that are NaN in the
    # data are skipped as well: there is nothing to compare against, and a NaN
    # error would turn the min/max normalisation below into NaN everywhere.
    evaluable = (mask[i] == 0) & ~np.isnan(series)
    t_idx, f_idx = np.nonzero(evaluable)
    n_points = t_idx.size
    if n_points == 0:
        continue

    # One copy of the series per evaluated point; copy k hides only point k.
    # All copies go through the model in a single call instead of one call per
    # point, which removes most of the per-call overhead.
    # Assumes the model imputes each sample of a batch independently of the
    # others, so results match the one-at-a-time version up to floating-point
    # noise — TODO confirm for the loaded checkpoint.
    variant_idx = np.arange(n_points)
    variants = np.repeat(series[np.newaxis, :, :], n_points, axis=0)
    variants[variant_idx, t_idx, f_idx] = np.nan

    imputed_variants = np.asarray(saits.impute({'X': variants}))

    # Signed error in scaled units (not a relative error).
    errors[i, t_idx, f_idx] = (
        series[t_idx, f_idx] - imputed_variants[variant_idx, t_idx, f_idx]
    )

# Min-max scaling of the errors to obtain a score between 0 and 1.
# NOTE(review): the scaling is applied to the *signed* errors, so the most
# negative error gets score 0 and an error of exactly 0 lands somewhere in the
# middle; non-evaluated entries (error 0) are part of the min/max as well.
# If the score is meant to rank anomalies by magnitude, scaling np.abs(errors)
# would be the natural choice — left unchanged because later cells use fixed
# thresholds on this score.
e_min = errors.min()
e_max = errors.max()
scores = (errors - e_min) / (e_max - e_min + 1e-8)  # small constant avoids a division by zero

In [None]:
# Output folder of the outlier analysis for the configured dataset; the CSV export
# below writes into its 'datasets' sub-folder.
results_folder = f'{DATA_FOLD}/{VERSION}/3.analysis/outliers/{DATASET}'

In [None]:
# Flatten the (series, time, feature) errors into one vector for global statistics.
# This includes the entries left at 0 because they were never evaluated.
error_1d = errors.reshape(-1)

In [None]:
# Share of entries whose error lies strictly between -2 and 2 (scaled units).
# NOTE(review): the denominator counts every entry, including those left at 0
# because they were never evaluated, which pushes this share upwards.
np.count_nonzero((error_1d > -2) & (error_1d < 2)) / error_1d.size

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Histogram of the flattened errors, zoomed on [-3, 3]; the y-axis is capped at
# 15000, which cuts off any bin taller than that.
hist_ax = sns.histplot(error_1d)
hist_ax.set_xlim(-3, 3)
hist_ax.set_ylim(0, 15000)

In [None]:
# Export the signed errors, one row per (series, time step) and one column per
# feature. No schema is passed, so polars assigns the column names.
errors_csv_path = results_folder + '/datasets/saits_errors.csv'

# write_csv does not create missing folders; make sure the target exists so a
# fresh results tree does not fail after the long computation above.
Path(errors_csv_path).parent.mkdir(parents=True, exist_ok=True)

df_score_sample = pl.DataFrame(errors.reshape(-1, errors.shape[-1]))
df_score_sample.write_csv(errors_csv_path)

In [None]:
# Undo the standardisation so the plots show values in their original units.
# The scaler works on 2D input, hence the round trip through a
# (rows, features) layout before restoring the (series, time, feature) shape.
flat_shape = (-1, test_data.shape[2])
display_data = scaler.inverse_transform(test_data.reshape(flat_shape))
reshaped_display_data = display_data.reshape(test_data.shape)

In [None]:
import math

# Evaluates the error function at 1 (about 0.8427).
# NOTE(review): looks like a scratch reference value for judging the error
# distribution; nothing below uses it — confirm whether the cell is still needed.
math.erf(1)

In [None]:
# Draw one series index at random among the (series, time step) pairs whose score
# for feature index 4 ('pam' in the order used by to_3d_array) exceeds 0.8.
# np.where(...)[0] lists a series once per matching time step, so series with many
# matching time steps are more likely to be drawn. The draw is unseeded, and
# np.random.choice raises ValueError when nothing matches.
np.random.choice(np.where(scores[:,:,4] > 0.8)[0])

In [None]:
# Display the column names; the plot titles below take the feature name from
# position 2 onwards of this list.
df_imputed.columns

In [None]:
# Feature to inspect, as an index into the six scaled features
# (4 is 'pam' in the order used by to_3d_array).
selected_feature = 4
timestamps = np.arange(48)

# Candidate patients: those with at least one time step whose score for the
# selected feature exceeds 0.5. A patient appears once per matching time step,
# so patients with many matching time steps are more likely to be drawn.
# To reproduce a specific figure, replace the random draw by a fixed index
# (883 was used previously).
candidate_patients = np.where(scores[:,:,selected_feature] > 0.5)[0]
if candidate_patients.size == 0:
    raise ValueError(
        f"No patient has a score above 0.5 for feature index {selected_feature}"
    )
selected_patient = int(np.random.choice(candidate_patients))

fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True, figsize=(12, 8))

patient_data = reshaped_display_data[selected_patient,:,selected_feature]
error_value = errors[selected_patient,:,selected_feature]

# Top panel: the patient's values in original (unscaled) units.
ax1.plot(timestamps, patient_data, marker='o', label='Valeur de la variable')
ax1.set_ylabel('Valeur')
ax1.grid(True)

# Bottom panel: magnitude of the leave-one-out error (scaled units). The title
# and label name the error, since this panel does not show the 0-1 score.
ax2.plot(timestamps, np.abs(error_value), marker='o', color='red', label='Erreur absolue')
ax2.set_title("Variation de l'erreur absolue")
ax2.set_xlabel('Temps')
ax2.set_ylabel('Erreur')
ax2.set_ylim(0, 3)
ax2.grid(True)

# The figure title is set before tight_layout() so that the layout reserves room
# for it instead of letting it overlap the top panel.
fig.suptitle(f"patient {selected_patient}, variable {df_imputed[:,2:].columns[selected_feature]}")
plt.tight_layout()

plt.show()


In [None]:
# Same patient and feature as the previous figure (selected_patient,
# selected_feature and timestamps come from the cell above), this time with the
# normalised 0-1 score in the bottom panel.
fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True, figsize=(12, 8))

patient_data = reshaped_display_data[selected_patient,:,selected_feature]
patient_scores = scores[selected_patient,:,selected_feature]

# Top panel: the patient's values in original (unscaled) units.
ax1.plot(timestamps, patient_data, marker='o', label='Valeur de la variable')
ax1.set_ylabel('Valeur')
ax1.grid(True)

# Bottom panel: score over time.
ax2.plot(timestamps, patient_scores, marker='o', color='red', label='Score')
ax2.set_title('Variation du score')
ax2.set_xlabel('Temps')
ax2.set_ylabel('Score')
ax2.set_ylim(0, 1)
ax2.grid(True)

# The figure title is set before tight_layout() so that the layout reserves room
# for it instead of letting it overlap the top panel.
fig.suptitle(f"patient {selected_patient}, variable {df_imputed[:,2:].columns[selected_feature]}")
plt.tight_layout()

plt.show()
