# Imports

In [None]:
import polars as pl
import json
import pandas as pd
from tqdm import tqdm
import numpy as np

In [None]:
# Load the run configuration (dataset name, version, data folder) from ../params.json.
# JSON is UTF-8 by specification, so the encoding is stated explicitly instead of
# relying on the platform's locale default.
with open('../params.json', 'r', encoding='utf-8') as file:
    params = json.load(file)

DATASET, VERSION, DATA_FOLD = params['dataset'], params['version'], params['data_folder']

print(f'Working on {DATASET} dataset {VERSION}')

In [None]:
# Read `first_48h.parquet` with polars, then convert to a pandas DataFrame,
# which is what every later cell works on.
parquet_path = f'{DATA_FOLD}/{VERSION}/3.analysis/imputation_48/{DATASET}/first_48h.parquet'
df = pl.read_parquet(parquet_path).to_pandas()

In [None]:
# Display the first rows of the loaded frame as a quick sanity check.
df.head()

# Window selection

In [None]:
def _complete_run_lengths(flags):
    """Return the lengths of the maximal runs of True values in a 1-D boolean array."""
    # Padding with 0 on both sides guarantees every run has a rising (+1) and a falling (-1) edge.
    padded = np.concatenate(([0], np.asarray(flags, dtype=np.int8), [0]))
    edges = np.diff(padded)
    return np.flatnonzero(edges == -1) - np.flatnonzero(edges == 1)


# Single pass over the encounters: for each one, order its rows by 'intervalle' and
# record how long each streak of rows with total_missing == 0 is. This replaces
# re-grouping and re-sorting the whole frame once per window size.
run_lengths = [
    _complete_run_lengths((group.sort_values('intervalle')['total_missing'] == 0).to_numpy())
    for _, group in df.groupby('encounterId')
]
run_lengths = np.concatenate(run_lengths) if run_lengths else np.array([], dtype=np.int64)

# results[n] = number of windows of n consecutive rows (within one encounter) that are all complete.
results = {}
for n in tqdm(range(1, 49), desc="Calcul en cours", unit="taille"):
    # A streak of r complete rows contains r - n + 1 windows of size n (none when r < n),
    # which is exactly what counting `rolling(n).sum() == n` yields.
    results[n] = np.clip(run_lengths - n + 1, 0, None).sum()

In [None]:
import matplotlib.pyplot as plt

# Needs `results` ({window size n: number of complete windows}) from the counting cell.

# x / y series for the curve
n_values = list(results)
interval_counts = list(results.values())

# Draw the curve: number of complete windows as a function of the window size
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(n_values, interval_counts, marker='o', linestyle='-', color='royalblue')
# Title intentionally disabled:
# ax.set_title("Nombre d'intervalles complets selon la taille de la fenêtre", fontsize=14)
ax.set_xlabel("Taille de l'intervalle (n timestamps consécutifs)", fontsize=12)
ax.set_ylabel("Nombre d'intervalles complets", fontsize=12)
ax.grid(True)
ax.set_xticks(range(0, 49, 2))  # one tick every two sizes, for readability
fig.tight_layout()
plt.show()

## Dataset séries temporelles

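On choisit des séries de 9 timestamps consécutifs, extraites par fenêtre glissante au sein de chaque `encounterId`, sans aucune valeur manquante sur les variables retenues (`heart_rate`, `spo2`, `fr`, `pam`).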

In [None]:
features = ['heart_rate', 'spo2', 'fr', 'pam']
WINDOW_SIZE = 9  # number of consecutive timestamps in each retained series

df_sorted = df.sort_values(by=['encounterId', 'intervalle'])

windows_list = []  # one (WINDOW_SIZE, len(features)) array per retained window
ts_ids = []        # running identifier of each retained window
ts_counter = 0

# Walk through the encounters, with a progress bar
for encounter_id, group in tqdm(df_sorted.groupby('encounterId'), desc="Processing encounters"):
    group_array = group.sort_values('intervalle')[features].to_numpy()

    # An encounter shorter than the window cannot contribute anything
    # (and sliding_window_view would raise on it).
    if group_array.shape[0] < WINDOW_SIZE:
        continue

    # All stride-1 windows at once: (n_windows, n_features, WINDOW_SIZE),
    # re-ordered to (n_windows, WINDOW_SIZE, n_features).
    candidates = np.lib.stride_tricks.sliding_window_view(group_array, WINDOW_SIZE, axis=0)
    candidates = candidates.transpose(0, 2, 1)

    # Keep only the windows without a single NaN
    is_complete = ~np.isnan(candidates).any(axis=(1, 2))
    complete_windows = candidates[is_complete]

    windows_list.extend(complete_windows)
    ts_ids.extend(range(ts_counter, ts_counter + len(complete_windows)))
    ts_counter += len(complete_windows)

if not windows_list:
    # np.stack would fail here with an unhelpful "need at least one array to stack"
    raise ValueError(
        f"No window of {WINDOW_SIZE} consecutive timestamps without missing values was found"
    )

array_3d = np.stack(windows_list, axis=0)
print("Nombre de séries temporelles retenues :", array_3d.shape[0])
print("Dimensions de X :", array_3d.shape)

In [None]:
from pathlib import Path

# Persist the stacked windows. np.save appends ".npy" when the file name does not
# already end with it, so the file written is "<save_path>.npy".
save_path = f'{DATA_FOLD}/{VERSION}/3.analysis/outliers/{DATASET}/datasets'
# np.save does not create missing directories; make sure the target folder exists.
Path(save_path).parent.mkdir(parents=True, exist_ok=True)
np.save(save_path, array_3d)

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardise feature by feature: flatten the windows to rows of 4 features,
# fit/transform, then restore the (series, 9 steps, 4 features) layout.
# NOTE(review): the scaler is fitted on every window, before the learning/test
# split made further down — confirm this is acceptable for the evaluation.
scaler = StandardScaler()
flat_samples = array_3d.reshape(-1, 4)
scaled_df = scaler.fit_transform(flat_samples).reshape(-1, 9, 4)

In [None]:
# Draw up to 30000 distinct series and split them 2/3 learning, 1/3 test
# (20000 / 10000 when at least 30000 series are available).
# A seeded permutation is used instead of randint: randint samples WITH replacement,
# which let the same series appear several times and in both sets at once.
# NOTE(review): windows are extracted with a stride of one step, so distinct indices
# can still overlap in time within an encounter — splitting by encounter would be stricter.
np.random.seed(42)
n_series = array_3d.shape[0]
n_sampled = min(30000, n_series)
n_learning = (2 * n_sampled) // 3
random_idx = np.random.permutation(n_series)[:n_sampled]
learning_idx, test_idx = random_idx[:n_learning], random_idx[n_learning:]
learning_set, test_set = scaled_df[learning_idx, :, :], scaled_df[test_idx, :, :]

In [None]:
# Work on copies so the scaled arrays stay intact: X is the training input,
# X_masked is the test set with every feature of the time step at index 4 hidden.
X, X_masked = learning_set.copy(), test_set.copy()
X_masked[:, 4] = np.nan

In [None]:
from pypots.imputation import SAITS

In [None]:
# SAITS imputation model for windows of 9 steps x 4 features.
# d_model (512) equals n_heads * d_k (8 * 64).
saits = SAITS(
    n_steps=9,
    n_features=4,
    n_layers=3, d_model=512, d_ffn=128, n_heads=8, d_k=64, d_v=64,
    dropout=0.1,
    # None lets PyPOTS pick the device itself (CUDA when available, otherwise CPU),
    # so the notebook no longer crashes on a machine without a GPU.
    device=None,
    epochs=100
)

In [None]:
# Train the model on the learning windows (X is a copy of learning_set).
saits.fit({'X' : X})

In [None]:
# Save the trained model under the models folder of the current dataset/version.
saits.save(f'{DATA_FOLD}/{VERSION}/4.models/outliers/{DATASET}/windows_9_timestamps.pypots')

In [None]:
# Impute the test windows whose time step at index 4 was replaced by NaN.
X_imputed = saits.impute({'X':X_masked})

In [None]:
# Per-series imputation error: squared difference between the true and the imputed
# values at the masked step (index 4), averaged over the features.
masked_step_error = test_set[:, 4, :] - X_imputed[:, 4, :]
mse_per_sample = np.square(masked_step_error).mean(axis=1)

In [None]:
# Overall imputation error: mean of the per-series MSE over the test set.
mse_per_sample.mean()

In [None]:
from tslearn.clustering import TimeSeriesKMeans

# Elbow search: fit a DTW k-means on the test windows for k = 2..9 and record the
# inertia of each fit. `km` is left bound to the last model fitted (k = 9).
range_n = range(2, 10)
inertias = []

for k in range_n:
    km = TimeSeriesKMeans(n_clusters=k, metric="dtw", random_state=42).fit(test_set)
    inertias.append(km.inertia_)



In [None]:
import matplotlib.pyplot as plt

# Elbow curve: DTW inertia against the number of clusters tried.
ax = plt.gca()
ax.plot(range_n, inertias, marker='o')
ax.set_title("Méthode du coude (DTW Inertia)")
ax.set_xlabel("Nombre de clusters")
ax.set_ylabel("Inertie (DTW)")
ax.grid(True)
plt.show()

In [None]:
# Cluster the test series by shape into 4 groups with DTW k-means.
# The model is fitted on the unmasked test windows: X_masked has an all-NaN row at
# step 4, which makes the DTW distances NaN and the resulting clusters meaningless.
model_cluster = TimeSeriesKMeans(n_clusters=4, metric="dtw", random_state=0)
clusters = model_cluster.fit_predict(test_set)

In [None]:
# Assign every test series to a cluster of `km`.
# NOTE(review): `km` is the loop variable of the elbow search, so when the cells are
# run top to bottom it is the last model fitted there (k = 9), not the 4-cluster
# `model_cluster`; this line also overwrites the labels computed by
# `model_cluster.fit_predict`. Confirm which model is intended.
clusters = km.predict(test_set)

In [None]:
# Display the per-series MSE values.
mse_per_sample

In [None]:
import seaborn as sns

# Distribution of the imputation error within each shape cluster.
ax = sns.boxplot(x=clusters, y=mse_per_sample)
ax.set_title("Erreur d'imputation (MSE) vs Forme de la série temporelle")
ax.set_xlabel("Cluster de forme (DTW)")
ax.set_ylabel("MSE de l'imputation")
plt.show()



In [None]:
# --------------------
# 8. (Optional) Plot the prototype (centroid) of each cluster
# --------------------
# Iterate over the centers the model actually has, and over however many signals each
# center carries, instead of assuming 4 of each: `km` may hold more (or fewer) clusters.
for cluster_idx, center in enumerate(km.cluster_centers_):
    plt.figure()
    plt.title(f"Prototype du cluster {cluster_idx}")
    for feature_idx in range(center.shape[1]):
        plt.plot(center[:, feature_idx], label=f"Signal {feature_idx+1}")
    plt.legend()
    plt.show()