# Imports

In [None]:
import json
import os

import numpy as np
import pandas as pd
import polars as pl
from tqdm import tqdm

In [None]:
# Read the run parameters from the JSON file one directory up.
with open('../params.json', 'r') as file:
    params = json.load(file)

# Unpack the three settings used to build every data/model path below.
DATASET = params['dataset']
VERSION = params['version']
DATA_FOLD = params['data_folder']

print(f'Working on {DATASET} dataset {VERSION}')

# Data

In [None]:
# Read the parquet file with polars, then convert to pandas for the groupby-based cells below.
first_48h_path = f'{DATA_FOLD}/{VERSION}/3.analysis/imputation_48/{DATASET}/first_48h.parquet'
df = pl.read_parquet(first_48h_path).to_pandas()

In [None]:
# Preview the first rows of the loaded frame.
df.head()

# Window length selection

In [None]:
# Count, for every candidate window length n (1..48), how many runs of n
# consecutive timestamps have no missing value, summed over all encounters.

# The per-encounter completeness flags do not depend on n, so they are built
# once here instead of re-grouping and re-sorting the frame for each n.
complete_by_encounter = [
    pd.Series(
        (group.sort_values('intervalle')['total_missing'] == 0).astype(int).to_numpy()
    )
    for _, group in df.groupby('encounterId')
]

results = {}

for n in tqdm(range(1, 49), desc="Calcul en cours", unit="taille"):
    count = 0

    for complete in complete_by_encounter:
        # A rolling sum equal to n means the n timestamps ending here are all complete.
        rolling_sums = complete.rolling(window=n).sum()
        count += (rolling_sums == n).sum()

    results[n] = count

In [None]:
import matplotlib.pyplot as plt

# x = candidate window length, y = number of fully observed windows of that length.
n_values = list(results)
interval_counts = [results[n] for n in n_values]

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(n_values, interval_counts, marker='o', linestyle='-', color='royalblue')

ax.set_xlabel("Taille de l'intervalle (n timestamps consécutifs)", fontsize=12)
ax.set_ylabel("Nombre d'intervalles complets", fontsize=12)
ax.grid(True)
ax.set_xticks(range(0, 49, 2))  # one tick every two lengths, for readability
fig.tight_layout()
plt.show()

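On choisit des séries de 9 timestamps consécutifs sans valeurs manquantes. **Attention :** le code ci-dessous construit des fenêtres de 25 timestamps ; la longueur retenue est à harmoniser.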

## Dataset séries temporelles

In [None]:
# Build (or reload) the 3-D array of fully observed sliding windows,
# shaped (n_windows, WINDOW_LENGTH, len(features)).
features = ['heart_rate', 'spo2', 'fr', 'pam']

# Number of consecutive timestamps in each window.
WINDOW_LENGTH = 25

save_path = f'{DATA_FOLD}/{VERSION}/3.analysis/outliers/{DATASET}/datasets'
# Single cache location used for both loading and saving. np.save(save_path, ...)
# would write '<save_path>.npy', a different file from the one that is loaded.
dataset_file = f'{save_path}/datasets.npy'

if os.path.exists(dataset_file):
    array_3d = np.load(dataset_file)
else:
    # Sorting once by (encounter, time) is enough: groupby keeps row order within each group.
    df_sorted = df.sort_values(by=['encounterId', 'intervalle'])
    windows_list = []

    for encounter_id, group in tqdm(df_sorted.groupby('encounterId'), desc="Processing encounters"):
        group_array = group[features].to_numpy()

        # Slide a WINDOW_LENGTH-step window over the encounter; encounters
        # shorter than WINDOW_LENGTH yield an empty range and are skipped.
        for start in range(group_array.shape[0] - WINDOW_LENGTH + 1):
            window = group_array[start:start + WINDOW_LENGTH]

            # Keep only windows without any missing value.
            if np.isnan(window).any():
                continue
            windows_list.append(window)

    if not windows_list:
        raise ValueError(f"No complete window of {WINDOW_LENGTH} timestamps found")

    array_3d = np.stack(windows_list, axis=0)

    os.makedirs(save_path, exist_ok=True)
    np.save(dataset_file, array_3d)

# Keep the historical name bound in both branches so later cells work
# whether the windows were rebuilt or loaded from the cache.
array_3d_24 = array_3d
# Sequential id of each retained window.
ts_ids = list(range(array_3d.shape[0]))

print("Nombre de séries temporelles retenues :", array_3d.shape[0])
print("Dimensions de X :", array_3d.shape)

In [None]:
# Report how many windows were kept and the full array shape.
n_series = array_3d_24.shape[0]
print("Nombre de séries temporelles retenues :", n_series)
print("Dimensions de X :", array_3d_24.shape)

In [None]:
# Persist the windows array to the file the dataset cell tries to load
# ('<save_path>/datasets.npy'). np.save(save_path, ...) would instead append
# '.npy' to the directory name and write a sibling file that is never read back.
save_path = f'{DATA_FOLD}/{VERSION}/3.analysis/outliers/{DATASET}/datasets'
os.makedirs(save_path, exist_ok=True)
np.save(f'{save_path}/datasets.npy', array_3d)

In [None]:
# Expose the windows array under the generic name used by the preprocessing cells below.
array_3d = array_3d_24

# Data preprocessing

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardise each feature (zero mean, unit variance) across all windows and time steps.
scaler = StandardScaler()

# Derive the window geometry from the data instead of hard-coding 25 and 4.
n_steps, n_features = array_3d.shape[1:]

# StandardScaler expects 2-D input: flatten to (n_windows * n_steps, n_features),
# scale, then restore the (n_windows, n_steps, n_features) layout.
# NOTE(review): the scaler is fit on every window, including those later drawn
# into the test set — confirm this is acceptable for the evaluation.
scaled_df = scaler.fit_transform(array_3d.reshape(-1, n_features)).reshape(-1, n_steps, n_features)

In [None]:
# Draw disjoint learning / test subsets of windows (20000 / 10000 when enough data).
N_LEARNING, N_TEST = 20000, 10000

np.random.seed(42)

n_windows = array_3d.shape[0]
# Cap the sample so the draw also works when fewer than 30000 windows exist.
n_sampled = min(N_LEARNING + N_TEST, n_windows)

# A permutation samples WITHOUT replacement: no window is duplicated and none
# can land in both sets (randint could return the same index several times).
random_idx = np.random.permutation(n_windows)[:n_sampled]

# Keep the 2/3 - 1/3 ratio when the sample had to be capped.
n_learning = n_sampled * N_LEARNING // (N_LEARNING + N_TEST)
learning_idx, test_idx = random_idx[:n_learning], random_idx[n_learning:]
learning_set, test_set = scaled_df[learning_idx, :, :], scaled_df[test_idx, :, :]

In [None]:
# Training input, and a copy of the test windows with one time step hidden.
X, X_masked = learning_set.copy(), test_set.copy()
# Blank out every feature at time-step index 4 so the model has to impute it.
X_masked[:, 4] = np.nan

# Imputation des valeurs manquantes

In [None]:
from pypots.imputation import SAITS

In [None]:
# SAITS imputation model. The sequence length and feature count must match the
# training array X (n_windows, n_steps, n_features), so they are read from X
# rather than hard-coded (a fixed n_steps=9 does not match 25-step windows).
saits = SAITS(
    n_steps=X.shape[1],
    n_features=X.shape[2],
    n_layers=3, d_model=512, d_ffn=128, n_heads=8, d_k=64, d_v=64,
    dropout=0.1,
    device="cuda",  # requires a CUDA GPU; switch to "cpu" when none is available
    epochs=100
)

In [None]:
# Train the model on the learning windows.
saits.fit({'X' : X})

In [None]:
# Persist the trained model to disk.
model_path = f'{DATA_FOLD}/{VERSION}/4.models/outliers/{DATASET}/windows_9_timestamps.pypots'
saits.save(model_path)

In [None]:
# Reload the saved model from disk.
model_path = f'{DATA_FOLD}/{VERSION}/4.models/outliers/{DATASET}/windows_9_timestamps.pypots'
saits.load(model_path)

In [None]:
# Impute the test windows whose time step at index 4 was replaced by NaN.
X_imputed = saits.impute({'X':X_masked})

# Calcul RMSE

In [None]:
# RMSE per test window, over the features of the masked time step (index 4),
# comparing the scaled ground truth with the model's imputation.
masked_truth = test_set[:, 4, :]
masked_pred = X_imputed[:, 4, :]
squared_error = (masked_truth - masked_pred) ** 2
rmse_per_sample = np.sqrt(squared_error.mean(axis=1))

In [None]:
# Average imputation RMSE across all test windows.
rmse_per_sample.mean()

# Clustering

In [None]:
from tslearn.clustering import TimeSeriesKMeans

## Recherche du nombre optimal de groupes

In [None]:
# Elbow search: fit a DTW k-means on the test windows for k = 2..19
# and record the inertia of each fit.
range_n = range(2, 20)
inertias = []

for k in range_n:
    # fit() returns the estimator itself, so construction and fitting can be chained.
    km = TimeSeriesKMeans(n_clusters=k, metric="dtw", random_state=42).fit(test_set)
    inertias.append(km.inertia_)



In [None]:
import matplotlib.pyplot as plt

# Inertia as a function of the number of clusters (elbow plot).
fig, ax = plt.subplots()
ax.plot(range_n, inertias, marker='o')
ax.set_title("Méthode du coude (DTW Inertia)")
ax.set_xlabel("Nombre de clusters")
ax.set_ylabel("Inertie (DTW)")
ax.grid(True)
plt.show()

# Application du clustering

In [None]:
# Final clustering: DTW k-means with 4 clusters, fitted on the test windows;
# `clusters` holds the cluster label assigned to each test window.
model_cluster = TimeSeriesKMeans(n_clusters=4, metric="dtw", random_state=42)
clusters = model_cluster.fit_predict(test_set)

In [None]:
# Display the per-window RMSE values that are compared against the clusters below.
rmse_per_sample

In [None]:
import seaborn as sns

# Distribution of the imputation error within each shape cluster.
ax = sns.boxplot(x=clusters, y=rmse_per_sample)
ax.set_title("Erreur d'imputation (RMSE) vs Forme de la série temporelle")
ax.set_xlabel("Cluster de forme (DTW)")
ax.set_ylabel("RMSE de l'imputation")
plt.show()

In [None]:
# Plot the prototype (cluster centre) of each cluster of the FINAL model.
# `model_cluster` is the 4-cluster model whose labels are used in the boxplot;
# `km` is only the last model left over from the elbow search (k=19), so its
# centres do not correspond to these clusters.
centers = model_cluster.cluster_centers_

for i, center in enumerate(centers):
    plt.figure()
    plt.title(f"Prototype du cluster {i}")
    # One curve per feature (last axis of the centre).
    for f in range(center.shape[1]):
        plt.plot(center[:, f], label=f"Signal {f+1}")
    plt.legend()
    plt.show()