## Imports

In [None]:
import polars as pl
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import Normalize
import json
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler



In [None]:
# Read the shared run parameters from the JSON file one directory up, then
# extract the dataset name and the version identifier from it.
with open('../params.json', 'r') as params_file:
    params = json.load(params_file)

DATASET = params['dataset']
VERSION = params['version']

In [None]:
# Load the `one_week.parquet` time-series table stored under the configured
# VERSION and DATASET directories.
INPUT_FILE = (
    f'/data2/poette.m/dypo/{VERSION}/3.analysis/times_series/{DATASET}/one_week.parquet'
)
temporal_week = pl.read_parquet(source=INPUT_FILE)

In [None]:
# Remove the 'temp' column; every later step works on the frame without it.
temporal_week = temporal_week.drop('temp')

In [None]:
# Show the frame as the cell output for a quick visual check.
temporal_week

## Création de la colonne total_missing

In [None]:
# Feature columns: every column except 'encounterId' and 'intervalle'.
# 'total_missing' is excluded as well so that re-running this cell does not
# count the derived column as a feature. A comprehension (instead of a set
# difference) keeps the frame's column order, which makes the list
# reproducible from one run to the next.
_NON_FEATURE_COLS = ('encounterId', 'intervalle', 'total_missing')
features_col = [
    col for col in temporal_week.columns if col not in _NON_FEATURE_COLS
]

# For each row, count how many feature values are null. Expressions
# (pl.col) are used rather than eager Series so the computation is evaluated
# by with_columns on the frame itself.
temporal_week = temporal_week.with_columns(
    pl.sum_horizontal(
        [pl.col(col).is_null() for col in features_col]
    ).alias("total_missing")
)

## Récupération du dernier intervalle contenant des valeurs pour chaque patient

In [None]:
# Step 1: find, for each encounter, the last interval that still holds data.
# A row holds data when at least one feature is non-null, i.e. when
# total_missing is strictly below the number of feature columns.
# The bound was previously hard-coded to 7; it is now derived from
# features_col so it follows the actual number of features.
# NOTE(review): with only six feature columns the old `< 7` test was always
# true and filtered nothing -- confirm the feature count on the real data.
n_features = len(set(features_col) - {"total_missing"})
max_valid_intervals = (
    temporal_week.filter(pl.col("total_missing") < n_features)
    .group_by("encounterId")
    .agg(pl.col("intervalle").max().alias("max_valid_interval"))
)

# Step 2: rank encounters by decreasing last valid interval. encounterId is
# used as a tie-breaker so the ranking does not depend on the arbitrary row
# order produced by group_by.
sorted_encounter_ids = (
    max_valid_intervals.sort(
        ["max_valid_interval", "encounterId"], descending=[True, False]
    )
    .with_row_index(name="sort_order")
)

# Step 3: drop the rows located after each encounter's last valid interval
# and order the result by rank, then by interval. A single inner join brings
# in both max_valid_interval (needed for the filter and kept in the output)
# and sort_order (only needed for sorting; the final select discards it).
# Encounters without any valid interval have no match and are removed.
# NOTE: an optional filter on max_valid_interval > 10 (to discard encounters
# with a short valid span) was left disabled by the author.
cleaned_df = (
    temporal_week.join(sorted_encounter_ids, on="encounterId", how="inner")
    .filter(pl.col("intervalle") <= pl.col("max_valid_interval"))
    .sort(["sort_order", "intervalle"])
    .select(['encounterId', 'intervalle', 'heart_rate', 'spo2', 'fr', 'pad', 'pam', 'pas', 'total_missing', 'max_valid_interval'])
)

cleaned_df

In [None]:
# Number of distinct encounterId values remaining in cleaned_df.
cleaned_df.n_unique('encounterId')

## Création dataset des premières 48h

In [None]:
# Keep the rows with intervalle below 48 that belong to encounters whose
# max_valid_interval is greater than 47. Both conditions must hold.
first_48h = cleaned_df.filter(
    (pl.col('max_valid_interval') > 47) & (pl.col('intervalle') < 48)
)

In [None]:
# Number of distinct encounterId values kept in the 48-interval subset.
first_48h.n_unique('encounterId')

In [None]:
# Convert the polars frame to a pandas DataFrame; the cells below use the
# pandas API (isnull, groupby, isin, to_parquet).
first_48h = first_48h.to_pandas()

In [None]:
# Remove the encounters that have too many missing values.
# An encounter is discarded as soon as one of the checked columns has more
# than 14 missing values across its rows.
# Fixed: the exclusion list contained the misspelled name 'intevralle', so
# the 'intervalle' column was never actually excluded from the null count.
no_encounter = first_48h.columns.difference(['encounterId', 'intervalle'])

# Per-encounter number of missing values in each checked column.
first_48h_null = first_48h[no_encounter].isnull()
first_48h_null['encounterId'] = first_48h['encounterId']
first_48h_regroup = first_48h_null.groupby('encounterId').sum()

# Masking with `> 14` turns every count at or below the threshold into NaN;
# dropping the all-NaN rows leaves only the encounters with at least one
# column above the threshold.
theshold_missing = first_48h_regroup[first_48h_regroup > 14].dropna(axis=0, how='all')
encounters_with_missing = theshold_missing.reset_index()['encounterId'].to_list()
print(len(encounters_with_missing))

# Split the table once: discarded encounters on one side, kept on the other.
_is_discarded = first_48h['encounterId'].isin(encounters_with_missing)
missing_first_48h = first_48h[_is_discarded]
first_48h = first_48h[~_is_discarded]

In [None]:
# Show the filtered table as the cell output.
first_48h

In [None]:
# Remaining number of missing values in each of the listed columns.
first_48h[['fr', 'heart_rate', 'pam', 'pad', 'pas', 'spo2']].isna().sum().round(1)

In [None]:
# Number of distinct encounters left after the exclusion step.
first_48h['encounterId'].nunique()

In [None]:
# Write the filtered table to the imputation_48 directory of the configured
# VERSION and DATASET.
OUTPUT_FILE = f'/data2/poette.m/dypo/{VERSION}/3.analysis/imputation_48/{DATASET}/first_48h.parquet'
first_48h.to_parquet(OUTPUT_FILE)