In [None]:
import pandas as pd
import numpy as np
import polars as pl
import seaborn as sns
import matplotlib.pyplot as plt
import json
from matplotlib.colors import Normalize

In [None]:
# Load the shared run configuration (dataset name, pipeline version, data root).
# JSON text is UTF-8, so the encoding is pinned explicitly instead of relying on
# the platform default, which can mis-decode non-ASCII values on some systems.
with open('../params.json', 'r', encoding='utf-8') as file :
    params = json.load(file)

DATASET, VERSION = params['dataset'], params['version']
DATA_FOLD = params['data_folder']

In [None]:
# Read the `first_48h.parquet` table of the configured dataset/version into a polars DataFrame.
data = pl.read_parquet(f'{DATA_FOLD}/{VERSION}/3.analysis/imputation_48/{DATASET}/first_48h.parquet')

In [None]:
# The four monitored variables, as a pandas frame (reused by later cells).
data_features = data[['fr', 'heart_rate', 'pam', 'spo2']].to_pandas()

# Overall share of missing cells, in percent, across the four variables.
# The ratio is scaled to a percentage *before* rounding: rounding first and then
# multiplying by 100 re-introduces floating-point noise (e.g. 12.340000000000002).
# `max(..., 1)` keeps an empty table from raising ZeroDivisionError.
round(100 * int(data_features.isna().sum().sum()) / max(data_features.size, 1), 2)

In [None]:
# Heatmap of `total_missing` with one row per interval and one column per stay.

# Pivot to an interval x stay grid and sort the rows so that the vertical axis
# follows interval order regardless of the row order in `data`.
heatmap_data = (
    data.select(["intervalle", "encounterId", "total_missing"])
    .pivot(
        values="total_missing",  # cell value
        index="intervalle",      # one row per interval
        on="encounterId"         # one column per stay
    )
    .sort("intervalle")
)

# The pivot keeps `intervalle` as a regular column; move it to the pandas index so
# that it is not drawn as if it were an additional stay.
heatmap_matrix = heatmap_data.to_pandas().set_index("intervalle")

# Derive a private colormap with white for masked cells. Calling `set_bad` on
# `plt.cm.coolwarm` itself would alter the colormap object shared by the session.
cmap = plt.cm.coolwarm.with_extremes(bad='white')

# Fixed colour scale from 0 to 4 (presumably 4 = all four monitored variables missing).
norm = Normalize(vmin=0, vmax=4)

# Mask (interval, stay) pairs without a value so they are drawn in the "bad" colour.
masked_matrix = np.ma.masked_invalid(heatmap_matrix.to_numpy(dtype=float))

plt.figure(figsize=(12, 8))
# origin='upper' puts the first row at the top; this is the orientation the previous
# origin='lower' + invert_yaxis() combination produced.
plt.imshow(masked_matrix, aspect='auto', cmap=cmap, norm=norm, origin='upper')

# Colour bar with one tick per possible value, 0 to 4 inclusive.
cbar = plt.colorbar()
cbar.set_ticks(range(5))
cbar.set_label("Nombre de variables manquantes", rotation=270, labelpad=20)

plt.xlabel("Séjours")
plt.ylabel("Intervalle (h)")
plt.grid(False)

plt.show()


In [None]:
# Absolute number of missing values per monitored variable, laid out with one row
# per variable and a single 'Missing Count' column.
missing_counts = (
    data.select([
        pl.col(feature).is_null().sum().alias(f"{feature}_missing")
        for feature in ("fr", "heart_rate", "spo2", "pam")
    ])
    .to_pandas()
    .T
    .set_axis(['Missing Count'], axis=1)
)

# The same figures as a percentage of the total number of rows in `data`.
total_intervals = data.height
missing_percentages = (
    (missing_counts / total_intervals * 100)
    .round(1)
    .set_axis(['Missing Percentage (%)'], axis=1)
)

# Side-by-side summary: count and percentage for each variable.
missing_stats = pd.concat([missing_counts, missing_percentages], axis=1)
print(missing_stats)

In [None]:
# Boolean mask of the feature table: True where a value is missing.
md =data_features.isnull()

In [None]:
# Columns covered by the missingness mask.
md.columns

In [None]:
# Pairwise correlation (pandas default method) between the missingness indicators.
md.corr()

In [None]:
# Conditional co-missingness table. For each column `main_col` (table column) and
# each column `col` (table row), the entry is the share of rows where `col` is
# missing among the rows where `main_col` is missing, rounded to 2 decimals.
# Note: despite the name, this is a conditional proportion, not a correlation.
corr_dict = {
    main_col: [
        round((md[main_col] & md[col]).sum() / (md[main_col].sum()), 2)
        for col in md.columns
    ]
    for main_col in md.columns
}
corr_df = pd.DataFrame(corr_dict, index=md.columns)

print(corr_df)

In [None]:
# Distribution of the number of missing variables (0 to 4) among the rows whose
# `max_valid_interval` is positive.
# The validity filter does not depend on `i`, so it is evaluated once instead of
# twice per iteration.
valid_rows = data.filter(pl.col('max_valid_interval') > 0)
n_valid = valid_rows.shape[0]

for i in range(5):
    total_missing = valid_rows.filter(pl.col('total_missing') == i).shape[0]
    # Guard against an empty selection, which would otherwise divide by zero.
    share = round((total_missing / n_valid) * 100, 1) if n_valid else 0.0
    print(f"{i} variables manquantes = {total_missing} ({share}%)")

In [None]:
# Display the rows in which at least one variable is missing.
data.filter(pl.col('total_missing') > 0)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Publication-style defaults for this figure.
sns.set_style("whitegrid")
sns.set_context("paper", font_scale=1.2)

fig, ax = plt.subplots(figsize=(8, 6))

# Bar chart of the rows with at least one missing variable, by number of missing
# variables. The polars frame is converted to pandas explicitly (as done elsewhere
# in this notebook) and the bars are drawn on the axes created above instead of
# relying on the implicit "current axes".
sns.countplot(
    data=data.filter(pl.col('total_missing') > 0).select('total_missing').to_pandas(),
    x='total_missing',
    color='skyblue',
    edgecolor='black',   # subtle outline around the bars
    ax=ax,
)

ax.set_xlabel("Missing Values per Interval", fontsize=14)
ax.set_ylabel("Number of Intervals", fontsize=14)
ax.set_title("Distribution of Missing Values per Interval", fontsize=16, fontweight='bold')

plt.tight_layout()
plt.show()


In [None]:
def calculate_consecutive_intervals(data_merge, max_missing_values):
    """Count runs of consecutive rows whose ``total_missing`` reaches a threshold.

    Rows are ordered by ``encounterId`` then ``intervalle``. For every threshold
    ``n`` from 0 to ``max_missing_values`` (inclusive), a row is flagged when
    ``total_missing >= n``; an uninterrupted stretch of flagged rows belonging to
    the same ``encounterId`` is one run.

    Parameters
    ----------
    data_merge
        Frame exposing ``cast``, ``sort`` and ``to_pandas`` (a polars DataFrame in
        this notebook) with columns ``encounterId``, ``intervalle`` and
        ``total_missing``.
    max_missing_values : int
        Highest threshold evaluated.

    Returns
    -------
    pandas.DataFrame
        Rows (index named "Consecutive Hours") are run lengths in number of rows,
        columns (named "Max Missing Values") are the thresholds ``n``, and each
        cell is the number of runs of that length. Missing combinations are 0.
    """
    ordered = (
        data_merge
        .cast({'encounterId' : pl.Int32})
        .sort('encounterId', 'intervalle')
        .to_pandas()
    )
    stay_ids = ordered["encounterId"]

    def _run_length_histogram(threshold):
        # True for rows with at least `threshold` missing variables.
        flagged = ordered["total_missing"] >= threshold
        # The run id increases every time the flag flips from one row to the next.
        run_ids = flagged.ne(flagged.shift()).cumsum()
        # Grouping on (run id, stay) also splits a run that spans two stays.
        run_lengths = (
            stay_ids[flagged]
            .groupby([run_ids[flagged], stay_ids[flagged]])
            .size()
        )
        return run_lengths.value_counts().sort_index()

    results_df = (
        pd.DataFrame({n: _run_length_histogram(n) for n in range(max_missing_values + 1)})
        .fillna(0)
        .astype(int)
    )
    results_df.index.name = "Consecutive Hours"
    results_df.columns.name = "Max Missing Values"
    return results_df

# Run-length table for thresholds 0 to 4 missing variables, exported to Excel.
occurrences_df = calculate_consecutive_intervals(data, max_missing_values=4)
occurrences_df.to_excel(f'{DATA_FOLD}/{VERSION}/3.analysis/imputation_48/{DATASET}/tables/occurences_total.xlsx')
# NOTE(review): despite the `_rel` suffix this is the same object as
# `occurrences_df` (absolute counts; no normalisation is applied here).
occurences_df_rel = occurrences_df
# Preview the first ten rows.
occurences_df_rel.iloc[:10]

In [None]:
def calculate_stays_by_consecutive_intervals(df, max_missing_values):
    """Count the distinct stays that have a run of a given length, per threshold.

    Rows are ordered by ``encounterId`` then ``intervalle``. For every threshold
    ``n`` from 0 to ``max_missing_values`` (inclusive), a row is flagged when
    ``total_missing >= n``; an uninterrupted stretch of flagged rows belonging to
    the same ``encounterId`` is one run. A stay with several runs of the same
    length is counted once for that length.

    Parameters
    ----------
    df
        Frame exposing ``cast``, ``sort`` and ``to_pandas`` (a polars DataFrame in
        this notebook) with columns ``encounterId``, ``intervalle`` and
        ``total_missing``.
    max_missing_values : int
        Highest threshold evaluated.

    Returns
    -------
    pandas.DataFrame
        Rows (index named "Consecutive Hours") are run lengths in number of rows,
        columns (named "Max Missing Values") are the thresholds ``n``, and each
        cell is the number of distinct ``encounterId`` values with at least one
        run of that length. Missing combinations are 0.
    """
    ordered = df.cast({'encounterId' : pl.Int32}).sort('encounterId', 'intervalle').to_pandas()
    stay_ids = ordered["encounterId"]

    stays_by_threshold = {}
    for threshold in range(max_missing_values + 1):
        # True for rows with at least `threshold` missing variables.
        flagged = ordered["total_missing"] >= threshold

        # The run id increases every time the flag flips from one row to the next.
        run_ids = flagged.ne(flagged.shift()).cumsum().rename("run_id")

        # One record per (run, stay) with its length; grouping on the stay as well
        # splits a run that would otherwise span two stays.
        runs = (
            stay_ids[flagged]
            .groupby([run_ids[flagged], stay_ids[flagged]])
            .size()
            .reset_index(name="duration")
        )

        # Distinct stays for each run length.
        stays_by_threshold[threshold] = runs.groupby("duration")["encounterId"].nunique()

    results_df = pd.DataFrame(stays_by_threshold).fillna(0).astype(int)
    results_df.index.name = "Consecutive Hours"
    results_df.columns.name = "Max Missing Values"
    return results_df

# Stays concerned per run length, for thresholds 0 to 4 missing variables.
stays_df = calculate_stays_by_consecutive_intervals(data, max_missing_values=4)

# Same table as a percentage of the number of distinct stays.
# NOTE(review): `stays_df_rel` is not used again in the visible cells.
stays_df_rel = round(stays_df/data.unique('encounterId').shape[0]*100,1)
stays_df.to_excel(f'{DATA_FOLD}/{VERSION}/3.analysis/imputation_48/{DATASET}/tables/occurences_par_séjours.xlsx')
stays_df

In [None]:
# Two stacked panels for the current dataset: number of runs (top) and number of
# stays concerned (bottom) as a function of run length, one line per threshold.
sns.set_theme(style="whitegrid", palette="pastel")

fig, axes = plt.subplots(2, 1, figsize=(7, 10), sharex=False, sharey=False)

# Dataset name as a figure-level title.
fig.text(0.5, 0.95, f"{DATASET.upper()}", ha='center', va='center', fontsize=14, fontweight='bold')

occurrences_df = calculate_consecutive_intervals(data, max_missing_values=4)
stays_df = calculate_stays_by_consecutive_intervals(data, max_missing_values=4)

# Each panel plots the table computed just above. The top panel previously read
# from the alias `occurences_df_rel` created in another cell, which silently
# plots stale data whenever that cell was run with different inputs.
panels = [
    (axes[0], occurrences_df, "{n} variables", "Nombre d'occurrences", [0, 40000]),
    (axes[1], stays_df, "{n} Variables Manquantes", "Nombre de séjours", [0, 30000]),
]
for panel_ax, table, label_template, y_label, y_limit in panels:
    for n in table.columns:
        if n:  # threshold 0 flags every row, so it is not plotted
            sns.lineplot(
                x=table.index,
                y=table[n],
                ax=panel_ax,
                label=label_template.format(n=n),
                marker="8",
                markeredgecolor='k'
            )
    panel_ax.set_xlabel("Durée des Intervalles Consécutifs (Heures)")
    panel_ax.set_ylabel(y_label)
    panel_ax.set_xlim([0, 6])
    panel_ax.set_ylim(y_limit)
    panel_ax.legend(title="Variables Manquantes",  fontsize=8, title_fontsize=10)
    panel_ax.grid(True)

# Leave room at the top for the figure-level title.
plt.tight_layout(rect=[0, 0, 1, 0.95])

plt.show()
# "Analyse des Intervalles Consécutifs par Seuil de Variables Manquantes"


In [None]:
# Seaborn theme applied from this cell onwards.
sns.set_theme(style="whitegrid", palette="pastel")

# Marker shapes; `plot_data` picks one per column label (number of missing variables).
markers = ["o", "s", "D", "^", "v", "<", ">", "p", "h", "*"]

def plot_data(ax, df, x_label, y_label, x_limit, y_limit, legend_title, remove_yticks=False):
    """Draw one line per non-zero column of ``df`` on ``ax`` and format the axes.

    Parameters
    ----------
    ax : matplotlib axes to draw on.
    df : pandas.DataFrame whose index is the x variable and whose integer column
        labels are the thresholds; the column labelled 0 is skipped.
    x_label, y_label : axis labels. ``y_label`` is ignored when ``remove_yticks``
        is true.
    x_limit, y_limit : axis limits passed to ``set_xlim`` / ``set_ylim``.
    legend_title : title of the legend box.
    remove_yticks : when true, the y ticks and the y label are removed.
    """
    for n in df.columns:
        if n:
            sns.lineplot(
                x=df.index,
                y=df[n],
                ax=ax,
                label=f"{n} Missing Variables",
                # Cycle through the marker list so that tables with more columns
                # than available markers do not raise IndexError.
                marker=markers[n % len(markers)],
                markeredgecolor='k',
                linestyle="-"
            )
    ax.set_xlabel(x_label)
    if not remove_yticks:
        ax.set_ylabel(y_label)
    else:
        ax.set_yticks([])
        ax.set_ylabel("")
    ax.set_xlim(x_limit)
    ax.set_ylim(y_limit)
    ax.legend(title=legend_title, fontsize=8, title_fontsize=10)
    ax.grid(False)

# 2 x 2 comparison grid: MIMIC-IV in the left column, CHU in the right column;
# number of runs on the top row, number of stays on the bottom row.
# NOTE(review): `data_mimic` and `data_chu` are not defined in any visible cell.
fig, axes = plt.subplots(2, 2, figsize=(14, 10), sharex=True)

# Column headers.
fig.text(0.25, 0.95, "MIMIC-IV", ha='center', va='center', fontsize=14, fontweight='bold')
fig.text(0.75, 0.95, "CHU", ha='center', va='center', fontsize=14, fontweight='bold')

# Tables for both datasets, thresholds 0 to 6.
occurrences_df_mimic = calculate_consecutive_intervals(data_mimic, max_missing_values=6)
stays_df_mimic = calculate_stays_by_consecutive_intervals(data_mimic, max_missing_values=6)
occurrences_df_chu = calculate_consecutive_intervals(data_chu, max_missing_values=6)
stays_df_chu = calculate_stays_by_consecutive_intervals(data_chu, max_missing_values=6)

# (axes, table, y label, y limits, hide y ticks) for each panel, in drawing order.
# The CHU panels drop their y ticks and y label.
panel_specs = [
    (axes[0, 0], occurrences_df_mimic, "Number of Occurrences", [0, 60000], False),
    (axes[1, 0], stays_df_mimic, "Number of Stays", [0, 30000], False),
    (axes[0, 1], occurrences_df_chu, "Number of Occurrences", [0, 60000], True),
    (axes[1, 1], stays_df_chu, "Number of Stays", [0, 30000], True),
]
for panel_ax, table, y_label, y_limit, hide_yticks in panel_specs:
    plot_data(
        panel_ax, table,
        x_label="Consecutive Interval Duration (Hours)",
        y_label=y_label,
        x_limit=[0, 7],
        y_limit=y_limit,
        legend_title="Missing Variables",
        remove_yticks=hide_yticks
    )

# Leave room at the top for the column headers.
plt.tight_layout(rect=[0, 0, 1, 0.9])
plt.show()


In [None]:
# Draw the CHU tables (thresholds 0 to 6) on the right-hand column of an existing
# 2 x 2 `axes` grid.
# NOTE(review): `data_chu` is not defined in any visible cell, and this cell relies
# on `axes` left over from an earlier cell.
occurrences_df = calculate_consecutive_intervals(data_chu, max_missing_values=6)
stays_df = calculate_stays_by_consecutive_intervals(data_chu, max_missing_values=6)

# Each panel plots the table computed just above. The top panel previously
# indexed `occurences_df_rel` (built elsewhere from `data` with thresholds 0 to 4)
# while looping over the columns 0 to 6 of `occurrences_df`, which plots another
# dataset and fails with KeyError on thresholds 5 and 6.
panels = [
    (axes[0,1], occurrences_df, "{n} variables", "Nombre d'occurrences", [0, 60000]),
    (axes[1,1], stays_df, "{n} Variables Manquantes", "Nombre de séjours", [0, 30000]),
]
for panel_ax, table, label_template, y_label, y_limit in panels:
    for n in table.columns:
        if n:  # threshold 0 flags every row, so it is not plotted
            sns.lineplot(
                x=table.index,
                y=table[n],
                ax=panel_ax,
                label=label_template.format(n=n),
                marker ="8",
                markeredgecolor='k'
            )
    panel_ax.set_xlabel("Durée des Intervalles Consécutifs (Heures)")
    panel_ax.set_ylabel(y_label)
    panel_ax.set_xlim([0, 7])
    panel_ax.set_ylim(y_limit)
    panel_ax.legend(title="Variables Manquantes",  fontsize=8, title_fontsize=10)
    panel_ax.grid(True)

In [None]:
# Per-interval share of missing values for each variable, plus the per-interval
# mean of `total_missing`, sorted by interval.
missing_means = (
    data
    .group_by("intervalle")
    .agg(
        [
            pl.col(feature).is_null().cast(pl.UInt32).mean().alias(f"mean_{feature}_missing")
            for feature in ("fr", "heart_rate", "spo2", "pam")
        ]
        + [pl.col("total_missing").mean().alias("mean_total_missing")]
    )
    .sort("intervalle")
)

# pandas copy for seaborn.
missing_means_df = missing_means.to_pandas()

plt.figure(figsize=(12, 6))

# One line per variable.
for column, legend_label in (
    ("mean_fr_missing", "Missing RR"),
    ("mean_heart_rate_missing", "Missing HR"),
    ("mean_spo2_missing", "Missing SpO2"),
    ("mean_pam_missing", "Missing MBP"),
):
    sns.lineplot(data=missing_means_df, x="intervalle", y=column, label=legend_label)

# The overall mean is emphasised with a thick dashed black line.
sns.lineplot(
    data=missing_means_df,
    x="intervalle",
    y="mean_total_missing",
    label="Total Missing",
    linestyle="--",
    linewidth=2.5,
    color='black',
)

plt.xlabel("Time Interval (hours)")
plt.ylabel("Average Number of Missing Data")
plt.title("Average Number of Missing Data Over Time")
plt.legend()
plt.grid(False)

plt.show()


In [None]:
# Display the CHU table.
# NOTE(review): `data_chu` is not defined in any visible cell.
data_chu

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import polars as pl

# Example data processing for MIMIC and CHU (replace with actual data)
# mimic_data = ...
# chu_data = ...

def calculate_missing_means(data):
    """Return per-interval missing-data rates as a pandas DataFrame.

    ``data`` is grouped by ``intervalle``. For each of ``fr``, ``heart_rate``,
    ``spo2`` and ``pam`` the share of null values is computed (the ``pam`` result
    is stored as ``mean_MBP_missing``), together with the mean of
    ``total_missing``. Rows are sorted by ``intervalle``.
    """
    # Source column -> name of the resulting null-share column.
    null_share_aliases = {
        "fr": "mean_fr_missing",
        "heart_rate": "mean_heart_rate_missing",
        "spo2": "mean_spo2_missing",
        "pam": "mean_MBP_missing",
    }
    aggregations = [
        pl.col(column).is_null().cast(pl.UInt32).mean().alias(alias)
        for column, alias in null_share_aliases.items()
    ]
    aggregations.append(pl.col("total_missing").mean().alias("mean_total_missing"))

    per_interval = data.group_by("intervalle").agg(aggregations).sort("intervalle")
    return per_interval.to_pandas()

# Per-interval missing-data rates for the dataset loaded in `data`.
missing_means= calculate_missing_means(data)

# Only one dataset is plotted in this cell, so a single panel is created; the
# previous 2 x 1 grid left its second subplot empty.
fig, ax = plt.subplots(figsize=(12, 6))

# One line per variable.
for column, legend_label in (
    ("mean_fr_missing", "FR Missing"),
    ("mean_heart_rate_missing", "Heart Rate Missing"),
    ("mean_spo2_missing", "SpO2 Missing"),
    ("mean_MBP_missing", "MBP Missing"),
):
    sns.lineplot(data=missing_means, x="intervalle", y=column, ax=ax, label=legend_label)

# The overall mean is emphasised with a thick dashed black line.
sns.lineplot(data=missing_means, x="intervalle", y="mean_total_missing", ax=ax, label="Total Missing", linestyle="--", linewidth=2.5, color="black")

ax.set_title(f"Average Missing Data Over Time - {DATASET.upper()} Dataset")
ax.set_ylabel("Mean Missing Values")
ax.set_xlabel("Time (h)")
ax.grid(False)
ax.legend()

plt.tight_layout()
plt.show()


In [None]:
# Display the pandas feature table.
data_features

In [None]:
# Number of rows with spo2 below 80.
# NOTE(review): `data_merge` is not defined at top level in any visible cell.
(data_merge['spo2'] < 80).sum()

In [None]:
# Display the CHU table.
# NOTE(review): `data_chu` is not defined in any visible cell.
data_chu

In [None]:
def _draw_feature_hist(ax, frame, col, color, n_bins, dataset_label):
    """Draw the distribution of ``col`` on ``ax`` with mean +/- 3 SD guide lines.

    The bin width is the (mean - 3 SD, mean + 3 SD) span divided by ``n_bins``.
    The lower bound is drawn in red and the upper bound in green, each annotated
    with its value, and the x axis is limited to one extra SD on both sides.
    """
    mean, std = frame[col].mean(), frame[col].std()
    lower = round(mean - 3 * std, 2)
    upper = round(mean + 3 * std, 2)

    sns.histplot(frame, x=col, kde=True, ax=ax, color=color,
                 binwidth=(upper - lower) / n_bins, stat='probability')
    ax.axvline(lower, color='red', linestyle='--')
    ax.axvline(upper, color='green', linestyle='--')
    label_height = ax.get_ylim()[1] * 0.9
    ax.text(lower, label_height, f'{lower}', color='red')
    ax.text(upper, label_height, f'{upper}', color='green')
    ax.set_title(f'Distribution de {col} - {dataset_label}', fontsize=14)
    ax.set_xlim(lower - std, upper + std)


# One figure per feature: MIMIC on the left, CHU on the right, sharing the y axis.
# NOTE(review): `data_mimic` and `data_chu` are not defined in any visible cell.
for cols in data_features:
    fig, axes = plt.subplots(1, 2, figsize=(20, 10), sharey=True)

    # MIMIC uses coarser bins for spo2 (10 across the +/- 3 SD span instead of 20).
    _draw_feature_hist(axes[0], data_mimic, cols, 'skyblue', 10 if cols == 'spo2' else 20, 'MIMIC')
    # The CHU x-limits are now applied to the CHU panel; they used to be applied
    # to axes[0], overriding the MIMIC limits and leaving the CHU panel unbounded.
    _draw_feature_hist(axes[1], data_chu, cols, 'orange', 20, 'CHU')

    plt.tight_layout()
    # Save the figure that was just drawn. Creating a new figure right before
    # savefig, as was done previously, writes a blank image.
    fig.savefig(f'{DATA_FOLD}/{VERSION}/3.analysis/imputation_48/global/features_distributions/{cols}_hist.png')
    plt.show()
    # Release the figure so the loop does not accumulate open figures.
    plt.close(fig)

# Relationship between missing values

In [None]:
# List the columns of the CHU table.
# NOTE(review): `data_chu` is not defined in any visible cell.
data_chu.columns

In [None]:
# pandas copy of the CHU table restricted to the identifier columns and the
# monitored variables.
data_chu_pd = data_chu.select(
    ['encounterId', 'intervalle', 'heart_rate', 'spo2', 'fr', 'pad', 'pam', 'pas']
).to_pandas()

In [None]:
# Copy of the CHU table in which each feature column is replaced by its
# missingness indicator (True where the value is missing).
data_chu_null = data_chu_pd.copy()
# `data_features` is a DataFrame, so its column labels (not the frame itself)
# must be used to select the columns; indexing with the frame is interpreted
# as a boolean mask and fails.
feature_cols = list(data_features.columns)
data_chu_null[feature_cols] = data_chu_null[feature_cols].isnull()