In [None]:
import json
import polars as pl
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from pypots.imputation import SAITS
from tqdm import tqdm
import seaborn as sns
import seaborn_qqplot as sqp
import matplotlib.pyplot as plt
from scipy.stats import norm

In [None]:
# Load the run parameters shared across notebooks and expose the three
# values used below as module-level constants.
with open('../params.json', 'r') as params_file:
    params = json.load(params_file)

DATASET = params['dataset']
VERSION = params['version']
DATA_FOLD = params['data_folder']

print(f'Working on {DATASET} dataset {VERSION}')

In [None]:
# Root folder under which this notebook reads and writes its error CSVs
# (always with a '/datasets/...' suffix appended).
results_folder = f'{DATA_FOLD}/{VERSION}/3.analysis/outliers/{DATASET}'

In [None]:
# Gates the cell that instantiates and loads the SAITS model. The same flag is
# reassigned later in the notebook, before the leave-one-out error computation.
impute = True

In [None]:
# Both input tables live under the same imputation folder for this dataset.
_imputation_dir = f'{DATA_FOLD}/{VERSION}/3.analysis/imputation_48/{DATASET}'

IMPUTED_DATASET = f'{_imputation_dir}/tables/first_48_with_static_imputed_saits.parquet'
ORIGINAL_DATASET = f'{_imputation_dir}/first_48h_with_static.parquet'

In [None]:
# Read the SAITS-imputed table and the original table, in that order.
df_imputed, df_original = (
    pl.read_parquet(path) for path in (IMPUTED_DATASET, ORIGINAL_DATASET)
)

In [None]:
# Single standardizer shared by the rest of the notebook: fitted once on the
# imputed table, then used by to_3d_array and for de-standardization.
scaler = StandardScaler()

In [None]:
# Fit on every column of the imputed table from index 2 onward.
# NOTE(review): to_3d_array later transforms seven columns selected *by name*;
# confirm that df_imputed's columns [2:] are exactly those seven, in that order.
scaler.fit(df_imputed.to_numpy()[:,2:])

In [None]:
# Quick look at columns 2 to 5 of the imputed table as a raw (unscaled) array.
df_imputed.to_numpy()[:,2:6]

In [None]:
def to_3d_array(df,
                feature_cols=('heart_rate', 'spo2', 'fr', 'pam',
                              'gender', 'age', 'admission_type'),
                n_steps=48):
    """Standardize the selected columns of `df` and stack them per series.

    Parameters
    ----------
    df : polars.DataFrame
        Long-format table whose rows are consecutive time steps; every series
        must contribute exactly `n_steps` consecutive rows.
    feature_cols : sequence of str, optional
        Columns to extract, in the order the module-level `scaler` was fitted
        on. Defaults to the seven columns this notebook works with.
    n_steps : int, optional
        Number of rows per series (default 48).

    Returns
    -------
    numpy.ndarray
        Array of shape (n_series, n_steps, len(feature_cols)) in standardized
        units.

    Raises
    ------
    ValueError
        Raised by the reshape when the row count is not a multiple of
        `n_steps`.
    """
    feature_cols = list(feature_cols)

    array_df = df.select(pl.col(feature_cols)).to_numpy()
    # Uses the module-level scaler so both tables share the same statistics.
    array_df = scaler.transform(array_df)

    return array_df.reshape(-1, n_steps, len(feature_cols))

In [None]:
# Same standardize-and-stack transformation for both tables, imputed first.
df_imputed_3d, df_original_3d = (
    to_3d_array(table) for table in (df_imputed, df_original)
)

In [None]:
# True wherever exactly one of the two arrays holds a NaN at that position.
mask = np.logical_xor(np.isnan(df_original_3d), np.isnan(df_imputed_3d))

In [None]:
if impute:
    # The model dimensions follow the last two axes of the stacked array.
    n_steps, n_features = df_original_3d.shape[1:]

    saits = SAITS(
        n_steps=n_steps,
        n_features=n_features,
        n_layers=3,
        d_model=512,
        d_ffn=128,
        n_heads=8,
        d_k=64,
        d_v=64,
    )
    # Restore previously saved weights instead of training here.
    saits.load(path=f'{DATA_FOLD}/{VERSION}/4.models/imputation/mimic/saits/saits_two_days_with_val.pypots')

In [None]:
# Sanity check: the stacked array should be (n_series, 48, 7).
df_imputed_3d.shape

In [None]:
# Full slice of the standardized imputed array; this is the data the
# leave-one-out error computation below works on.
test_data = df_imputed_3d[:,:,:]

In [None]:
# Reassigned after the model-loading cell: with False, the next cell takes its
# else-branch and reads the cached CSVs instead of recomputing the errors.
impute = False

In [None]:
if impute:
    # Leave-one-out reconstruction error: every entry where `mask` is False is
    # hidden in turn, re-imputed by SAITS from the rest of its series, and
    # compared with the value that was hidden. All of this is in standardized
    # units.
    errors = np.zeros_like(test_data, dtype=float)
    imputed_values = np.full_like(test_data, fill_value=np.nan, dtype=float)

    num_series, seq_length, num_features = test_data.shape

    for i in tqdm(range(num_series), desc="Processing series"):
        for t in range(seq_length):
            for f in range(num_features):
                if not mask[i, t, f]:
                    # Work on a copy so hiding one value never alters test_data.
                    data_temp = test_data[i].copy()
                    data_temp[t, f] = np.nan
                    data_temp = data_temp.reshape(1, seq_length, num_features)

                    imputed_series = saits.impute({'X': data_temp})
                    imputed_value = imputed_series[0, t, f]

                    errors[i, t, f] = test_data[i, t, f] - imputed_value
                    imputed_values[i, t, f] = imputed_value
                else:
                    # Entries flagged by `mask` are not reconstructed: the error
                    # gets the sentinel 1 and the imputed value stays NaN.
                    errors[i, t, f] = 1
                    imputed_values[i, t, f] = np.nan

    # De-standardization.
    # An error is a *difference* of two standardized values, so converting it
    # back to original units only multiplies by the per-feature scale;
    # inverse_transform would also add the feature mean and shift every error.
    # NOTE(review): the sentinel 1 written for masked entries is rescaled too.
    errors_reshaped = errors.reshape(-1, num_features) * scaler.scale_
    # Imputed values are actual values, so the full inverse transform applies.
    imputed_reshaped = scaler.inverse_transform(
        imputed_values.reshape(-1, num_features)
    )

    # Series id and time-step index for every flattened row.
    ids = np.repeat(np.arange(num_series), seq_length)
    intervalles = np.tile(np.arange(seq_length), num_series)

    # Build the two output tables with generic feature names f1..fN.
    columns = [f'f{i+1}' for i in range(num_features)]
    df_score_sample = pl.DataFrame({
        'id': ids,
        'intervalle': intervalles,
        **{columns[i]: errors_reshaped[:, i] for i in range(num_features)}
    })

    df_imputed_values = pl.DataFrame({
        'id': ids,
        'intervalle': intervalles,
        **{columns[i]: imputed_reshaped[:, i] for i in range(num_features)}
    })

    # Cache the results so later runs can skip the loop above.
    df_score_sample.write_csv(results_folder + '/datasets/saits_errors.csv')
    df_imputed_values.write_csv(results_folder + '/datasets/saits_full_imputed_values.csv')

else:
    # Reuse the tables cached by a previous run of the branch above.
    df_score_sample = pl.read_csv(results_folder + '/datasets/saits_errors.csv')
    df_imputed_values = pl.read_csv(results_folder + '/datasets/saits_full_imputed_values.csv')

    columns = [col for col in df_score_sample.columns if col not in ('id', 'intervalle')]

# Feature-only matrices (no id / intervalle columns), one row per
# (series, time step); identical extraction for both branches.
errors_numpy = df_score_sample.select(columns).to_numpy()
imputed_numpy = df_imputed_values.select(columns).to_numpy()



In [None]:
# Flatten all per-feature errors into one long vector.
error_1d = errors_numpy.ravel()

In [None]:
from scipy.stats import skew, kurtosis

# Print the skewness, then the kurtosis, of the pooled errors.
for moment in (skew, kurtosis):
    print(moment(error_1d))



In [None]:
# Echo which dataset this run is configured for.
DATASET

In [None]:
# Sorted error table; `pos` is the rank of each error after sorting.
data = {'error': error_1d}
error_pd = (
    pd.DataFrame(data)
    .sort_values(by='error')
    .reset_index(drop=True)
    .assign(pos=lambda frame: range(len(frame)))
)

In [None]:
# Display the sorted error table together with its rank column `pos`.
error_pd

In [None]:

# Histogram of the pooled errors, zoomed on the central region.
ax = sns.histplot(error_1d, binwidth=0.05)

ax.set_xlim(-3, 3)
ax.set_ylim(0, 800)

In [None]:
# Z-score the pooled errors using their own mean and standard deviation.
mean_error, std_error = error_1d.mean(), error_1d.std()
error_normalize = (error_1d - mean_error) / std_error

In [None]:
# Histogram of the z-scored errors with fixed axis limits.
ax = sns.histplot(error_normalize)

ax.set_xlim(-5, 5)
ax.set_ylim(0, 15000)

In [None]:
import statsmodels.api as sm
import pylab as py

# Q-Q plot of the pooled errors; line='45' asks statsmodels for the
# 45-degree reference line.
qq_figure = sm.qqplot(error_1d, line='45')
py.show()

In [None]:
# Export only the per-feature error columns (no id / intervalle) under a
# second file name. The table is taken from df_score_sample, which exists
# whether the errors were recomputed or loaded from cache; the previous code
# read `errors`, which is undefined on the cached path, and reshaped to 6
# columns although there are 7 features.
df_score_sample.select(columns).write_csv(results_folder + '/datasets/saits_errors_imputed_mimic.csv')

In [None]:
# Undo the standardization for plotting: flatten to (rows, 7), invert the
# scaler, then restore the (n_series, 48, 7) layout.
flat_test_data = test_data.reshape(-1, 7)
display_data = scaler.inverse_transform(flat_test_data)
reshaped_display_data = np.reshape(display_data, (-1, 48, 7))

In [None]:
# Restore the (n_series, 48, 7) layout of the per-feature errors.
error_3d = np.reshape(errors_numpy, (-1, 48, 7))


In [None]:
import math

# Scratch check of erf(2); the value is only displayed, not stored.
math.erf(2)

In [None]:
# Element-wise upper-tail probability of a standard normal at |error|
# (norm.sf is the survival function, i.e. 1 - cdf, with default loc=0, scale=1).
prob_error = norm.sf(np.abs(error_3d))

In [None]:
# SAITS leave-one-out reconstructions in (n_series, 48, 7) layout.
# Built from imputed_numpy (feature columns only): df_imputed_values also
# carries the 'id' and 'intervalle' columns, which would break or misalign a
# reshape to 7 features.
synthetic_patient = imputed_numpy.reshape(-1,48,7)

In [None]:
# Map each error to a score in [0, 1): 0 for a perfect reconstruction,
# approaching 1 as the squared error grows. `alpha` sets how fast it saturates.
alpha = 0.2
squared_error = np.square(error_3d)
exp_error = 1 - np.exp(-alpha * squared_error)
# Masked positions are not forced to 1 here; that override is disabled:
#exp_error[mask[:,:,:]] = 1

In [None]:
# Index along the last axis of the (n_series, 48, 7) arrays; in the column
# order used by to_3d_array, index 3 is 'pam'.
selected_feature = 3

In [None]:
# `synth_value` is first assigned in the plotting cell that follows, so
# displaying it here raised NameError on a fresh kernel; the lookup was removed.

In [None]:
# Draw a random patient among those with a non-zero score for the selected
# feature. np.where(...)[0] lists a patient once per non-zero time step, so
# patients with more non-zero scores are proportionally more likely. The draw
# is not seeded, so each run may show a different patient.
selected_patient = int(np.random.choice(np.where(np.abs(exp_error[:,:,selected_feature]))[0]))

timestamps = np.arange(48)

fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True, figsize=(12, 8))

patient_data = reshaped_display_data[selected_patient,:,selected_feature]
error_value = exp_error[selected_patient,:,selected_feature]
synth_value = synthetic_patient[selected_patient,:,selected_feature]

# First subplot: the patient's values in original units.
ax1.plot(timestamps, patient_data, marker='o', label='Valeur de la variable')
ax1.set_ylabel('Valeur')
ax1.set_ylim(0, 200)
ax1.grid(True)

# Second subplot: the SAITS leave-one-out reconstruction of the same series.
# Its labels previously said "Error" although the plotted series is the
# imputed value.
ax2.plot(timestamps, synth_value, marker='o', color='red', label='Valeur imputée (SAITS)')
ax2.set_title('Valeur imputée par SAITS')
ax2.set_xlabel('Temps')
ax2.set_ylabel('Valeur imputée')
ax2.set_ylim(0, 300)
ax2.grid(True)

# NOTE(review): the title takes the name from df_imputed's columns [2:];
# confirm that this order matches the columns selected in to_3d_array.
fig.suptitle(f"patient {selected_patient}, variable {df_imputed[:,2:].columns[selected_feature]}")

# Lay out after adding the figure title so the title gets its own space.
plt.tight_layout()

plt.show()



In [None]:
fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True, figsize=(12, 8))

patient_data = reshaped_display_data[selected_patient,:,selected_feature]
# The [0, 1] score is exp_error; `scores` was never defined in this notebook.
patient_scores = exp_error[selected_patient,:,selected_feature]

# First subplot: the patient's values in original units.
ax1.plot(timestamps, patient_data, marker='o', label='Valeur de la variable')
ax1.set_ylabel('Valeur')
ax1.grid(True)

# Second subplot: the score over time.
ax2.plot(timestamps, patient_scores, marker='o', color='red', label='Score')
ax2.set_title('Variation du score')
ax2.set_xlabel('Temps')
ax2.set_ylabel('Score')
ax2.set_ylim(0, 1)
ax2.grid(True)

# NOTE(review): the title takes the name from df_imputed's columns [2:];
# confirm that this order matches the columns selected in to_3d_array.
fig.suptitle(f"patient {selected_patient}, variable {df_imputed[:,2:].columns[selected_feature]}")

# Lay out after adding the figure title so the title gets its own space.
plt.tight_layout()

plt.show()
