## Imports

In [None]:
import polars as pl
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import tableone
import json

In [None]:
# Load the run configuration. JSON is UTF-8 by specification, so decode it
# explicitly instead of relying on the platform's default encoding.
with open('../params.json', 'r', encoding='utf-8') as file:
    params = json.load(file)

# Dataset name, pipeline version and root data folder taken from the configuration.
DATASET, VERSION, DATA_FOLD = params['dataset'], params['version'], params['data_folder']

In [None]:
# Folder that receives the demographic analysis outputs for this dataset.
OUTPUT_DIR = '{root}/{version}/3.analysis/demographic/{dataset}/'.format(
    root=DATA_FOLD,
    version=VERSION,
    dataset=DATASET,
)
# Parquet file holding the cleaned static encounter table analysed in this notebook.
DEMO_DATA = '{root}/{version}/2.clean_data/{dataset}/static/clean_static_encounters.parquet'.format(
    root=DATA_FOLD,
    version=VERSION,
    dataset=DATASET,
)

In [None]:
# Use the "pastel" palette as the default colour cycle ...
sns.set_palette(palette='pastel')
# ... and also for matplotlib's colour shorthands.
sns.set_color_codes('pastel')

In [None]:
# Read the cleaned static encounter table into a polars DataFrame.
df = pl.read_parquet(DEMO_DATA)

In [None]:
# Show the pipeline version read from params.json (sanity check).
VERSION

In [None]:
# List the column names of the polars frame.
df.columns

In [None]:
# Number of rows for each distinct value of `unitLabel`.
df['unitLabel'].value_counts()

In [None]:
# Column data types of the polars frame.
df.dtypes

In [None]:
# Summary statistics for every column of the polars frame.
df.describe()

In [None]:
# pandas version of the frame; the seaborn and tableone calls below work on it.
df_pandas = df.to_pandas()

In [None]:
# (rows, columns) of the pandas frame.
df_pandas.shape

In [None]:
# Display a preview of the pandas frame.
df_pandas

## Valeurs numériques

In [None]:
# Report how many distinct values each identifier column holds.
for label, id_column in (
    ('Séjours', 'encounterId'),
    ('IEP', 'encounterNumber'),
    ('IPP', 'lifeTimeNumber'),
):
    # One row is kept per distinct identifier, so the row count is the answer.
    distinct_rows, _ = df.unique(id_column).shape
    print(f'{label} distincts : {distinct_rows}')

### Age

In [None]:
# Age distribution: stacked histogram split by `isDeceased` (left panel) and
# a box plot of the same column (right panel).
f, axes = plt.subplots(1, 2)
sns.histplot(df_pandas, x='age', binrange=[0, 100], binwidth=5, ax=axes[0], hue='isDeceased', multiple='stack')
# A vertical box needs the variable on `y`: when only `x` is given, seaborn
# ignores orient='v' (with a warning) and draws the box horizontally.
sns.boxplot(df_pandas, y='age', ax=axes[1])

### LOS

In [None]:
# Histogram of `los` with bins of width 1 computed over [-5, 200].
f, axes = plt.subplots(1, 1)
sns.histplot(data=df_pandas, x='los', binwidth=1, binrange=[-5, 200])
# Restrict the displayed x range to 0..170.
plt.xlim(0, 170)

In [None]:
# For each threshold from 19 to 48, print how many rows have `los` strictly below it.
for threshold in range(19, 49):
    below_threshold = df.filter(pl.col('los') < threshold)
    print(f"{below_threshold.height} inférieurs à {threshold} ")

### IGS

In [None]:
# Column names available in the pandas frame.
df_pandas.columns

In [None]:
# `sapsii` column: stacked histogram split by `isDeceased` (left) and box plot (right).
f, axes = plt.subplots(1, 2)
sns.histplot(df_pandas, x='sapsii', binrange=[0, 150], binwidth=5, ax=axes[0], hue='isDeceased', multiple='stack')
# A vertical box needs the variable on `y`: when only `x` is given, seaborn
# ignores orient='v' (with a warning) and draws the box horizontally.
sns.boxplot(df_pandas, y='sapsii', ax=axes[1])

# Same pair of plots for `sapsii_prob`, binned over [0, 1] in steps of 0.05.
f, axes = plt.subplots(1, 2)
sns.histplot(df_pandas, x='sapsii_prob', binrange=[0, 1], binwidth=0.05, ax=axes[0], hue='isDeceased', multiple='stack')
sns.boxplot(df_pandas, y='sapsii_prob', ax=axes[1])



### TableOne

In [None]:
# Convert the polars frame to pandas again for the summary table.
df_pandas = df.to_pandas()

# Columns reported in the summary table.
var_demo = [
    'age',
    'gender',
    'los',
    'admission_type',
    'sapsii',
    'isDeceased',
]
# Those of the columns above that are to be treated as categorical.
categorical = [
    'gender',
    'admission_type',
    'isDeceased',
]

In [None]:
# Build the descriptive summary table, without p-values.
mytable = tableone.TableOne(
    df_pandas,
    columns=var_demo,
    categorical=categorical,
    pval=False,
)

In [None]:
# Show which dataset this run is configured for.
DATASET

In [None]:
from pathlib import Path

print(DATASET)
# LaTeX rendering of the summary table, printed to the cell output.
print(mytable.tabulate(tablefmt="latex"))

# Create the target folder first so the HTML export does not fail when the
# output tree has not been created yet.
table_file = OUTPUT_DIR + 'table/tableOne.html'
Path(table_file).parent.mkdir(parents=True, exist_ok=True)
mytable.to_html(table_file)

In [None]:
# BMI statistics are only printed for the 'chu' dataset.
if DATASET == 'chu':
    # Keep rows whose `bmi` lies between 5 and 100.
    bmi_in_range = df_pandas['bmi'].between(5, 100)
    df_imc = df_pandas.loc[bmi_in_range]
    bmi_values = df_imc['bmi']
    # Mean first, then standard deviation.
    print(bmi_values.mean())
    print(bmi_values.std())

## Group By

### Units

In [None]:
# Number of stays per admission unit, coloured by `isDeceased`.
ax = sns.countplot(df_pandas, y='unitLabel', hue='isDeceased', dodge=False, orient='h')
ax.set_xlabel('Nombre de séjours')
ax.set_ylabel("Service d'admission")
# Rename the legend entries using the handles seaborn registered for each hue
# level. Passing only `labels=` pairs them with the first artists on the axes
# (individual bars), which can give both entries the same colour.
# NOTE(review): assumes the hue levels come out as survivors first, then
# deceased (False/0 before True/1) - confirm against the data.
handles, _ = ax.get_legend_handles_labels()
ax.legend(handles=handles, labels=['Survie', 'Décès'])
plt.show()

### Year

In [None]:
# Number of stays for each value of `year_inTime`.
year_ax = sns.countplot(data=df_pandas, x='year_inTime')
# Tilt the tick labels so they do not overlap.
plt.xticks(rotation=45)
year_ax.set_xlabel('Année')
year_ax.set_ylabel('Nombre de séjours')
plt.show()

## Hosmer-Lemeshow

In [None]:
# Column names available in the pandas frame.
df_pandas.columns

In [None]:
# Bin rows into deciles of `sapsii_prob`; labels=False yields integer bin
# indices (0..9) instead of interval labels.
df_pandas['decile'] = pd.qcut(df_pandas['sapsii_prob'], 10, labels=False)

# Mean of `isDeceased` (observed rate) and mean of `sapsii_prob` (predicted
# rate) within each decile.
calibration_data = df_pandas.groupby('decile').agg(
    observed_rate=('isDeceased', 'mean'),
    predicted_rate=('sapsii_prob', 'mean')
).reset_index()

# Draw both rates against the decile index.
plt.figure(figsize=(14, 9))

# Label each curve where it is drawn: a bare `plt.legend(labels=[...])` relies
# on the implicit order of the artists and is easy to get wrong.
sns.lineplot(data=calibration_data, x='decile', y='predicted_rate', marker='o', label='predicted_rate')
sns.lineplot(data=calibration_data, x='decile', y='observed_rate', marker='x', label='observed_rate')

# Grey dashed reference line; it has no label, so it stays out of the legend.
plt.plot([0, 10], [0, 1], linestyle='--', color='gray', marker='x', markersize=4)
plt.xlabel('Decile')
# Bins are indexed 0..9 but displayed as 1..10.
plt.xticks(range(10), [f'{i}' for i in range(1, 11)])
plt.ylabel('Mean hospital mortality rate')
plt.legend()
plt.show()

In [None]:
# Keep only rows that have a `sapsii_prob` value. Take an independent copy so
# that adding a column below cannot trigger pandas' SettingWithCopyWarning.
df_pandas_dropNan = df_pandas.dropna(subset=['sapsii_prob']).copy()
# Decile index (0..9) of `sapsii_prob`.
df_pandas_dropNan['decile'] = pd.qcut(df_pandas_dropNan['sapsii_prob'], 10, labels=False)
# One "husl" colour per distinct `unitLabel` value in the full table.
palette = sns.color_palette("husl", len(df['unitLabel'].unique()))

# Helper that draws one calibration curve on the current matplotlib axes.
def plot_calibration_curve(df, unit=None, linestyle='-', color='blue'):
    """Plot the observed rate against the mean predicted rate, per decile.

    Args:
        df: pandas DataFrame with 'decile', 'isDeceased' and 'sapsii_prob'
            columns, plus 'unitLabel' when ``unit`` is given.
        unit: value of 'unitLabel' to restrict the curve to. ``None`` uses
            every row and labels the curve 'Global'.
        linestyle: matplotlib line style of the curve.
        color: matplotlib colour of the curve.
    """
    # Test against None explicitly so that falsy labels (0, '') are still
    # treated as a real unit instead of silently plotting the global curve.
    per_unit = unit is not None
    if per_unit:
        df = df[df['unitLabel'] == unit]

    # One point per decile: x = mean predicted rate, y = observed rate.
    calibration_data = df.groupby('decile').agg(
        observed_rate=('isDeceased', 'mean'),
        predicted_rate=('sapsii_prob', 'mean')
    ).reset_index()

    plt.plot(
        calibration_data['predicted_rate'],
        calibration_data['observed_rate'],
        marker='o',
        linestyle=linestyle,
        color=color,
        label=f'{unit}' if per_unit else 'Global',
    )

# Global calibration curve, drawn as a solid blue line.
plt.figure(figsize=(10, 6))
plot_calibration_curve(df_pandas_dropNan, linestyle='-', color='blue')

# One dashed curve per unit, each with its own palette colour.
for i, unit in enumerate(df_pandas_dropNan['unitLabel'].unique()):
    plot_calibration_curve(df_pandas_dropNan, unit, linestyle='--', color=palette[i])

# Diagonal y = x: where observed and predicted rates are equal.
plt.plot([0, 1], [0, 1], linestyle='--', color='black')
# The x axis carries the mean predicted rate of each decile, not the decile
# index, so label it accordingly.
plt.xlabel('Taux de mortalité prédit')
plt.ylabel('Taux de mortalité observé')
plt.title('Graphique de calibration de type Hosmer-Lemeshow')
plt.legend()
plt.show()

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score

# Calculer les valeurs de la courbe ROC et l'AUC

# Helper that draws one ROC curve on the current matplotlib axes.
def plot_roc_curve(df, unit=None, linestyle='-', color='blue'):
    """Plot the ROC curve of 'sapsii_prob' against 'isDeceased'.

    Args:
        df: pandas DataFrame with 'isDeceased' and 'sapsii_prob' columns,
            plus 'unitLabel' when ``unit`` is given.
        unit: value of 'unitLabel' to restrict the curve to. ``None`` uses
            every row and labels the curve 'Global'.
        linestyle: matplotlib line style of the curve.
        color: matplotlib colour of the curve.
    """
    # Test against None explicitly so that falsy labels (0, '') are still
    # treated as a real unit instead of silently plotting the global curve.
    per_unit = unit is not None
    if per_unit:
        df = df[df['unitLabel'] == unit]

    fpr, tpr, _ = roc_curve(df['isDeceased'], df['sapsii_prob'])
    auc = roc_auc_score(df['isDeceased'], df['sapsii_prob'])

    # The legend entry carries the AUC rounded to two decimals.
    curve_name = f'{unit}' if per_unit else 'Global'
    plt.plot(fpr, tpr, linestyle=linestyle, color=color, label=f'{curve_name} AUC = {auc:.2f}')

# Global ROC curve first, as a solid blue line.
plt.figure(figsize=(10, 6))
plot_roc_curve(df_pandas_dropNan, linestyle='-', color='blue')

# Then one dashed curve per unit, each with its own palette colour.
unit_labels = df_pandas_dropNan['unitLabel'].unique()
for colour_index, unit_label in enumerate(unit_labels):
    plot_roc_curve(
        df_pandas_dropNan,
        unit_label,
        linestyle='--',
        color=palette[colour_index],
    )

# Dashed black diagonal from (0, 0) to (1, 1) as a reference line.
plt.plot([0, 1], [0, 1], linestyle='--', color='black')
plt.title('Courbe ROC')
plt.xlabel('Taux de faux positifs')
plt.ylabel('Taux de vrais positifs')
plt.legend(loc='lower right')
plt.show()

In [None]:
from sklearn.metrics import confusion_matrix

# Binarise the prediction: a probability strictly above 0.5 counts as positive.
predicted_positive = df_pandas_dropNan['sapsii_prob'] > 0.5
conf_matrix = confusion_matrix(df_pandas_dropNan['isDeceased'], predicted_positive)

# Heat map of the counts, with the same class names on both axes.
class_names = ['Non décédé', 'Décédé']
plt.figure(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.title('Matrice de Confusion Globale')
plt.xlabel('Prédiction')
plt.ylabel('Réalité')
plt.show()