## Libraries

In [None]:
import polars as pl
import os
import re
import json

## Params

In [None]:
# Load the run configuration from ../params.json.
# JSON is UTF-8 by specification, so the encoding is given explicitly instead
# of relying on the platform default.
with open('../params.json', 'r', encoding='utf-8') as file:
    params = json.load(file)

# Dataset name, pipeline version and root data folder read from the parameters.
DATASET, VERSION = params['dataset'], params['version']
DATA_FOLD = params['data_folder']


In [None]:
# Show the version value loaded from params.json.
VERSION

In [None]:
# Folder holding the raw inputs of the selected dataset/version
# (the census and IGS files are read from here).
INPUT_FOLDER = f'{DATA_FOLD}/{VERSION}/1.raw_data/{DATASET}/'
# Folder receiving the cleaned static outputs written at the end of the notebook.
OUTPUT_FOLDER = f'{DATA_FOLD}/{VERSION}/2.clean_data/{DATASET}/static/'

In [None]:
# Input file paths, relative to INPUT_FOLDER.
CENSUS_FILE = 'census/raw_census.parquet'
IGS_FILE = 'igs/igs_all.csv'

# Output file name, relative to OUTPUT_FOLDER.
OUTPUT_STATIC_FILE = 'clean_static_encounters.parquet'
# NOTE(review): leftover fragment "_dataset_2024-12-18" — presumably a dated
# file-name suffix from an earlier run; confirm whether it can be removed.

## Helper functions

In [None]:
def clean_percentage(value):
    """Convert a percentage label into a fraction.

    The optional 'Mortalité prédite : ' prefix and the '%' sign are removed,
    a decimal comma is turned into a decimal point, and the resulting number
    is divided by 100.

    Returns:
        The fraction as a float, or None when *value* cannot be converted
        (for example when it is not a string or holds no number).
    """
    try:
        without_prefix = value.replace('Mortalité prédite : ', '')
        numeric_text = without_prefix.replace('%', '').replace(',', '.')
        return float(numeric_text) / 100
    except Exception:
        return None

def clean_encounter(value):
    """Return the digits of an encounter identifier as a string.

    Every non-digit character (letters, spaces, punctuation) is stripped from
    *value*. Leading zeros are preserved.

    The previous implementation validated the raw input with ``int(value)``
    rather than the stripped digits, so any identifier containing a letter
    was rejected and the stripping had no effect.

    Args:
        value: Identifier as a string.

    Returns:
        The digits-only identifier, or None when *value* is not a string or
        contains no digit at all.
    """
    try:
        # Raw string: '\D' in a plain literal is an invalid escape sequence.
        digits = re.sub(r'\D', '', value)
    except TypeError:
        # Non-string input (None, numbers, ...) cannot be cleaned.
        return None
    # An identifier without any digit is unusable.
    return digits or None

def clean_igs(value):
    """Convert *value* to a float.

    Returns:
        The float value, or None when *value* cannot be converted
        (None, non-numeric text, an integer too large for a float, ...).
    """
    try:
        return float(value)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are absorbed; a bare ``except`` would also
        # swallow KeyboardInterrupt and SystemExit.
        return None


## Import dataset

In [None]:
# Read the census table (parquet) and the IGS table (CSV) from the input folder.
raw_data = pl.read_parquet(f'{INPUT_FOLDER}{CENSUS_FILE}')
df_igs = pl.read_csv(f'{INPUT_FOLDER}{IGS_FILE}')

In [None]:
# Preview of the first rows of the census table.
raw_data.head()

# Shape of the census restricted to the listed units, with one row kept per
# 'encounterid'.
# NOTE(review): this uses the lower-case column names ('displaylabel',
# 'encounterid') that are renamed only in the MIMIC branch further down;
# presumably the cell fails on the CHU dataset — confirm.
raw_data.filter(pl.col('displaylabel').is_in([
                'Neuro Surgical Intensive Care Unit (Neuro SICU)',
                'Surgical Intensive Care Unit (SICU)',
                'Neuro Stepdown',
                'Medical/Surgical Intensive Care Unit (MICU/SICU)',
                'Cardiac Vascular Intensive Care Unit (CVICU)', # NOTE(review): original comment said CCV surgery is removed to match the CHU dataset, yet the unit is still listed here — confirm
                'Neuro Intermediate',
                'Coronary Care Unit (CCU)',
                'Medical Intensive Care Unit (MICU)',
                'Trauma SICU (TSICU)'
                ])).unique('encounterid').shape

In [None]:
if DATASET == 'mimic':
    # Rename the lower-case MIMIC columns to the names used by the rest of the
    # notebook, then add the columns that are referenced later but absent from
    # this rename as all-null placeholders.
    raw_data = (
        raw_data
        .rename({
            'displaylabel': 'displayLabel',
            'encounterid': 'encounterId',
            'encounternumber': 'encounterNumber',
            'lifetimenumber': 'lifeTimeNumber',
            'dateofdeath': 'dateOfDeath',
            'isdeceased': 'isDeceased',
            'utcintime': 'utcInTime',
            'utcouttime': 'utcOutTime',
            'lengthofstay': 'lengthOfStay',
            'height': 'taille',
            'weight': 'poids_admission',
        })
        .with_columns([
            pl.lit(None).alias(placeholder)
            for placeholder in (
                'adresse',
                'ville',
                'cp',
                'dateOfBirth',
                'lastName',
                'firstName',
            )
        ])
    )
elif DATASET == 'chu':
    # Height and weight are added as all-null placeholders for the CHU census.
    raw_data = raw_data.with_columns([
        pl.lit(None).alias(placeholder)
        for placeholder in ('taille', 'poids_admission')
    ])

In [None]:
# List the distinct unit labels present in the census.
raw_data.get_column('displayLabel').unique().to_list()

## Data cleaning

### Separate mortality 

In [None]:
# One row per encounter flagged as deceased or carrying a date of death:
# the maximum of the flag and the first date of death found for the encounter.
# The identifier is cast to string.
df_mortality = (
    raw_data
    .select(['encounterId', 'dateOfDeath', 'isDeceased'])
    .filter(
        pl.col('dateOfDeath').is_not_null() | (pl.col('isDeceased') == True)
    )
    .group_by('encounterId')
    .agg(
        isDeceased=pl.col('isDeceased').max(),
        dateOfDeath=pl.col('dateOfDeath').first(),
    )
    .cast({'encounterId': pl.String})
)

In [None]:
# Show the per-encounter mortality table.
df_mortality

### Regroup by Encounter

In [None]:
# Unit labels kept when filtering the census, per dataset.
if DATASET == 'chu':
    icu_units = [
        'PURPAN REA. POLY.',
        'IUC REA.',
        'NEURO-CHIR REA',
        'RANGUEIL REA. POLY.',
        'RANGUEIL DECHO. REA.',
        'PURPAN DECHO. REA.',
        'PURPAN SC. REA.',
        'RANGUEIL SC. REA.',
        'IUC SC.',
    ]
elif DATASET == 'mimic':
    icu_units = [
        'Neuro Surgical Intensive Care Unit (Neuro SICU)',
        'Surgical Intensive Care Unit (SICU)',
        'Neuro Stepdown',
        'Medical/Surgical Intensive Care Unit (MICU/SICU)',
        # 'Cardiac Vascular Intensive Care Unit (CVICU)' is left out:
        # CCV surgery is removed to match the CHU dataset.
        'Neuro Intermediate',
        'Coronary Care Unit (CCU)',
        'Medical Intensive Care Unit (MICU)',
        'Trauma SICU (TSICU)',
    ]
else:
    # Fail fast: without this branch icu_units would stay undefined (or keep a
    # stale value from a previous run) and the filter below would misbehave.
    raise ValueError(
        f"Unsupported DATASET {DATASET!r}: expected 'chu' or 'mimic'"
    )

In [None]:
encounter_df = (
    raw_data
    # Keep only ICU stays (without UTO, paediatric ICU, CCV and Burns) of
    # patients aged 18 or more.
    .filter(
        pl.col('displayLabel').is_in(icu_units) & (pl.col('age') >= 18)
    )
    # Cast the encounter identifier to string and pass it through
    # clean_encounter.
    .with_columns(
        pl.col('encounterId')
        .cast(pl.String)
        .map_elements(clean_encounter, return_dtype=pl.String)
        .alias('encounterId')
    )
    .sort(by=['encounterId', 'utcInTime'])
    # Regroup the rows of an encounter into a single row.
    .group_by([
        'encounterId',
        'encounterNumber',
        'lifeTimeNumber',
        'lastName',
        'firstName',
        'gender',
        'age',
        'dateOfBirth',
    ])
    .agg(
        # Earliest entry and latest exit over the rows of the encounter.
        utcInTime=pl.col('utcInTime').min(),
        utcOutTime=pl.col('utcOutTime').max(),
        # Label of the row with the earliest entry time.
        unitLabel=(
            pl.col("displayLabel")
            .filter(pl.col("displayLabel").is_not_null())
            .sort_by('utcInTime')
            .first()
        ),
        adresse=pl.col('adresse').max(),
        ville=pl.col('ville').max(),
        cp=pl.col('cp').max(),
        taille=pl.col('taille').first(),
        poids_admission=pl.col('poids_admission').first(),
    )
    .join(df_mortality, how='left', on='encounterId')
    # Length of stay in hours, rounded to two decimals.
    .with_columns(
        los=(
            (pl.col('utcOutTime') - pl.col('utcInTime')).dt.total_minutes() / 60
        ).round(2)
    )
    # An encounter is deceased as soon as the mortality table provided a flag
    # or a date of death for it; every other encounter is set to False.
    .with_columns(
        isDeceased=pl.when(
            pl.col('isDeceased').is_not_null() | pl.col("dateOfDeath").is_not_null()
        )
        .then(pl.lit(True))
        .otherwise(pl.lit(False))
    )
)

In [None]:
# Show the per-encounter table.
encounter_df

## Extended demography

### Import datasets

In [None]:
# Start from the per-encounter table; extended features are joined for CHU only.
df_demo_extended = encounter_df
if DATASET == 'chu':
    # Drop the height/weight columns; 'taille' and 'poids_admission' are
    # handled as numeric features in the loop below.
    df_demo_extended = df_demo_extended.drop(['taille', 'poids_admission'])
    directory = INPUT_FOLDER + 'extended_demography/'
    # Sorted so that the order of the joined columns is reproducible
    # (os.listdir returns entries in arbitrary order).
    for filename in sorted(os.listdir(directory)):
        # Use the reader matching the extension: CSV files were previously
        # accepted by the filter but read with read_parquet, which fails.
        if filename.endswith('.parquet'):
            df_extended_feature = pl.read_parquet(directory + filename)
        elif filename.endswith('.csv'):
            df_extended_feature = pl.read_csv(directory + filename)
        else:
            continue
        print(df_extended_feature.shape)
        # The feature name is taken from the first row of the 'feature' column.
        feature = df_extended_feature.get_column('feature').to_list()[0]
        print(feature)
        # Keep, for each encounter, the row with the latest chart time.
        df_extended_feature = df_extended_feature.sort(
            'encounterId', 'utcChartTime'
        ).unique(subset=['encounterId'], keep='last')
        # Height and weight come from the numeric value column, every other
        # feature from the string value column.
        if feature in ('taille', 'poids_admission'):
            value_column = 'valueNumber'
        else:
            value_column = 'valueString'
        df_extended_feature = df_extended_feature.rename(
            {value_column: feature}
        ).select('encounterId', feature)
        df_demo_extended = df_demo_extended.join(
            df_extended_feature, on='encounterId', how='left'
        )

In [None]:
# Preview the first three rows of the extended demography table.
df_demo_extended.head(3)

## IGS data

In [None]:
if DATASET == 'mimic':
    # Keep one IGS row per encounter: after sorting by encounter and
    # descending 'sapsii', the first row of each encounter is retained.
    # NOTE(review): with the default null ordering of sort(), rows with a null
    # 'sapsii' may come first — confirm this is acceptable.
    df_igs_clean = (
        df_igs
        .rename({
            'encounterid': 'encounterId',
            'admissiontype_score': 'admission_type',
        })
        .sort(['encounterId', 'sapsii'], descending=[False, True])
        .unique(subset='encounterId', keep='first')
        # Translate the admission-type score into a label
        # (8, 0 and 6 are mapped; any other value becomes null).
        .with_columns(
            admission_type=pl.when(pl.col('admission_type') == 8)
            .then(pl.lit('Unscheduled Surgery'))
            .when(pl.col('admission_type') == 0)
            .then(pl.lit('Scheduled Surgery'))
            .when(pl.col('admission_type') == 6)
            .then(pl.lit('Medical'))
            .otherwise(None)
        )
        # String identifier, as in the encounter table it is joined to.
        .with_columns(pl.col('encounterId').cast(pl.String))
        .select(['encounterId', 'admission_type', 'sapsii', 'sapsii_prob'])
    )
    df_demo_extended = df_demo_extended.join(
        df_igs_clean, how='left', on='encounterId'
    )

In [None]:
if DATASET == 'chu':
    # Keep one IGS row per encounter number: after sorting by encounter number
    # and 'igsStoreTime', the first row of each encounter number is retained.
    df_igs_clean = (
        df_igs
        .with_columns(
            encounterNumber=pl.col('encounterNumber').map_elements(
                clean_encounter, return_dtype=pl.String
            ),
            sapsii_prob=pl.col('igsMort').map_elements(
                clean_percentage, return_dtype=pl.Float64
            ),
        )
        .sort(['encounterNumber', 'igsStoreTime'])
        .unique(subset='encounterNumber', keep='first')
        # Translate the admission-type code into a label
        # (0, 2 and 1 are mapped; any other value becomes null).
        .with_columns(
            admission_type=pl.when(pl.col('igsTypeAdm') == 0)
            .then(pl.lit('Medical'))
            .when(pl.col('igsTypeAdm') == 2)
            .then(pl.lit('Unscheduled Surgery'))
            .when(pl.col('igsTypeAdm') == 1)
            .then(pl.lit('Scheduled Surgery'))
            .otherwise(None)
        )
        .rename({'igsTotal': 'sapsii'})
        .select(['encounterNumber', 'admission_type', 'sapsii', 'sapsii_prob'])
    )
    df_demo_extended = df_demo_extended.join(
        df_igs_clean, how='left', on='encounterNumber'
    )

In [None]:
# Show the cleaned IGS table built for the selected dataset.
df_igs_clean

In [None]:
# Number of distinct encounter identifiers after the joins.
df_demo_extended.n_unique('encounterId')

### Admission type (from IGS)

## Pseudonymisation

In [None]:
# Columns treated as identifying: they are selected into the correlation table
# and excluded from the pseudonymised dataset.
# NOTE(review): 'conclusion' and 'motif_adm' are not created anywhere explicitly;
# presumably they come from the extended demography feature files — confirm.
col_identifiantes = [
    'encounterId',
    'encounterNumber',
    'lifeTimeNumber',
    'lastName',
    'firstName',
    'dateOfBirth',
    'cp',
    'ville',
    'adresse',
    'utcInTime',
    'utcOutTime',
    'dateOfDeath',
    'conclusion',
    'motif_adm'
]

In [None]:
if DATASET == 'chu':
    # 1-based row index acting as the key shared by both tables.
    df_indexed = df_demo_extended.with_row_index(name='index', offset=1)
    # Correlation table: the key plus every identifying column. The key was
    # previously missing, which made it impossible to link a pseudonymised row
    # back to its identifying data.
    table_corr = df_indexed.select(['index'] + col_identifiantes)
    # Pseudonymised dataset: identifying columns are removed; only the year of
    # the entry time is kept from the timestamps.
    df_pseudonymised = df_indexed.with_columns(
        year_inTime=pl.col('utcInTime').dt.year()
    ).select(pl.exclude(col_identifiantes))


## Save dataset

In [None]:
# Create the output folder when it does not exist yet, then write the static
# encounter table (this table still contains the identifying columns).
os.makedirs(OUTPUT_FOLDER, exist_ok=True)
df_demo_extended.write_parquet(OUTPUT_FOLDER + OUTPUT_STATIC_FILE)

In [None]:
if DATASET == 'chu':
    # CHU only: write the correlation table (CSV) and the pseudonymised
    # dataset (parquet) next to the static file.
    table_corr.write_csv(f'{OUTPUT_FOLDER}correlation_table.csv')
    df_pseudonymised.write_parquet(
        f'{OUTPUT_FOLDER}clean_pseudonimysed_dataset.parquet'
    )

In [None]:
# Show the final static table.
df_demo_extended