In [None]:
import polars as pl
import json
import time
import os

# Folder that holds the generic SQL template and the feature configuration
QUERIES_FOLDER = 'dynamic_features/'
# Folder scanned later for the per-feature parquet files
OUTPUT_FOLDER = '../2.rawDataset/dynamic_features/'
# Server and database names injected into the connection URI
SERVER = 'SVM-ICCA-REP'
DATABASE = 'CISReportingDB'

In [None]:
# Load the project parameters from params.json
with open('../../params.json', 'r') as params_file:
    params = json.load(params_file)

# Dataset name and version
DATASET, VERSION = (params[key] for key in ('dataset', 'version'))
# Location of the data (input and output)
DATA_FOLD = params['data_folder']

In [None]:
# Path of the cleaned static encounters file for the configured dataset/version
static_dir = f'{DATA_FOLD}/{VERSION}/2.clean_data/{DATASET}/static'
DEMO_DATA = f'{static_dir}/clean_static_encounters.parquet'

# Only the join key and the reference timestamp are used by the later cells
encounters = pl.read_parquet(DEMO_DATA).select(["encounterId", "utcInTime"])

In [None]:
# Connection URI for the reporting database; no user name or password is embedded
# (trusted_connection=true — presumably integrated authentication, confirm with the driver docs)
conn_uri = f'mssql://@{SERVER}/{DATABASE}?trusted_connection=true'

# Generic SQL template; its placeholders are filled per feature with str.format.
# Read through a context manager so the file handle is closed right away.
with open(QUERIES_FOLDER + 'generic_query.sql') as query_file:
    query = query_file.read()

In [None]:
# Feature configuration (one entry per feature), stored next to the SQL template.
# The path is built from QUERIES_FOLDER so both files follow the same constant.
with open(os.path.join(QUERIES_FOLDER, 'features.json'), 'r', encoding='utf-8') as features_file:
    variables_json = json.load(features_file)

## Génération des datasets

### Automatisée via JSON

In [None]:
# Names of all the features declared in the JSON configuration
print(list(variables_json))

In [None]:
# Features to extract in this run; each name must be a key of the JSON configuration
variables_list = [
    'pplat',
    'tidal_volume_in',
    'tidal_volume_exp',
    'peak_pressure',
    'mode_ventilation',
    'admin_o2',
    'o2_flow',
    'installation',
]

# Configuration restricted to the selected features.
# Alternative selection covering every configured feature: list(variables_json.keys())
variable_extraction = dict((name, variables_json[name]) for name in variables_list)


In [None]:
# Display the configuration of the selected features for a visual check before extraction
variable_extraction

In [None]:
# Extract every selected feature from the database, align it with the static
# encounters and save one parquet file per feature under OUTPUT_FOLDER.
# A failure on one feature is reported and the loop moves on to the next one.

# Make sure the destination folder exists before the long extraction loop starts
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

start = time.time()
for variable in variables_list :
    print(f'---------Extraction {variable} : START ---------')

    loop_time = time.time()

    # Extraction parameters for this feature, taken from the JSON configuration
    feature = variables_json[variable]
    propname = feature['dictionary_propname']
    table = feature['table']

    # Inject the parameters into the generic SQL template
    query_formatted = query.format(
        dictionaryPropName = propname,
        feature = f"'{variable}'",
        feature_table = table
    )

    try :
        # Run the query and load the result into a polars dataframe
        feature_data = pl.read_database_uri(query_formatted, conn_uri, engine='connectorx')
        print(f'Extraction {variable} : OK')

        # Keep only the encounters present in the static dataset (inner join), then add
        # the delay between utcChartTime and utcInTime expressed in hours
        # (computed as total minutes divided by 60)
        with_time_delta = feature_data.with_columns(
            pl.col('encounterId').cast(pl.String)
        ).join(
            encounters, on='encounterId', how="inner"
        ).with_columns(
            (((pl.col('utcChartTime') - pl.col('utcInTime')).dt.total_minutes())/60).alias('delta_inTime_hours')
        ).sort('encounterId', 'utcChartTime')
        print(f"Nombre d'encounters distincts : {with_time_delta.unique('encounterId').shape[0]}/{encounters.shape[0]}")
    except Exception as e:
        # Best effort: report the error together with the query that was actually
        # sent (with its parameters filled in), then move on to the next feature
        print(e)
        print(query_formatted)
        continue

    try :
        # Save the dataset as a .parquet file named after the feature
        with_time_delta.write_parquet(os.path.join(OUTPUT_FOLDER, f'{variable}.parquet'))
        print(f'Sauvegarde {variable} : OK (shape : {with_time_delta.shape})')
    except Exception as e:
        print(e)
        continue

    # Timing is only reported for features that were extracted and saved successfully
    end_loop_time = time.time()
    print(f'Elapsed time for {variable} : {round((end_loop_time - loop_time)/60, 0)}min (total exec : {round((end_loop_time-start)/60, 0)}min)')
    print(f'---------Extraction {variable} : END ---------')

In [None]:
# For every feature file found in OUTPUT_FOLDER, list the static encounters that have
# no row for that feature and save them under OUTPUT_FOLDER/missing_encounters.
missing_encounters_folder = os.path.join(OUTPUT_FOLDER, 'missing_encounters')
# Create the report folder up front so the first run does not fail when writing into it
os.makedirs(missing_encounters_folder, exist_ok=True)

# sorted() gives a stable processing order; os.listdir returns entries in arbitrary order
for filename in sorted(os.listdir(OUTPUT_FOLDER)):

    # Skip anything that is not a feature file (including the report folder itself)
    if not filename.endswith(".parquet"):
        continue

    print(f'--------{filename}----------')
    # Load the feature dataset
    feature_data = pl.read_parquet(os.path.join(OUTPUT_FOLDER, filename))

    # Distinct encounterId values present in the feature dataset
    feature_encounters = feature_data.select("encounterId").unique()

    # Rows of the encounters dataset that have no match in the feature (anti join)
    missing_encounters = encounters.join(feature_encounters, on="encounterId", how="anti")

    # Report the result.
    # NOTE(review): the numerator counts rows while the denominator counts distinct ids;
    # they are comparable only if encounterId is unique in `encounters` — confirm.
    print(f"Encounters manquants: {missing_encounters.shape[0]}/{encounters.unique('encounterId').shape[0]}")
    missing_filename = os.path.join(missing_encounters_folder, f"missing_{filename}")
    missing_encounters.write_parquet(missing_filename)