# Using an example MEDS tool, ACES for labeling

In [None]:
#@title Install ACES


!pip install es-aces

In [1]:
#@title Download E-ICU demo
import tempfile
import os
from pathlib import Path
notebook_dir = os.getcwd()

# Choose MIMICIV or eicu
ROOT_DIR=f"{notebook_dir}/work_dir/mimiciv_demo/"
# ROOT_DIR=f"{notebook_dir}/work_dir/eicu_demo/"
Path(ROOT_DIR).mkdir(parents=True, exist_ok=True)

!echo {ROOT_DIR}

/Users/sim/Documents/projects/MEDS-DEV/demo/work_dir/mimiciv_demo/


In [60]:
# From the ACES documentation

task_config = """
description: >-
  This file specifies the base configuration for the prediction of a hospital los being greater than 3days,
  leveraging only the first 48 hours of data after admission, with a 24 hour gap between the input window
  and the target window. Patients who die or are discharged in the gap window are excluded. Note that this
  task is in-**hospital** los, not in-**ICU** los which is a different task.

predicates:
  hospital_admission:
    code: {regex: "HOSPITAL_ADMISSION//.*"}
  hospital_discharge:
    code: {regex: "HOSPITAL_DISCHARGE//.*"}
  death:
    code: MEDS_DEATH
  discharge_or_death:
    expr: or(hospital_discharge, death)

trigger: hospital_admission

windows:
  input:
    start: NULL
    end: trigger + 48h
    start_inclusive: True
    end_inclusive: True
    index_timestamp: end
  gap:
    start: input.end
    end: start + 24h
    start_inclusive: False
    end_inclusive: True
    has:
      hospital_admission: (None, 0)
      discharge_or_death: (None, 0)
  target:
    start: trigger
    end: start + 3d
    start_inclusive: False
    end_inclusive: True
    label: discharge_or_death
"""
MEDS_DIR = ROOT_DIR + "/meds"
TASK_DIR = MEDS_DIR + "/task_labels"
! echo TASK_DIR
TASK_NAME = "los_in_hospital_first_48h"
TASK_CONFIG_FP = f"{TASK_DIR}/{TASK_NAME}.yaml"
!mkdir {TASK_DIR}/{TASK_NAME} -p
with open(TASK_CONFIG_FP, 'w') as f:
    f.write(task_config)

TASK_DIR
mkdir: -p: File exists


In [61]:
!aces-cli --multirun data=sharded data.standard=meds data.root={MEDS_DIR}/data data.shard=$(expand_shards  {MEDS_DIR}/data/) cohort_dir={TASK_DIR} cohort_name={TASK_NAME} config_path={TASK_CONFIG_FP}

[2024-12-14 17:02:13,334][HYDRA] Launching 3 jobs locally
[2024-12-14 17:02:13,334][HYDRA] 	#0 : data=sharded data.standard=meds data.root=/Users/sim/Documents/projects/MEDS-DEV/demo/work_dir/mimiciv_demo//meds/data data.shard=held_out/0 cohort_dir=/Users/sim/Documents/projects/MEDS-DEV/demo/work_dir/mimiciv_demo//meds/task_labels cohort_name=los_in_hospital_first_48h config_path=/Users/sim/Documents/projects/MEDS-DEV/demo/work_dir/mimiciv_demo//meds/task_labels/los_in_hospital_first_48h.yaml
[32m2024-12-14 17:02:13.542[0m | [1mINFO    [0m | [36maces.__main__[0m:[36mmain[0m:[36m149[0m - [1mLoading config from '/Users/sim/Documents/projects/MEDS-DEV/demo/work_dir/mimiciv_demo//meds/task_labels/los_in_hospital_first_48h.yaml'[0m
[32m2024-12-14 17:02:13.545[0m | [1mINFO    [0m | [36maces.config[0m:[36mload[0m:[36m1341[0m - [1mParsing windows...[0m
[32m2024-12-14 17:02:13.545[0m | [1mINFO    [0m | [36maces.config[0m:[36mload[0m:[36m1350[0m - [1mParsing tr

In [62]:
import polars as pl

# execute query and get results
df = pl.read_parquet(f"{TASK_DIR}/{TASK_NAME}/**/*.parquet")

print("train prevalence: " + str(round(pl.read_parquet(f"{TASK_DIR}/{TASK_NAME}/train/*.parquet")['boolean_value'].mean(), 3)))
print("tuning prevalence: " + str(round(pl.read_parquet(f"{TASK_DIR}/{TASK_NAME}/tuning/*.parquet")['boolean_value'].mean(), 3)))
print("held_out prevalence: " + str(round(pl.read_parquet(f"{TASK_DIR}/{TASK_NAME}/held_out/*.parquet")['boolean_value'].mean(), 3)))


df.sort('boolean_value')

train prevalence: 0.231
tuning prevalence: 0.133
held_out prevalence: 0.25


subject_id,prediction_time,boolean_value,integer_value,float_value,categorical_value
i64,datetime[μs],bool,i64,f64,str
10012853,2175-04-07 15:36:00,false,,,
10012853,2176-11-27 21:28:00,false,,,
10014729,2125-03-01 07:15:00,false,,,
10014729,2125-03-21 16:58:00,false,,,
10016742,2178-07-05 21:13:00,false,,,
…,…,…,…,…,…
10039997,2135-11-09 02:42:00,true,,,
10040025,2143-03-20 12:34:00,true,,,
10040025,2145-07-05 23:46:00,true,,,
10020740,2150-09-17 14:09:00,true,,,


### MEDS-DEV Has tons of pre-defined tasks we can use!!!

In [63]:
TASK_NAME="mortality/in_icu/first_24h"
!../src/MEDS_DEV/helpers/extract_task.sh {MEDS_DIR} "MIMIC-IV" {TASK_NAME}

Running task mortality/in_icu/first_24h on dataset MIMIC-IV with MEDS_ROOT_DIR=/Users/sim/Documents/projects/MEDS-DEV/demo/work_dir/mimiciv_demo//meds and SHARDS=held_out/0,train/0,tuning/0
[2024-12-14 17:02:21,042][HYDRA] Launching 3 jobs locally
[2024-12-14 17:02:21,042][HYDRA] 	#0 : data.shard=held_out/0
[32m2024-12-14 17:02:21.188[0m | [1mINFO    [0m | [36maces.__main__[0m:[36mmain[0m:[36m149[0m - [1mLoading config from '/Users/sim/Documents/projects/MEDS-DEV/src/MEDS_DEV/tasks/criteria/mortality/in_icu/first_24h.yaml'[0m
[32m2024-12-14 17:02:21.190[0m | [1mINFO    [0m | [36maces.__main__[0m:[36mmain[0m:[36m151[0m - [1mOverriding predicates and/or demographics from '/Users/sim/Documents/projects/MEDS-DEV/src/MEDS_DEV/datasets/MIMIC-IV/predicates.yaml'[0m
[32m2024-12-14 17:02:21.207[0m | [1mINFO    [0m | [36maces.config[0m:[36mload[0m:[36m1341[0m - [1mParsing windows...[0m
[32m2024-12-14 17:02:21.207[0m | [1mINFO    [0m | [36maces.config[0m:

In [66]:
!echo "{TASK_DIR}/{TASK_NAME}/**/*.parquet"

/Users/sim/Documents/projects/MEDS-DEV/demo/work_dir/mimiciv_demo//meds/task_labels/mortality/in_icu/first_24h/**/*.parquet


In [67]:
import polars as pl


# execute query and get results
df = pl.read_parquet(f"{TASK_DIR}/{TASK_NAME}/**/*.parquet")

print("train prevalence: " + str(round(pl.read_parquet(f"{TASK_DIR}/{TASK_NAME}/train/*.parquet")['boolean_value'].mean(), 3)))
print("tuning prevalence: " + str(round(pl.read_parquet(f"{TASK_DIR}/{TASK_NAME}/tuning/*.parquet")['boolean_value'].mean(), 3)))
print("held_out prevalence: " + str(round(pl.read_parquet(f"{TASK_DIR}/{TASK_NAME}/held_out/*.parquet")['boolean_value'].mean(), 3)))


df.sort('boolean_value')

train prevalence: 0.133
tuning prevalence: 0.125
held_out prevalence: 0.0


subject_id,prediction_time,boolean_value,integer_value,float_value,categorical_value
i64,datetime[μs],bool,i64,f64,str
10012853,2176-11-27 02:34:49,false,,,
10014729,2125-02-28 10:03:08,false,,,
10016742,2178-07-04 22:45:00,false,,,
10016742,2178-07-14 08:16:00,false,,,
10016742,2178-07-23 08:19:00,false,,,
…,…,…,…,…,…
10010471,2155-12-03 20:33:00,true,,,
10015931,2177-03-25 21:48:07,true,,,
10037861,2117-03-15 16:34:58,true,,,
10037975,2185-01-18 19:12:12,true,,,
