In [1]:
import os
from pathlib import Path
import polars as pl

In [4]:
# Load KEY=VALUE pairs from the local ".env" file into this process's environment.
for env_line in Path(".env").read_text().splitlines():
    env_line = env_line.strip()
    # Blank lines and "#" comments carry no assignment.
    if not env_line or env_line.startswith("#"):
        continue
    # Split on the first "=" only, so that values may themselves contain "=".
    var, sep, val = env_line.partition("=")
    if not sep:
        raise ValueError(f"Malformed line in .env (expected KEY=VALUE): {env_line!r}")
    var, val = var.strip(), val.strip()
    print(f"Setting {var} to {val}")
    os.environ[var] = val

Setting MIMICIV_RAW_DIR to /n/data1/hms/dbmi/zaklab/MIMIC-IV/raw_files/2.2
Setting MIMICIV_PRE_MEDS_DIR to /n/data1/hms/dbmi/zaklab/MIMIC-IV/MEDS_raw_files/2.2
Setting MIMICIV_MEDS_DIR to /n/data1/hms/dbmi/zaklab/MIMIC-IV/MEDS_compute_tests/4workers_slurm


In [9]:
# Dataset root, read from the MIMICIV_MEDS_DIR environment variable.
MEDS_dir = Path(os.environ["MIMICIV_MEDS_DIR"])
MEDS_final_cohort = MEDS_dir / "final_cohort"

# Parquet shard paths relative to the final-cohort directory. ``as_posix`` keeps "/" as the
# separator on every platform so the "train/" prefix test is portable; sorting makes the
# shard order reproducible, since glob order is not specified.
shards = sorted(fp.relative_to(MEDS_final_cohort).as_posix() for fp in MEDS_final_cohort.glob("**/*.parquet"))
train_shards = [s for s in shards if s.startswith("train/")]

In [20]:
%%time
code_df = None
for s in train_shards:
    df = (
        pl.scan_parquet(MEDS_final_cohort / s)
        .drop_nulls(subset="code")
        .group_by("code")
        .agg(pl.col("patient_id").n_unique().alias("n_patients"), pl.len().alias("n_occurrences"))
    )

    if df.select(pl.col("code").is_null().any()).collect().item():
        raise ValueError

    if code_df is None: code_df = df
    else:
        code_df = (
            code_df
            .join(df, suffix="_right", on="code", how="outer")
            .select(
                pl.coalesce("code", "code_right").alias("code"),
                (pl.col("n_patients").fill_null(0) + pl.col("n_patients_right").fill_null(0)).alias("n_patients"),
                (pl.col("n_occurrences").fill_null(0) + pl.col("n_occurrences_right").fill_null(0)).alias("n_occurrences"),
            )
        )

code_df = code_df.collect()

CPU times: user 1min 17s, sys: 14.9 s, total: 1min 32s
Wall time: 15.8 s


In [25]:
# Keep only codes seen in more than 10 distinct patients, ordered from most to least frequent.
code_df = (
    code_df
    .filter(pl.col("n_patients") > 10)
    .sort("n_occurrences", descending=True)
)

In [26]:
# Display the filtered, sorted per-code statistics table.
code_df

code,n_patients,n_occurrences
cat,i64,i64
"""LAB//227969//U…",38278,7048858
"""LAB//220045//b…",40806,5192096
"""LAB//220210//i…",40770,5137587
"""LAB//220277//%…",40775,5082797
"""LAB//220048//U…",40629,4746037
…,…,…
"""PROCEDURE//ICD…",11,11
"""PROCEDURE//ICD…",11,11
"""DIAGNOSIS//ICD…",11,11
"""DIAGNOSIS//ICD…",11,11


In [28]:
# Pull the "code" column out of the statistics table as a plain Python list.
code_strs = code_df.get_column("code").to_list()

In [63]:
# Partition the retained codes by event-type prefix; each list keeps the order of ``code_strs``.
hosp_admit_codes, icu_admit_codes, hosp_disch_codes, icu_disch_codes = (
    [c for c in code_strs if c.startswith(prefix)]
    for prefix in (
        "HOSPITAL_ADMISSION//",
        "ICU_ADMISSION//",
        "HOSPITAL_DISCHARGE//",
        "ICU_DISCHARGE//",
    )
)
death_code = "DEATH"

In [55]:
import string

def _predicate_name(code: str, i: int) -> str:
    """Return the predicate name for ``code``: its lower-cased first ``//`` segment followed by ``_{i}``."""
    return f"{code.split('//')[0].lower()}_{i}"


def make_plain_predicate(code: str, i: int) -> str:
    """Format a single-code predicate entry as two indented text lines.

    Args:
        code: The full code string; it is emitted verbatim after ``code: ``.
        i: Index appended to the predicate name to keep names distinct.

    Returns:
        ``"  <name>:\\n    code: <code>"``, where ``<name>`` is the lower-cased first
        ``//``-separated segment of ``code`` followed by ``_<i>``.
    """
    return "\n".join([f"  {_predicate_name(code, i)}:", f"    code: {code}"])


def make_or_predicate(codes: list[str], pred_name: str) -> str:
    """Format a predicate entry that ORs together the plain predicates for ``codes``.

    The member names are derived with the same rule as ``make_plain_predicate``, using each
    code's position in ``codes`` as its index, so the two outputs refer to the same names
    when both are generated from the same list in the same order.

    Args:
        codes: The code strings whose plain predicates should be combined.
        pred_name: Name of the combined predicate.

    Returns:
        ``"  <pred_name>:\\n    or(<name_0>,<name_1>,...)"``; an empty ``codes`` yields ``or()``.
    """
    member_names = [_predicate_name(c, i) for i, c in enumerate(codes)]
    return "\n".join([f"  {pred_name}:", f"    or({','.join(member_names)})"])


## Hospital Admission

In [59]:
# One plain predicate per hospital-admission code, followed by the OR predicate over all of them.
hosp_admit_lines = [make_plain_predicate(code, i) for i, code in enumerate(hosp_admit_codes)]
hosp_admit_lines.append(make_or_predicate(hosp_admit_codes, "hospital_admission"))
print("\n".join(hosp_admit_lines))

  hospital_admission_0:
    code: HOSPITAL_ADMISSION//EW EMER.//EMERGENCY ROOM
  hospital_admission_1:
    code: HOSPITAL_ADMISSION//EU OBSERVATION//EMERGENCY ROOM
  hospital_admission_2:
    code: HOSPITAL_ADMISSION//SURGICAL SAME DAY ADMISSION//PHYSICIAN REFERRAL
  hospital_admission_3:
    code: HOSPITAL_ADMISSION//OBSERVATION ADMIT//EMERGENCY ROOM
  hospital_admission_4:
    code: HOSPITAL_ADMISSION//URGENT//TRANSFER FROM HOSPITAL
  hospital_admission_5:
    code: HOSPITAL_ADMISSION//URGENT//PHYSICIAN REFERRAL
  hospital_admission_6:
    code: HOSPITAL_ADMISSION//DIRECT EMER.//PHYSICIAN REFERRAL
  hospital_admission_7:
    code: HOSPITAL_ADMISSION//OBSERVATION ADMIT//PHYSICIAN REFERRAL
  hospital_admission_8:
    code: HOSPITAL_ADMISSION//DIRECT OBSERVATION//PHYSICIAN REFERRAL
  hospital_admission_9:
    code: HOSPITAL_ADMISSION//ELECTIVE//PHYSICIAN REFERRAL
  hospital_admission_10:
    code: HOSPITAL_ADMISSION//EU OBSERVATION//PHYSICIAN REFERRAL
  hospital_admission_11:
    code: 

## Hospital Discharge

In [61]:
# One plain predicate per hospital-discharge code, followed by the OR predicate over all of them.
hosp_disch_lines = [make_plain_predicate(code, i) for i, code in enumerate(hosp_disch_codes)]
hosp_disch_lines.append(make_or_predicate(hosp_disch_codes, "hospital_discharge"))
print("\n".join(hosp_disch_lines))

  hospital_discharge_0:
    code: HOSPITAL_DISCHARGE//HOME
  hospital_discharge_1:
    code: HOSPITAL_DISCHARGE//UNK
  hospital_discharge_2:
    code: HOSPITAL_DISCHARGE//HOME HEALTH CARE
  hospital_discharge_3:
    code: HOSPITAL_DISCHARGE//SKILLED NURSING FACILITY
  hospital_discharge_4:
    code: HOSPITAL_DISCHARGE//REHAB
  hospital_discharge_5:
    code: HOSPITAL_DISCHARGE//DIED
  hospital_discharge_6:
    code: HOSPITAL_DISCHARGE//CHRONIC/LONG TERM ACUTE CARE
  hospital_discharge_7:
    code: HOSPITAL_DISCHARGE//HOSPICE
  hospital_discharge_8:
    code: HOSPITAL_DISCHARGE//AGAINST ADVICE
  hospital_discharge_9:
    code: HOSPITAL_DISCHARGE//PSYCH FACILITY
  hospital_discharge_10:
    code: HOSPITAL_DISCHARGE//ACUTE HOSPITAL
  hospital_discharge_11:
    code: HOSPITAL_DISCHARGE//OTHER FACILITY
  hospital_discharge_12:
    code: HOSPITAL_DISCHARGE//ASSISTED LIVING
  hospital_discharge_13:
    code: HOSPITAL_DISCHARGE//HEALTHCARE FACILITY
  hospital_discharge:
    or(hospital_dischar

## ICU Admission

In [65]:
# One plain predicate per ICU-admission code, followed by the OR predicate over all of them.
icu_admit_lines = [make_plain_predicate(code, i) for i, code in enumerate(icu_admit_codes)]
icu_admit_lines.append(make_or_predicate(icu_admit_codes, "icu_admission"))
print("\n".join(icu_admit_lines))

  icu_admission_0:
    code: ICU_ADMISSION//Medical Intensive Care Unit (MICU)
  icu_admission_1:
    code: ICU_ADMISSION//Medical/Surgical Intensive Care Unit (MICU/SICU)
  icu_admission_2:
    code: ICU_ADMISSION//Cardiac Vascular Intensive Care Unit (CVICU)
  icu_admission_3:
    code: ICU_ADMISSION//Surgical Intensive Care Unit (SICU)
  icu_admission_4:
    code: ICU_ADMISSION//Trauma SICU (TSICU)
  icu_admission_5:
    code: ICU_ADMISSION//Coronary Care Unit (CCU)
  icu_admission_6:
    code: ICU_ADMISSION//Neuro Intermediate
  icu_admission_7:
    code: ICU_ADMISSION//Neuro Surgical Intensive Care Unit (Neuro SICU)
  icu_admission_8:
    code: ICU_ADMISSION//Neuro Stepdown
  icu_admission:
    or(icu_admission_0,icu_admission_1,icu_admission_2,icu_admission_3,icu_admission_4,icu_admission_5,icu_admission_6,icu_admission_7,icu_admission_8)


## ICU Discharge

In [64]:
# One plain predicate per ICU-discharge code, followed by the OR predicate over all of them.
icu_disch_lines = [make_plain_predicate(code, i) for i, code in enumerate(icu_disch_codes)]
icu_disch_lines.append(make_or_predicate(icu_disch_codes, "icu_discharge"))
print("\n".join(icu_disch_lines))

  icu_discharge_0:
    code: ICU_DISCHARGE//Medical Intensive Care Unit (MICU)
  icu_discharge_1:
    code: ICU_DISCHARGE//Medical/Surgical Intensive Care Unit (MICU/SICU)
  icu_discharge_2:
    code: ICU_DISCHARGE//Cardiac Vascular Intensive Care Unit (CVICU)
  icu_discharge_3:
    code: ICU_DISCHARGE//Surgical Intensive Care Unit (SICU)
  icu_discharge_4:
    code: ICU_DISCHARGE//Coronary Care Unit (CCU)
  icu_discharge_5:
    code: ICU_DISCHARGE//Trauma SICU (TSICU)
  icu_discharge_6:
    code: ICU_DISCHARGE//Neuro Intermediate
  icu_discharge_7:
    code: ICU_DISCHARGE//Neuro Stepdown
  icu_discharge_8:
    code: ICU_DISCHARGE//Neuro Surgical Intensive Care Unit (Neuro SICU)
  icu_discharge:
    or(icu_discharge_0,icu_discharge_1,icu_discharge_2,icu_discharge_3,icu_discharge_4,icu_discharge_5,icu_discharge_6,icu_discharge_7,icu_discharge_8)


## Death

In [60]:
# Emit the death predicate from ``death_code`` (defined with the other event codes) so the
# code string lives in one place instead of being repeated here.
print(f"  death:\n    code: {death_code}")

  death:
    code: DEATH
