# Construct Master Dataset by linking different tables in MIMIC-IV-ED

Note: 

1. The MIMIC-IV-ED dataset should be downloaded into the "data/ed" folder.

2.

3.

## Prepare Python libraries and raw data

In [18]:
import argparse
import os

from helpers import *
from medcode_utils import commorbidity

In [19]:
## Defining input/output locations.
## Both paths can be overridden without editing the notebook by setting the
## MIMIC_IV_PATH / MIMIC_IV_OUTPUT_PATH environment variables before starting
## the kernel; when they are unset the original hard-coded defaults are used.
mimic_iv_path = os.environ.get('MIMIC_IV_PATH', 'C:/Users/XFE/Documents/mimic4ed-benchmark/data/')
#mimic_iv_path = '/Users/jin/Desktop/Temporal_AutoScore/Data/mimic-iv-1.0'
output_path = os.environ.get('MIMIC_IV_OUTPUT_PATH', 'C:/Users/XFE/Documents/mimic4ed-benchmark/data_processed/')
#output_path = '/Users/jin/Desktop/Temporal_AutoScore/Output/mimic-iv-1.0'

In [20]:
## Auxiliary parameters: one directory per MIMIC-IV module.
## ASSUMPTION (ZJ: make this bold): the "core", "hosp", "icu" and "ed"
## directories are all placed directly under "mimic_iv_path".
(
    mimic_iv_core_path,
    mimic_iv_hosp_path,
    mimic_iv_icu_path,
    mimic_iv_ed_path,
) = (os.path.join(mimic_iv_path, module_dir) for module_dir in ('core', 'hosp', 'icu', 'ed'))

## Table name -> CSV file name, one dictionary per MIMIC-IV module.
## Every file is simply "<table name>.csv".
icu_filename_dict = {
    table: table + ".csv"
    for table in (
        "chartevents",
        "datetimeevents",
        "d_items",
        "icustays",
        "inputevents",
        "outputevents",
        "procedureevents",
    )
}
core_filename_dict = {
    table: table + ".csv"
    for table in ("patients", "admissions", "transfers")
}
hosp_filename_dict = {
    table: table + ".csv"
    for table in (
        "d_hcpcs",
        "d_icd_diagnoses",
        "d_labitems",
        "emar",
        "hcpcsevents",
        "microbiologyevents",
        "poe",
        "prescriptions",
        "services",
        "diagnoses_icd",
        "d_icd_procedures",
        "drgcodes",
        "emar_detail",
        "labevents",
        "pharmacy",
        "poe_detail",
        "procedures_icd",
    )
}
ed_filename_dict = {
    table: table + ".csv"
    for table in ("diagnosis", "edstays", "medrecon", "pyxis", "triage", "vitalsign")
}

## Output column name -> "|"-separated text pattern(s) for each of the ten
## chief complaints; consumed later by encode_chief_complaints.
complaint_dict = {
    "chiefcom_chest_pain": "chest pain",
    "chiefcom_abdominal_pain": "abdominal pain|abd pain",
    "chiefcom_headache": "headache|lightheaded",
    "chiefcom_shortness_of_breath": "breath",
    "chiefcom_back_pain": "back pain",
    "chiefcom_cough": "cough",
    "chiefcom_nausea_vomiting": "nausea|vomit",
    "chiefcom_fever_chills": "fever|chill",
    "chiefcom_syncope": "syncope",
    "chiefcom_dizziness": "dizz",
}

## Health-utilization time-window parameters.
## NOTE: the ICU-transfer window is expressed in hours; per the original note
## the remaining windows are in days.
icu_transfer_timerange = 12  # hours
# past_ed_visits_timerange = 365
# past_admissions_timerange = 365
# past_icu_visits_timerange = 365
next_ed_visit_timerange = 3



## Load raw data tables using the pandas library

In [21]:
## Reading main tables
## Each reader is a project helper (pulled in by `from helpers import *`) that
## takes the full path of one CSV file, built from the module directory and
## the matching entry of the *_filename_dict dictionaries.
df_edstays = read_edstays_table(os.path.join(mimic_iv_ed_path, ed_filename_dict['edstays']))
df_patients = read_patients_table(os.path.join(mimic_iv_core_path, core_filename_dict['patients']))
df_admissions = read_admissions_table(os.path.join(mimic_iv_core_path, core_filename_dict["admissions"]))
df_icustays = read_icustays_table(os.path.join(mimic_iv_icu_path, icu_filename_dict['icustays']))
df_triage = read_triage_table(os.path.join(mimic_iv_ed_path, ed_filename_dict['triage']))
df_vitalsign = read_vitalsign_table(os.path.join(mimic_iv_ed_path, ed_filename_dict['vitalsign']))
df_pyxis = read_pyxis_table(os.path.join(mimic_iv_ed_path, ed_filename_dict['pyxis']))
# NOTE(review): medrecon is loaded with the pyxis reader. Presumably intentional
# (no dedicated medrecon reader is visible here) -- confirm that the parsing
# applied by read_pyxis_table is also appropriate for medrecon.csv.
df_medrecon = read_pyxis_table(os.path.join(mimic_iv_ed_path, ed_filename_dict['medrecon']))

## Read data here for ICD.
## The hospital diagnoses table is passed to commorbidity() further down.
df_diagnoses = read_diagnoses_table(os.path.join(mimic_iv_hosp_path, hosp_filename_dict['diagnoses_icd']))


## ED root table, demographic and outcomes

In [30]:
## Build the master table: ED stays -> merge patients -> merge admissions.
## (The triage table is merged later, in the "Triage Information" section.)
df_master = merge_edstays_patients_on_subject(df_edstays ,df_patients)
df_master = merge_edstays_admissions_on_subject(df_master ,df_admissions)

In [31]:
## Adding age, mortality and ICU transfer outcome
## Each helper takes the master table and returns it with extra column(s).
df_master = add_age(df_master)
# The original cell called add_inhospital_mortality twice in a row; the
# redundant second call has been removed.
df_master = add_inhospital_mortality(df_master)
df_master = add_ed_mortality(df_master)
df_master = add_before_ed_mortality(df_master)
df_master = add_ed_los(df_master)
df_master = add_outcome_icu_transfer(df_master, df_icustays, icu_transfer_timerange)

# Hospitalization outcome: True when the ED stay is linked to a hospital
# admission, i.e. hadm_id is not missing.
df_master['outcome_hospitalization'] = df_master['hadm_id'].notna()

# Critical outcome: in-hospital mortality OR ICU transfer within 12 hours.
# NOTE(review): the "_12h" suffix is hard-coded here and presumably mirrors
# icu_transfer_timerange (12) -- confirm before changing that parameter.
df_master['outcome_critical'] = df_master['outcome_inhospital_mortality'] | df_master['outcome_icu_transfer_12h']

# Sort Master table for further process
# NOTE: reset_index() without drop=True keeps the previous row labels as a
# column named "index"; that column is carried into the exported CSV.
df_master = df_master.sort_values(['subject_id', 'intime']).reset_index()


In [32]:
# Preview the master table after adding the outcome columns
# (the rendered output below is a truncated view, not the full frame).
df_master

Unnamed: 0,index,subject_id,hadm_id,stay_id,intime,outtime,anchor_age,gender,anchor_year,dod,...,age,outcome_inhospital_mortality,ed_death,before_ed_mortality,ed_los,intime_icu,time_to_icu_transfer,outcome_icu_transfer_12h,outcome_critical,outcome_hospitalization
0,338038,10000032,22595853.0,33258284,2180-05-06 19:17:00,2180-05-06 23:30:00,52,F,2180,NaT,...,52,False,False,False,0 days 04:13:00,NaT,NaT,False,False,True
1,338039,10000032,22841357.0,38112554,2180-06-26 15:54:00,2180-06-26 21:31:00,52,F,2180,NaT,...,52,False,False,False,0 days 05:37:00,NaT,NaT,False,False,True
2,338040,10000032,29079034.0,32952584,2180-07-22 16:24:00,2180-07-23 05:54:00,52,F,2180,NaT,...,52,False,False,False,0 days 13:30:00,2180-07-23 14:00:00,0 days 08:06:00,True,True,True
3,338041,10000032,29079034.0,39399961,2180-07-23 05:54:00,2180-07-23 14:00:00,52,F,2180,NaT,...,52,False,False,False,0 days 08:06:00,2180-07-23 14:00:00,0 days 00:00:00,True,True,True
4,338042,10000032,25742920.0,35968195,2180-08-05 20:58:00,2180-08-06 01:44:00,52,F,2180,NaT,...,52,False,False,False,0 days 04:46:00,NaT,NaT,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
452562,168736,19999784,25715748.0,34149746,2119-08-11 09:15:00,2119-08-11 13:40:00,57,M,2119,NaT,...,57,False,False,False,0 days 04:25:00,NaT,NaT,False,False,True
452563,168737,19999828,29734428.0,30712109,2147-07-17 17:18:00,2147-07-18 17:34:00,46,F,2147,NaT,...,46,False,False,False,1 days 00:16:00,NaT,NaT,False,False,True
452564,168738,19999828,25744818.0,32917002,2149-01-08 09:11:00,2149-01-08 18:12:00,46,F,2147,NaT,...,48,False,False,False,0 days 09:01:00,2149-01-08 18:12:00,0 days 00:00:00,True,True,True
452565,168739,19999914,,32002659,2158-12-24 11:41:00,2158-12-24 11:41:00,49,F,2158,NaT,...,49,False,False,False,0 days 00:00:00,NaT,NaT,False,False,False


## Health Utilization

In [None]:
## Generate past ED visits
## One pass per look-back window (30 / 90 / 365, presumably days).
for lookback_window in (30, 90, 365):
    df_master = generate_past_ed_visits(df_master, timerange=lookback_window)

In [None]:
## Outcome: future ED revisit variables
## Uses next_ed_visit_timerange (set to 3 in the parameter cell; presumably days).
df_master = generate_future_ed_visits(df_master, next_ed_visit_timerange)

In [None]:
## Generate past admissions
## One pass per look-back window (30 / 90 / 365, presumably days).
for lookback_window in (30, 90, 365):
    df_master = generate_past_admissions(df_master, df_admissions, timerange=lookback_window)

In [None]:
## Generate past icu visits
## One pass per look-back window (30 / 90 / 365, presumably days).
for lookback_window in (30, 90, 365):
    df_master = generate_past_icu_visits(df_master, df_icustays, timerange=lookback_window)

## Triage Information

In [None]:
## Merging with the triage table.
## TODO: consider renaming the merged variables with a triage_* prefix.
df_master = merge_edstays_triage_on_subject(df_master, df_triage) ## note: merges into the master table 

In [None]:
## Encoding 10 chief complaints
## complaint_dict (defined in the parameter cell) maps each chiefcom_* name to
## its "|"-separated text pattern(s).
df_master = encode_chief_complaints(df_master, complaint_dict)

In [None]:
## XF comments (open items):
## TODO: process chief complaints
## TODO: add other outcomes
## TODO: ED revisit / time to next revisit?

## Comorbidities from diagnosis ICD

In [None]:
# This function takes about 10 min
# Look-back window of five years (presumably in days, like the other
# timerange arguments). The original value was 356*5, a digit transposition
# of 365*5 that shortened the window by 45.
df_master = commorbidity(df_master, df_diagnoses, df_admissions, timerange = 365*5)


## ED Vital signs

In [None]:
# Attach ED vital-sign information to each ED stay; only the 'last' option is
# requested here.
df_master = merge_vitalsign_info_on_edstay(df_master, df_vitalsign, options=['last'])

## Medication

In [None]:
# Attach medication counts derived from the pyxis table to each ED stay.
df_master = merge_med_count_on_edstay(df_master, df_pyxis)

In [None]:
# Attach counts derived from the medrecon table to each ED stay.
df_master = merge_medrecon_count_on_edstay(df_master, df_medrecon)

## Review the master dataset and output

In [None]:
# Make sure the output directory exists first: to_csv does not create missing
# folders and would otherwise fail only after all the processing above.
os.makedirs(output_path, exist_ok=True)

#df_master.head(100).to_csv(os.path.join(output_path, 'master_dataset_part.csv'), index=False)
# Full dataset:
df_master.to_csv(os.path.join(output_path, 'master_dataset.csv'), index=False)