# init

In [None]:
import numpy as np
import pandas as pd
# import polars as pl
import os, sys
print(os.getcwd())
os.chdir('/gpfs/data/healthcare-allocate/CLIF-MIMIC/code')
print(os.getcwd())

proj_root = "/gpfs/data/healthcare-allocate/CLIF-MIMIC"
if proj_root not in sys.path:
    sys.path.append(proj_root)

from code.custom_utils import *

: 

## load tables

In [None]:
def load_mimic_table(module: {"icu", "hosp"}, table, file_type: {"csv", "parquet", "pq"} = "csv"):
    if file_type in ["pq", "parquet"]:
        return pd.read_parquet(f'../mimic-iv-2.2/{module}/{table}.parquet')
    elif file_type == "csv":
        return pd.read_csv(f'../mimic-iv-2.2/{module}/{table}.csv.gz')

In [None]:
patients = load_mimic_table("hosp", "patients") # gives gender
admissions = load_mimic_table("hosp", "admissions") # gives race and ethnicity

In [None]:
d_items = load_mimic_table("icu", "d_items", "csv")
chartevents = load_mimic_table("icu", "chartevents", "parquet")

In [None]:
procedureevents = load_mimic_table("icu", "procedureevents", "csv")
datetimeevents = load_mimic_table("icu", "datetimeevents", "csv")

In [None]:
inputevents = load_mimic_table("icu", "inputevents", "csv")
outputevents = load_mimic_table("icu", "outputevents", "csv")

In [None]:
# def resave_mimic_table_to_parquet(table: pd.DataFrame):
    # if not yet in memory, load it:
    # if not table:

In [None]:
# labevents = load_mimic_table("hosp", "labevents", "csv")
# labevents.to_parquet("../mimic-iv-2.2/hosp/labevents.parquet")
d_labitems = load_mimic_table("hosp", "d_labitems", "csv")
labevents = load_mimic_table("hosp", "labevents", "parquet")

poe = load_mimic_table("hosp", "poe", "csv")
poe_detail = load_mimic_table("hosp", "poe_detail", "csv")

transfers = load_mimic_table("hosp", "transfers", "csv")
icustays = load_mimic_table("icu", "icustays", "csv")

## load mappings

In [None]:
def load_mapping_csv(csv_name: str):
    return pd.read_csv(f"../mapping/mimic-to-clif-mappings - {csv_name}.csv")

In [None]:
vitals_mapping = load_mapping_csv("vitals")
resp_mapping = load_mapping_csv("respiratory_support")
resp_device_mapping = load_mapping_csv("device_category")
resp_mode_mapping = load_mapping_csv("mode_category")

In [None]:
# covert to a dict for df col renaming later
def construct_mapper_dict(mapping_df: pd.DataFrame, key_col: str, value_col: str, none = False):
    mapper_dict = dict(zip(mapping_df[key_col], mapping_df[value_col]))
    # to enable a None -> None mapping
    if none:
        mapper_dict[None] = None
    return mapper_dict

# vitals table
vital_name_mapper_dict = construct_mapper_dict(vitals_mapping, "itemid", "label = vital_name")
vital_category_mapper_dict = construct_mapper_dict(vitals_mapping, "itemid", "vital_category")
vital_site_mapper_dict = construct_mapper_dict(vitals_mapping, "itemid", "meas_site_name")

# resp support table
resp_mapper_dict = construct_mapper_dict(resp_mapping, "itemid", "variable")
resp_device_mapper_dict = construct_mapper_dict(resp_device_mapping, "device_name", "device_category")
resp_mode_mapper_dict = construct_mapper_dict(resp_mode_mapping, "mode_name", "mode_category")


# utils

## `CacheInfo` class

In [None]:
class CacheInfo:
    """
    CacheInfo object used to represent the current status of `lru_cache`
    """
    def __init__(self, max_size):
        self.max_size = max_size
        self.misses = 0
        self.hits = 0
        self.cur_size = 0
        # NOTE: you may add to this if you want, but do not modify the lines above
        # create an attribute in the CacheInfo class to store the cache dict
        self.cache_dict = {}
    
    def __repr__(self):
        return f"CacheInfo(hits={self.hits}, misses={self.misses}, max_size={self.max_size}, cur_size={self.cur_size})"

# first layer: a decorator factory
def lru_cache(max_size = 128):
    '''
    This function is a decorator factory that returns a decorator with a user-specified
    maximum size of the cache 

    Input:
        - max_size: the maximum size of the cache
    
    Output: 
        - a decorator
    '''
    # second layer: the decorator
    def decorator(func):
        '''
        This function is a decorator that takes in an original function and
        return a new, decorated function.

        Input: an original function

        Output: a new function
        '''
        # initialize an instance of the CacheInfo class
        cache_info = CacheInfo(max_size = max_size)
        def key_generator(*args, **kwargs):
            '''
            This helper function creates a unique key given every different 
            combination of positional and key-word arguments

            Input: 
                - *args: any position arguments
                - **kwargs: any key word arguments

            Output: 
                - a tuple that stores all the arguments and their data type
            '''
            # generates a tuple that stores the data type of each position arg
            args_type = tuple(map(lambda x: type(x), args))
            # kwargs is a dict, so we use items() to turn it into a seq of
            # key-value tuples, and add the data type of the key word arg
            # to the tuple, then use frozenset() to make it immutable and 
            # thus hashable
            kwargs_and_type = frozenset(
                       map(lambda tup: (tup, type(tup[1])), 
                           kwargs.items()))
            return (args, args_type, kwargs_and_type)
        def new_func(*args, **kwargs):
            '''
            This is the new function that replaces the original function.

            Input:
                - *args, **kwargs: any position and key word arguments

            Output:
                - the result of the new function
            '''
            key = key_generator(*args, **kwargs)        
            # if the key is already in the cache dict, i.e. the same args have
            # been provided before, there should be "memory" in the cache
            # we got a hit
            if key in cache_info.cache_dict:
                cache_info.hits += 1 
                # temporarily store the cache result first before we remove the key
                cached_result = cache_info.cache_dict[key]
                # remove the key
                cache_info.cache_dict.pop(key)
                # insert the same key to the tail of the dict
                cache_info.cache_dict[key] = cached_result
                return cached_result
            # when we have a new arg combination that is not seen before 
            # -- we have a "miss":
            else:
                cache_info.misses += 1  
                # add the output of the function to the dict
                cache_info.cache_dict[key] = func(*args, **kwargs)  
                # update cache size (length of the dict) 
                cache_info.cur_size = len(cache_info.cache_dict) 
                # if the cache exceeds the maximum size, remove the least recently used item
                if len(cache_info.cache_dict) > max_size:
                    # first covert the dict to a list so we can track the order
                    cache_list = list(cache_info.cache_dict.items())
                    # remove the first element in the list, which is the least 
                    # recently used item
                    cache_list.pop(0)
                    # convert the list back to a dict and update
                    cache_info.cache_dict = dict(cache_list)
                    # update the cache size again, which should = max_size
                    cache_info.cur_size = len(cache_info.cache_dict)  
                return cache_info.cache_dict[key]
        # update the attribute of the now-decorated new func
        new_func.cache_info = cache_info
        return new_func     
    return decorator

In [None]:
# FIXME: delete "ALREDAY MAPPED" at some pt
EXCLUDED_LABELS_DEFAULT = ["NO MAPPING", "UNSURE", "MAPPED ELSEWHERE", "SPECIAL CASE", "ALREADY MAPPED"] 

# find all the relevant item ids for a table
def get_relevant_item_ids(
    mapping_df: pd.DataFrame, decision_col: str, excluded_labels: list = EXCLUDED_LABELS_DEFAULT):
    '''
    decision_col:
    '''
    return mapping_df.loc[
        ~mapping_df[decision_col].isin(excluded_labels),
        "itemid"
        ].unique()

def rename_and_reorder_cols(df, rename_mapper_dict: dict, new_col_order: list):
    return (
        df.rename(columns = rename_mapper_dict)
        .reindex(columns = new_col_order)
        )

def find_duplicates(df, cols: list[str] = ["hadm_id", "time", "itemid"]):
    '''
    Check whether there are duplicates -- more than one populated value -- for what is supposed to be 
    unique combination of columns. That is, for the same measured variable (e.g. vital_category) at
    the same time during the same encounter, there should be only one corresponding value.
    '''
    return df[df.duplicated(subset = cols, keep = False)]

def check_duplicates(df, additional_cols: list):
    '''
    Check whether there are duplicates -- more than one populated value -- for what is supposed to be 
    unique combination of columns. That is, for the same measured variable (e.g. vital_category) at
    the same time during the same encounter, there should be only one corresponding value.
    '''
    cols_to_check = ["encounter_id", "recorded_dttm"].extend(additional_cols)
    return df[df.duplicated(subset = cols_to_check, keep = False)]

In [None]:
# @lru_cache()
def item_id_to_feature_value(item_id: int, col: str = "label", df = d_items):
    '''
    Find the corresponding feature value of an item by id.
    i.e. find the label, or linksto, or item with id 226732.
    '''
    print(f"searching for the {col} of item {item_id}")
    return df.loc[df["itemid"] == item_id, col].values[0]

# @lru_cache()
def item_id_to_label(item_id: int) -> str:
    '''
    Helper function that returns the "label" string of an item given its item_id. 
    '''
    return item_id_to_feature_value(item_id)

def item_id_to_events_df(item_id: int, simplify: bool = False) -> pd.DataFrame:
    '''
    Return in a pandas df all the events associated with an item id.
    - simplify: whether to return the original df (False), or a simplified one 
    with some columns (particulary timestamps) renamed to support integration 
    between different events df.  
    '''
    # find whether it is chartevents, or procedure events, etc.
    linksto_table_name = item_id_to_feature_value(item_id, col = "linksto")
    # turn string into a dj object
    linksto_df: pd.DataFrame = globals()[linksto_table_name]
    events_df = linksto_df.loc[linksto_df["itemid"] == item_id, :]
    # if does not simplify, return the original column
    if not simplify:
        return events_df
    # else, if simplified:
    elif linksto_table_name == "procedureevents": # FIXME: trach is complex and need additional attention
        events_df_simplified = events_df.loc[
            :, ['subject_id', 'hadm_id', 'stay_id', 'endtime', 'itemid', 'value', 'valueuom']
        ].rename(columns = {"endtime": "time"})
        return events_df_simplified
    elif linksto_table_name == "chartevents":
        events_df_simplified = events_df.loc[
            :, ['subject_id', 'hadm_id', 'stay_id', 'charttime', 'itemid', 'value', 'valueuom']
        ].rename(columns = {"charttime": "time"})
        return events_df_simplified
        
    # FIXME: likely an issue if data struct of different events table are different 

def item_ids_list_to_events_df(item_ids: list):
    df_list = [item_id_to_events_df(item_id, simplify = True) for item_id in item_ids]
    df_merged = pd.concat(df_list) #.head().assign(
        ## linksto = lambda df: df["itemid"].apply(lambda item_id: item_id_to_feature_value(item_id, col = "linksto"))
    # )
    return df_merged 
    # FIXME: automatically add the label and linksto table source columns -- create cache?

## `ItemFinder` class

In [None]:
class ItemFinder():
    def __init__(self, kw = None, df = d_items, col: str = "label", 
                 case: bool = False):
        '''
        Look up an item by keyword from the `d_items` table of the `icu` module.
        - df = `d_items`
        - col = {"label", "abbr"}
        '''
        self.kw = kw 
        self.df = df
        self.col = "abbreviation" if col == "abbr" else col
        self.case = case

        # df of items that match the key words
        self.kw_items_df = df[
            df[self.col].str.contains(kw, case = case, na = False)
        ]
        # list of ids for items that match the key words
        self.kw_items_ids = self.kw_items_df["itemid"].values
        # a string list of non-duplicated events table names, e.g. ["chartevents", "procedureevents"]
        self.linksto_table_names = self.kw_items_df["linksto"].unique()
                                                                
        self.kw_chartevents = chartevents.loc[
            chartevents["itemid"].isin(self.kw_items_ids),
            :
        ]
        self.item_freq = self.generate_item_freq()

        self.candidate_table = self.make_candidate_table()

    def generate_item_freq(self):
        '''
        Iterative over each events table, find the items freq therein, and combine into one df.
        '''
        freq_df_ls = [] # a list of df's
        for table_name in self.linksto_table_names:
            # fetch the object by name, i.e. chartevents, procedureevents, etc.
            events_df = globals()[table_name]
            # a df of events associated with the key word items
            kw_events_df = events_df.loc[
                events_df["itemid"].isin(self.kw_items_ids),
                :
            ]
            # a df of item freq for one event type  
            item_freq_df = kw_events_df.value_counts("itemid")

            # check if the df is empty
            if not item_freq_df.empty:
                freq_df_ls.append(item_freq_df)
        
        # check if the list len is 0, meaning 
        if len(freq_df_ls) != 0:
            # return a df of all the item freq of all events table concat-ed together
            return pd.concat(freq_df_ls)
        else: 
            return self.kw_chartevents.value_counts("itemid")

    def make_candidate_table(self):
        '''
        # TODO
        '''
        if not self.item_freq.empty:
            return (
                self.kw_items_df
                .loc[:, ["itemid", "label", "abbreviation", "linksto", "category", "unitname", "param_type"]]
                # FIXME
                .join(self.item_freq, on = "itemid", validate = "1:1")
                .sort_values(by = "count", ascending = False) 
                .assign(
                    value_instances = lambda x: x["itemid"].apply(item_id_to_value_instances)
                )
            )
        else: 
            return "No matching result found."

@lru_cache()
def item_id_to_value_instances(item_id: int):
    '''
    Wrapper
    '''
    label = item_id_to_feature_value(item_id, "label")

    param_type = item_id_to_feature_value(item_id, "param_type")
    
    if param_type == "Numeric":
        val_instances = item_id_to_value_instances_numeric(item_id)
    elif param_type == "Text":
        val_instances = item_id_to_value_instances_categorical(item_id).to_dict()
    else:
        return param_type
    print(f"item label: {label}; value instances: {str(val_instances)}")
    return str(val_instances)

def item_id_to_value_instances_categorical(item_id: int, events: pd.DataFrame = chartevents):
    '''
    Return all the unique categories
    '''
    assoc_events = events.loc[events["itemid"] == item_id, :]
    categories: pd.Series = assoc_events.value_counts("value") 
    return categories
    
def item_id_to_value_instances_numeric(item_id: int, events: pd.DataFrame = chartevents):
    '''
    Find max, min, mean of a continuous, or numeric, item.
    '''
    valuenum_col = events.loc[events["itemid"] == item_id, :]["valuenum"]
    val_max, val_min, val_mean = valuenum_col.max(), valuenum_col.min(), round(valuenum_col.mean(), 2)
    return f"Max: {val_max}, Min: {val_min}, Mean: {val_mean}"

NameError: name 'chartevents' is not defined

# `patient_encounters` table

In [None]:
# Since each unique hospital visit for a patient is assigned a unique hadm_id, the admissions table can be considered as a definition table for hadm_id.


# `patient_demographics` table

## dev

In [5]:
# multiple race for one patient
race_counts = admissions.groupby('subject_id')['race'].nunique()
multi_race_indices = race_counts[race_counts > 1].index
multi_race_encounters = admissions[
    admissions['subject_id'].isin(multi_race_indices)
    ][["subject_id", "hadm_id", "race", "admittime", "admission_type", "admission_location"]]
multi_race_encounters

Unnamed: 0,subject_id,hadm_id,race,admittime,admission_type,admission_location
87,10002013,21763296,WHITE,2165-11-23 08:19:00,DIRECT EMER.,CLINIC REFERRAL
88,10002013,21975601,WHITE,2159-12-14 23:55:00,EW EMER.,EMERGENCY ROOM
89,10002013,23581541,OTHER,2160-05-18 07:45:00,SURGICAL SAME DAY ADMISSION,PHYSICIAN REFERRAL
90,10002013,23745275,WHITE,2157-10-31 12:54:00,AMBULATORY OBSERVATION,PROCEDURE SITE
91,10002013,24760295,OTHER,2160-07-10 19:33:00,EW EMER.,EMERGENCY ROOM
...,...,...,...,...,...,...
431136,19997911,27144120,WHITE - OTHER EUROPEAN,2196-09-16 23:53:00,OBSERVATION ADMIT,EMERGENCY ROOM
431137,19997911,27402845,WHITE,2193-09-02 10:44:00,EU OBSERVATION,EMERGENCY ROOM
431190,19999287,20175828,BLACK/AFRICAN AMERICAN,2197-08-03 20:58:00,EW EMER.,EMERGENCY ROOM
431191,19999287,22997012,BLACK/AFRICAN AMERICAN,2197-07-26 03:29:00,EW EMER.,EMERGENCY ROOM


In [6]:
# but only one race per encounter: 
race_counts = admissions.groupby('hadm_id')['race'].nunique()
race_counts[race_counts > 1].index

Index([], dtype='int64', name='hadm_id')

In [9]:
# check for South Americans
south_american_subject_ids = admissions.loc[admissions["race"] == "SOUTH AMERICAN", "subject_id"].unique()
sa_race_counts = (
    admissions[admissions["subject_id"].isin(south_american_subject_ids)]
    .groupby('subject_id')['race'].nunique()
)
multi_race_indices = sa_race_counts[sa_race_counts > 1].index
multi_race_encounters = admissions[
    admissions['subject_id'].isin(multi_race_indices)
    ][["subject_id", "hadm_id", "race", "admittime", "admission_type", "admission_location"]]
multi_race_encounters

Unnamed: 0,subject_id,hadm_id,race,admittime,admission_type,admission_location
6419,10154376,24770530,SOUTH AMERICAN,2178-09-15 02:10:00,EU OBSERVATION,EMERGENCY ROOM
6420,10154376,26894987,UNKNOWN,2177-12-02 13:41:00,URGENT,PHYSICIAN REFERRAL
6421,10154376,27022201,HISPANIC OR LATINO,2174-07-28 18:23:00,EW EMER.,EMERGENCY ROOM
6422,10154376,27499017,HISPANIC OR LATINO,2174-08-09 22:34:00,EW EMER.,EMERGENCY ROOM
6423,10154376,27745331,UNKNOWN,2176-09-11 01:00:00,URGENT,PHYSICIAN REFERRAL
...,...,...,...,...,...,...
429330,19954807,21606941,SOUTH AMERICAN,2189-03-18 20:07:00,EW EMER.,PACU
429331,19954807,22024006,HISPANIC/LATINO - SALVADORAN,2193-09-22 20:19:00,OBSERVATION ADMIT,EMERGENCY ROOM
429332,19954807,23679253,HISPANIC/LATINO - SALVADORAN,2189-12-12 21:35:00,EU OBSERVATION,EMERGENCY ROOM
429333,19954807,27508241,HISPANIC/LATINO - SALVADORAN,2191-11-10 23:10:00,EU OBSERVATION,EMERGENCY ROOM


In [18]:
(sa_race_counts == 1).sum()

212

In [16]:
len(multi_race_encounters["subject_id"].unique())

56

# `limited_identifiers` table

## dev

In [None]:
# adm and discharge time from `admissions` table



# `Encounter_demographics_disposition` table

In [35]:
admissions.head()

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,deathtime,admission_type,admit_provider_id,admission_location,discharge_location,insurance,language,marital_status,race,edregtime,edouttime,hospital_expire_flag
0,10000032,22595853,2180-05-06 22:23:00,2180-05-07 17:15:00,,URGENT,P874LG,TRANSFER FROM HOSPITAL,HOME,Other,ENGLISH,WIDOWED,WHITE,2180-05-06 19:17:00,2180-05-06 23:30:00,0
1,10000032,22841357,2180-06-26 18:27:00,2180-06-27 18:49:00,,EW EMER.,P09Q6Y,EMERGENCY ROOM,HOME,Medicaid,ENGLISH,WIDOWED,WHITE,2180-06-26 15:54:00,2180-06-26 21:31:00,0
2,10000032,25742920,2180-08-05 23:44:00,2180-08-07 17:50:00,,EW EMER.,P60CC5,EMERGENCY ROOM,HOSPICE,Medicaid,ENGLISH,WIDOWED,WHITE,2180-08-05 20:58:00,2180-08-06 01:44:00,0
3,10000032,29079034,2180-07-23 12:35:00,2180-07-25 17:55:00,,EW EMER.,P30KEH,EMERGENCY ROOM,HOME,Medicaid,ENGLISH,WIDOWED,WHITE,2180-07-23 05:54:00,2180-07-23 14:00:00,0
4,10000068,25022803,2160-03-03 23:16:00,2160-03-04 06:26:00,,EU OBSERVATION,P51VDL,EMERGENCY ROOM,,Other,ENGLISH,SINGLE,WHITE,2160-03-03 21:55:00,2160-03-04 06:26:00,0


In [37]:
admissions.value_counts("discharge_location").reset_index()

Unnamed: 0,discharge_location,count
0,HOME,155423
1,HOME HEALTH CARE,75572
2,SKILLED NURSING FACILITY,43024
3,REHAB,10523
4,DIED,8511
5,CHRONIC/LONG TERM ACUTE CARE,7144
6,HOSPICE,3469
7,AGAINST ADVICE,2590
8,PSYCH FACILITY,2262
9,ACUTE HOSPITAL,1610


# `ADT` table

In [29]:
transfers.head(10)

Unnamed: 0,subject_id,hadm_id,transfer_id,eventtype,careunit,intime,outtime
0,10000032,22595853.0,33258284,ED,Emergency Department,2180-05-06 19:17:00,2180-05-06 23:30:00
1,10000032,22595853.0,35223874,admit,Transplant,2180-05-06 23:30:00,2180-05-07 17:21:27
2,10000032,22595853.0,36904543,discharge,,2180-05-07 17:21:27,
3,10000032,22841357.0,34100253,discharge,,2180-06-27 18:49:12,
4,10000032,22841357.0,34703856,admit,Transplant,2180-06-26 21:31:00,2180-06-27 18:49:12
5,10000032,22841357.0,38112554,ED,Emergency Department,2180-06-26 15:54:00,2180-06-26 21:31:00
6,10000032,25742920.0,35509340,admit,Transplant,2180-08-06 01:44:00,2180-08-07 17:50:44
7,10000032,25742920.0,35968195,ED,Emergency Department,2180-08-05 20:58:00,2180-08-06 01:44:00
8,10000032,25742920.0,38883756,discharge,,2180-08-07 17:50:44,
9,10000032,29079034.0,32952584,ED,Emergency Department,2180-07-22 16:24:00,2180-07-23 05:54:00


In [25]:
transfers.value_counts("careunit").reset_index()

Unnamed: 0,careunit,count
0,Emergency Department,625907
1,Medicine,149709
2,Emergency Department Observation,80909
3,Discharge Lounge,52950
4,Med/Surg,47581
5,Medicine/Cardiology,44014
6,Neurology,39126
7,Hematology/Oncology,35542
8,Vascular,30939
9,Transplant,29963


In [30]:
transfers[
    transfers["careunit"] == "Discharge Lounge"
]

Unnamed: 0,subject_id,hadm_id,transfer_id,eventtype,careunit,intime,outtime
62,10000560,28979390.0,34247338,admit,Discharge Lounge,2189-10-15 02:27:58,2189-10-15 19:57:50
118,10000935,25849114.0,32096064,transfer,Discharge Lounge,2187-10-10 20:55:17,2187-10-10 21:11:13
176,10001186,21334040.0,37686359,admit,Discharge Lounge,2190-07-19 04:33:17,2190-07-19 18:39:09
179,10001186,24016413.0,36417468,admit,Discharge Lounge,2188-10-20 02:48:19,2188-10-20 21:46:12
223,10001401,21544441.0,39792652,admit,Discharge Lounge,2131-06-04 06:57:13,2131-06-04 21:48:12
...,...,...,...,...,...,...,...
1890859,19999442,26785317.0,34494544,admit,Discharge Lounge,2148-11-19 03:53:10,2148-11-19 14:23:43
1890908,19999784,23664472.0,35775053,admit,Discharge Lounge,2119-07-24 04:00:12,2119-07-24 19:29:30
1890927,19999784,26194817.0,37769396,transfer,Discharge Lounge,2119-06-18 21:48:57,2119-06-19 00:08:37
1890929,19999784,26194817.0,39890059,transfer,Discharge Lounge,2119-06-19 00:25:10,2119-06-19 16:10:16


In [26]:
transfers.value_counts("eventtype").reset_index()

Unnamed: 0,eventtype,count
0,ED,625907
1,admit,431241
2,discharge,431231
3,transfer,402593


In [31]:
adt_events_units = transfers.value_counts(["eventtype", "careunit"], dropna=False).reset_index()
adt_events_units

Unnamed: 0,eventtype,careunit,count
0,ED,Emergency Department,625907
1,discharge,,431231
2,admit,Medicine,79483
3,admit,Emergency Department Observation,72522
4,transfer,Medicine,70226
...,...,...,...
73,admit,Obstetrics Antepartum,700
74,admit,Neuro Stepdown,504
75,admit,Obstetrics Postpartum,104
76,admit,Unknown,72


# `vitals` table

## utils

In [None]:
vital_col_names = ["encounter_id", "recorded_dttm", "vital_name", "vital_category", "vital_value", "meas_site_name"]

vitals_temp_site_mapper_dict = {
    'Oral': 'not specified', 'Blood': 'not specified', 'Axillary': 'not specified', 
    'Rectal': 'core', 'Esophogeal': 'core', 'Temporal': 'not specified', 'Tympanic': 'core', 'NA': "not specified"}

vitals_col_rename_mapper_dict = {
    "hadm_id": "encounter_id", 
    "time": "recorded_dttm",
    "value": "vital_value"
    }

@lru_cache()
def convert_f_to_c(temp_f):
    if isinstance(temp_f, str) or isinstance(temp_f, int):
        temp_f = float(temp_f) 
    
    if isinstance(temp_f, float):
        temp_c = (temp_f - 32) * 5 / 9
        return round(temp_c, 1) # so 39.3333 -> 39.3
    else:
        raise("wrong type")

## regular cases

In [50]:
# find vital_items_ids
vitals_items_ids = get_relevant_item_ids(mapping_df = vitals_mapping, decision_col="meas_site_name")
vitals_events = item_ids_list_to_events_df(vitals_items_ids)
vitals_events

Unnamed: 0,subject_id,hadm_id,stay_id,time,itemid,value,valueuom
0,10000032,29079034,39553978,2180-07-23 21:01:00,220179,82,mmHg
4,10000032,29079034,39553978,2180-07-23 22:00:00,220179,85,mmHg
10,10000032,29079034,39553978,2180-07-23 19:00:00,220179,93,mmHg
21,10000032,29079034,39553978,2180-07-23 20:00:00,220179,90,mmHg
163,10000032,29079034,39553978,2180-07-23 14:11:00,220179,84,mmHg
...,...,...,...,...,...,...,...
313446049,19995595,21784060,34670930,2126-10-22 04:00:00,225312,64,mmHg
313446200,19995595,21784060,34670930,2126-10-22 06:00:00,225312,97,mmHg
313452288,19995595,21784060,34670930,2126-10-21 21:00:00,225312,62,mmHg
313452316,19995595,21784060,34670930,2126-10-21 23:00:00,225312,71,mmHg


In [58]:
vitals_events["vital_name"] = vitals_events["itemid"].apply(lambda x: vital_name_mapper_dict[x])
vitals_events["vital_category"] = vitals_events["itemid"].apply(lambda x: vital_category_mapper_dict[x])
vitals_events["meas_site_name"] = vitals_events["itemid"].apply(lambda x: vital_site_mapper_dict[x])
# FIXME: efficiency

In [68]:
vitals_events

Unnamed: 0,subject_id,hadm_id,stay_id,time,itemid,value,valueuom,vital_name,vital_category,meas_site_name
0,10000032,29079034,39553978,2180-07-23 21:01:00,220179,82,mmHg,Non Invasive Blood Pressure systolic,sbp,not specified
4,10000032,29079034,39553978,2180-07-23 22:00:00,220179,85,mmHg,Non Invasive Blood Pressure systolic,sbp,not specified
10,10000032,29079034,39553978,2180-07-23 19:00:00,220179,93,mmHg,Non Invasive Blood Pressure systolic,sbp,not specified
21,10000032,29079034,39553978,2180-07-23 20:00:00,220179,90,mmHg,Non Invasive Blood Pressure systolic,sbp,not specified
163,10000032,29079034,39553978,2180-07-23 14:11:00,220179,84,mmHg,Non Invasive Blood Pressure systolic,sbp,not specified
...,...,...,...,...,...,...,...,...,...,...
313446049,19995595,21784060,34670930,2126-10-22 04:00:00,225312,64,mmHg,ART BP Mean,map,arterial
313446200,19995595,21784060,34670930,2126-10-22 06:00:00,225312,97,mmHg,ART BP Mean,map,arterial
313452288,19995595,21784060,34670930,2126-10-21 21:00:00,225312,62,mmHg,ART BP Mean,map,arterial
313452316,19995595,21784060,34670930,2126-10-21 23:00:00,225312,71,mmHg,ART BP Mean,map,arterial


In [76]:
vitals_final = rename_and_reorder_cols(vitals_events, vitals_col_rename_mapper_dict, vital_col_names)
vitals_final

Unnamed: 0,encounter_id,recorded_dttm,vital_name,vital_category,vital_value,meas_site_name
0,29079034,2180-07-23 21:01:00,Non Invasive Blood Pressure systolic,sbp,82,not specified
4,29079034,2180-07-23 22:00:00,Non Invasive Blood Pressure systolic,sbp,85,not specified
10,29079034,2180-07-23 19:00:00,Non Invasive Blood Pressure systolic,sbp,93,not specified
21,29079034,2180-07-23 20:00:00,Non Invasive Blood Pressure systolic,sbp,90,not specified
163,29079034,2180-07-23 14:11:00,Non Invasive Blood Pressure systolic,sbp,84,not specified
...,...,...,...,...,...,...
313446049,21784060,2126-10-22 04:00:00,ART BP Mean,map,64,arterial
313446200,21784060,2126-10-22 06:00:00,ART BP Mean,map,97,arterial
313452288,21784060,2126-10-21 21:00:00,ART BP Mean,map,62,arterial
313452316,21784060,2126-10-21 23:00:00,ART BP Mean,map,71,arterial


### validation over duplicates

In [77]:
check_duplicates(vitals_final, ["vital_category", "vital_value"] )

Unnamed: 0,encounter_id,recorded_dttm,vital_name,vital_category,vital_value,meas_site_name


## `temp_c` special case

In [41]:
temp_events = item_ids_list_to_events_df([223761, 223762, 224642])
temp_events

Unnamed: 0,subject_id,hadm_id,stay_id,time,itemid,value,valueuom
17,10000032,29079034,39553978,2180-07-23 20:00:00,223761,99.5,°F
162,10000032,29079034,39553978,2180-07-23 14:00:00,223761,98.7,°F
371,10000032,29079034,39553978,2180-07-23 17:00:00,223761,98.7,°F
550,10000980,26913865,39765666,2189-06-27 09:07:00,223761,98,°F
730,10000980,26913865,39765666,2189-06-27 12:00:00,223761,97.7,°F
...,...,...,...,...,...,...,...
313644553,19999987,23865745,36195440,2145-11-03 04:00:00,224642,Oral,
313644735,19999987,23865745,36195440,2145-11-02 23:24:00,224642,Oral,
313644811,19999987,23865745,36195440,2145-11-02 23:41:00,224642,Oral,
313644876,19999987,23865745,36195440,2145-11-04 20:00:00,224642,Oral,


In [12]:
# pivot directly
temp_wider = temp_events.pivot(
    index = ["hadm_id", "charttime"], 
    columns = "itemid",
    values = "value"
    ).reset_index()
temp_wider

itemid,hadm_id,charttime,223761,223762,224642
0,20000094,2150-03-02 15:30:00,97.5,,Oral
1,20000094,2150-03-02 17:00:00,97.5,,Oral
2,20000094,2150-03-02 20:00:00,96.3,,Axillary
3,20000094,2150-03-03 04:00:00,99,,Rectal
4,20000147,2121-08-30 22:11:00,97.6,,Oral
...,...,...,...,...,...
1920380,29999828,2180-10-29 08:00:00,99.1,,Oral
1920381,29999828,2180-10-29 09:00:00,98.2,,Oral
1920382,29999828,2180-10-29 12:00:00,99.5,,Oral
1920383,29999828,2180-10-29 14:00:00,99,,Oral


In [16]:
# map temp_site to the clif categories of meas_site_name
temp_wider["meas_site_name"] = temp_wider[224642].apply(
    lambda x: vitals_temp_site_mapper_dict[x] if pd.notna(x) else "not specified"
)

# 223761 = temp in f, 223762 = temp in c
temp_wider["vital_value"] = temp_wider[223762].fillna(
    temp_wider[223761].apply(convert_f_to_c)
    )

temp_wider['vital_name'] = temp_wider.apply(
    lambda row: "Temperature Celsius" if pd.notna(row[223762]) else "Temperature Fahrenheit", 
    axis = "columns"
    )

temp_wider["vital_category"] = "temp_c"

In [64]:
temp_final = rename_and_reorder_cols(temp_wider, vitals_col_rename_mapper_dict, vital_col_names)
temp_final

itemid,encounter_id,recorded_dttm,vital_name,vital_category,vital_value,meas_site_name
0,20000094,2150-03-02 15:30:00,Temperature Fahrenheit,temp_c,36.4,not specified
1,20000094,2150-03-02 17:00:00,Temperature Fahrenheit,temp_c,36.4,not specified
2,20000094,2150-03-02 20:00:00,Temperature Fahrenheit,temp_c,35.7,not specified
3,20000094,2150-03-03 04:00:00,Temperature Fahrenheit,temp_c,37.2,core
4,20000147,2121-08-30 22:11:00,Temperature Fahrenheit,temp_c,36.4,not specified
...,...,...,...,...,...,...
1920380,29999828,2180-10-29 08:00:00,Temperature Fahrenheit,temp_c,37.3,not specified
1920381,29999828,2180-10-29 09:00:00,Temperature Fahrenheit,temp_c,36.8,not specified
1920382,29999828,2180-10-29 12:00:00,Temperature Fahrenheit,temp_c,37.5,not specified
1920383,29999828,2180-10-29 14:00:00,Temperature Fahrenheit,temp_c,37.2,not specified


In [39]:
temp_final = (
    temp_wider
    .rename(columns=vitals_col_rename_mapper_dict)
    .reindex(columns = vital_col_names)
)

temp_final

itemid,encounter_id,recorded_dttm,vital_name,vital_category,vital_value,meas_site_name
0,20000094,2150-03-02 15:30:00,Temperature Fahrenheit,temp_c,36.4,not specified
1,20000094,2150-03-02 17:00:00,Temperature Fahrenheit,temp_c,36.4,not specified
2,20000094,2150-03-02 20:00:00,Temperature Fahrenheit,temp_c,35.7,not specified
3,20000094,2150-03-03 04:00:00,Temperature Fahrenheit,temp_c,37.2,core
4,20000147,2121-08-30 22:11:00,Temperature Fahrenheit,temp_c,36.4,not specified
...,...,...,...,...,...,...
1920380,29999828,2180-10-29 08:00:00,Temperature Fahrenheit,temp_c,37.3,not specified
1920381,29999828,2180-10-29 09:00:00,Temperature Fahrenheit,temp_c,36.8,not specified
1920382,29999828,2180-10-29 12:00:00,Temperature Fahrenheit,temp_c,37.5,not specified
1920383,29999828,2180-10-29 14:00:00,Temperature Fahrenheit,temp_c,37.2,not specified


### validation over duplicates

In [72]:
check_duplicates(temp_final, ["vital_category",	"vital_value"] )

itemid,encounter_id,recorded_dttm,vital_name,vital_category,vital_value,meas_site_name


## merge and save

In [81]:
# COMBINE:
vitals_merged = pd.concat([
    vitals_final, temp_final
])
vitals_merged

Unnamed: 0,encounter_id,recorded_dttm,vital_name,vital_category,vital_value,meas_site_name
0,29079034,2180-07-23 21:01:00,Non Invasive Blood Pressure systolic,sbp,82,not specified
4,29079034,2180-07-23 22:00:00,Non Invasive Blood Pressure systolic,sbp,85,not specified
10,29079034,2180-07-23 19:00:00,Non Invasive Blood Pressure systolic,sbp,93,not specified
21,29079034,2180-07-23 20:00:00,Non Invasive Blood Pressure systolic,sbp,90,not specified
163,29079034,2180-07-23 14:11:00,Non Invasive Blood Pressure systolic,sbp,84,not specified
...,...,...,...,...,...,...
1920380,29999828,2180-10-29 08:00:00,Temperature Fahrenheit,temp_c,37.3,not specified
1920381,29999828,2180-10-29 09:00:00,Temperature Fahrenheit,temp_c,36.8,not specified
1920382,29999828,2180-10-29 12:00:00,Temperature Fahrenheit,temp_c,37.5,not specified
1920383,29999828,2180-10-29 14:00:00,Temperature Fahrenheit,temp_c,37.2,not specified


### validate the dtypes

In [82]:
vitals_merged.dtypes

encounter_id       int64
recorded_dttm     object
vital_name        object
vital_category    object
vital_value       object
meas_site_name    object
dtype: object

In [83]:
vitals_merged["vital_value"] = vitals_merged["vital_value"].apply(float)

In [85]:
vitals_merged

Unnamed: 0,encounter_id,recorded_dttm,vital_name,vital_category,vital_value,meas_site_name
0,29079034,2180-07-23 21:01:00,Non Invasive Blood Pressure systolic,sbp,82.0,not specified
4,29079034,2180-07-23 22:00:00,Non Invasive Blood Pressure systolic,sbp,85.0,not specified
10,29079034,2180-07-23 19:00:00,Non Invasive Blood Pressure systolic,sbp,93.0,not specified
21,29079034,2180-07-23 20:00:00,Non Invasive Blood Pressure systolic,sbp,90.0,not specified
163,29079034,2180-07-23 14:11:00,Non Invasive Blood Pressure systolic,sbp,84.0,not specified
...,...,...,...,...,...,...
1920380,29999828,2180-10-29 08:00:00,Temperature Fahrenheit,temp_c,37.3,not specified
1920381,29999828,2180-10-29 09:00:00,Temperature Fahrenheit,temp_c,36.8,not specified
1920382,29999828,2180-10-29 12:00:00,Temperature Fahrenheit,temp_c,37.5,not specified
1920383,29999828,2180-10-29 14:00:00,Temperature Fahrenheit,temp_c,37.2,not specified


In [84]:
# save
vitals_merged.to_parquet('../rclif/clif_vitals.parquet')

# `respiratory_support` table

## utils

In [97]:
resp_item_ids = get_relevant_item_ids(mapping_df = resp_mapping, decision_col = "variable")
resp_item_ids

array([220210, 220277, 226732, 223835, 220339, 224685, 224687, 224695,
       223834, 223848, 224690, 223849, 224688, 224684, 224738, 224686,
       224701, 224696, 229314, 224691, 224700, 225448, 226237, 227287,
       227577, 227579, 227580, 227582, 224702, 227581])

In [98]:
resp_events: pd.DataFrame = item_ids_list_to_events_df(resp_item_ids)
resp_events.head()

Unnamed: 0,subject_id,hadm_id,stay_id,time,itemid,value,valueuom
7,10000032,29079034,39553978,2180-07-23 22:00:00,220210,20,insp/min
13,10000032,29079034,39553978,2180-07-23 19:00:00,220210,16,insp/min
24,10000032,29079034,39553978,2180-07-23 20:00:00,220210,19,insp/min
62,10000032,29079034,39553978,2180-07-23 21:00:00,220210,22,insp/min
167,10000032,29079034,39553978,2180-07-23 14:12:00,220210,24,insp/min


In [17]:
resp_events["variable"] = resp_events["itemid"].apply(lambda x: resp_mapper_dict[x])

In [18]:
resp_events

Unnamed: 0,subject_id,hadm_id,stay_id,time,itemid,value,valueuom,variable
7,10000032,29079034,39553978,2180-07-23 22:00:00,220210,20,insp/min,resp_rate_obs
13,10000032,29079034,39553978,2180-07-23 19:00:00,220210,16,insp/min,resp_rate_obs
24,10000032,29079034,39553978,2180-07-23 20:00:00,220210,19,insp/min,resp_rate_obs
62,10000032,29079034,39553978,2180-07-23 21:00:00,220210,22,insp/min,resp_rate_obs
167,10000032,29079034,39553978,2180-07-23 14:12:00,220210,24,insp/min,resp_rate_obs
...,...,...,...,...,...,...,...,...
307312672,19809456,21636156,35472925,2176-12-18 22:24:00,227581,10,bpm,resp_rate_set
307312680,19809456,21636156,35472925,2176-12-19 04:00:00,227581,10,bpm,resp_rate_set
309766680,19878911,28565267,32426428,2118-12-07 20:00:00,227581,16,bpm,resp_rate_set
309766693,19878911,28565267,32426428,2118-12-08 00:00:00,227581,16,bpm,resp_rate_set


In [19]:
resp_columns = [
    "encounter_id", "recorded_dttm", "device_name", "device_category", 
    "mode_name", "mode_category", "tracheostomy", "fio2_set", "lpm_set",
    "tidal_volume_set", "resp_rate_set", "pressure_control_set", "pressure_support_set",
    "flow_rate_set", "peak_inspiratory_pressure_set", "inspiratory_time_set",
    "peep_set", "tidal_volume_obs", "resp_rate_obs", "plateau_pressure_obs",
    "peak_inspiratory_pressure_obs", "peep_obs", "minute_vent_obs"
    ]

In [20]:
resp_device_rank = ["Vent", "NIPPV", "CPAP", "High Flow NC", "Face Mask", "Trach Collar", "Nasal Cannula", "Room Air", "Other"]
resp_device_rank.index("Vent")

In [21]:
def map_one_stay(stay_id, table_events: pd.DataFrame):
    '''
    find all the relevant items within one stay, and match by charttime.
    
    - table_events: a df of all the events corresponding to that table.
    '''      
    stay_events: pd.DataFrame = table_events.loc[
        table_events["stay_id"] == stay_id, :
    ]
    
    #return stay_events
    # use helper func to check for duplicates
    duplicates = find_resp_duplicates(stay_events)
    # print(duplicates)
    # if there are duplicates, first remove them from the same `stay_events` df
    if not duplicates.empty:
        top_ranked_indices = duplicates.groupby(["hadm_id", "time", "itemid"])["rank"].idxmin()
        # non top-ranked categories to be dropped
        non_top_ranked_indices = duplicates.index.difference(top_ranked_indices)
        stay_events.drop(non_top_ranked_indices, inplace = True)
    
    stay_events = stay_events.assign(
        label = lambda df: df["itemid"].apply(item_id_to_label),
        variable = lambda df: df["itemid"].apply(lambda x: resp_mapper_dict[x])
    )

    return stay_events # return this for now, TODO: incorporate more after the code is tested

    wider = stay_events.pivot(
        index = ["hadm_id", "time"], 
        columns = ["itemid"],
        values = "value" # keep this just a str, not a list, to avoid having double col index
    ).reset_index()
    
    return wider # return this for now, TODO: incorporate more when the code is tested

## EDA

#### check duplicates

In [24]:
resp_duplicates.query("itemid == 224696") 

Unnamed: 0,subject_id,hadm_id,stay_id,time,itemid,value,valueuom,variable
297727605,19529415,26871621,36123037,2166-07-20 01:00:00,224696,26,cmH2O,plateau_pressure_obs
297729572,19529415,26871621,39214730,2166-07-20 01:00:00,224696,37,cmH2O,plateau_pressure_obs


In [25]:
# 2166-07-19 23:50:00	2166-07-20 01:20:08
resp_events.query("stay_id == 39214730").sort_values("time")

Unnamed: 0,subject_id,hadm_id,stay_id,time,itemid,value,valueuom,variable
297729518,19529415,26871621,39214730,2166-07-20 00:00:00,223848,Drager,,device_name
297729520,19529415,26871621,39214730,2166-07-20 00:00:00,224700,8,cmH2O,peep_obs
297729523,19529415,26871621,39214730,2166-07-20 00:00:00,226732,Tracheostomy tube,,device_name
297729519,19529415,26871621,39214730,2166-07-20 00:00:00,224696,35,cmH2O,plateau_pressure_obs
297729546,19529415,26871621,39214730,2166-07-20 00:17:00,224738,0.6,sec,inspiratory_time_set
297729542,19529415,26871621,39214730,2166-07-20 00:17:00,224690,28,insp/min,resp_rate_obs
297729544,19529415,26871621,39214730,2166-07-20 00:17:00,224695,45,cmH2O,peak_inspiratory_pressure_obs
297729537,19529415,26871621,39214730,2166-07-20 00:17:00,224684,400,mL,tidal_volume_set
297729539,19529415,26871621,39214730,2166-07-20 00:17:00,224687,14.8,L/min,minute_vent_obs
297729538,19529415,26871621,39214730,2166-07-20 00:17:00,224685,421,mL,tidal_volume_obs


In [26]:
# 2166-07-20 04:34:07 -> 2166-07-21 18:06:26
resp_events.query("stay_id == 36123037").sort_values("time").head(30)

Unnamed: 0,subject_id,hadm_id,stay_id,time,itemid,value,valueuom,variable
297727600,19529415,26871621,36123037,2166-07-20 01:00:00,226732,Tracheostomy tube,,device_name
297727606,19529415,26871621,36123037,2166-07-20 01:00:00,224700,12,cmH2O,peep_obs
297727605,19529415,26871621,36123037,2166-07-20 01:00:00,224696,26,cmH2O,plateau_pressure_obs
297727595,19529415,26871621,36123037,2166-07-20 01:00:00,223848,Drager,,device_name
297727694,19529415,26871621,36123037,2166-07-20 01:21:00,220210,36,insp/min,resp_rate_obs
297727627,19529415,26871621,36123037,2166-07-20 01:21:00,223835,60,,fio2_set
297727626,19529415,26871621,36123037,2166-07-20 01:21:00,220339,10,cmH2O,peep_set
297727632,19529415,26871621,36123037,2166-07-20 01:21:00,224685,366,mL,tidal_volume_obs
297727633,19529415,26871621,36123037,2166-07-20 01:21:00,224687,11,L/min,minute_vent_obs
297727637,19529415,26871621,36123037,2166-07-20 01:21:00,224695,31,cmH2O,peak_inspiratory_pressure_obs


In [27]:
icustays.query("hadm_id == 26871621") 

Unnamed: 0,subject_id,hadm_id,stay_id,first_careunit,last_careunit,intime,outtime,los
69624,19529415,26871621,36123037,Cardiac Vascular Intensive Care Unit (CVICU),Cardiac Vascular Intensive Care Unit (CVICU),2166-07-20 04:34:07,2166-07-21 18:06:26,1.564109
69625,19529415,26871621,39214730,Cardiac Vascular Intensive Care Unit (CVICU),Cardiac Vascular Intensive Care Unit (CVICU),2166-07-19 23:50:00,2166-07-20 01:20:08,0.062593


In [28]:
# find the time ranges for the two stays, and see which stay should the 1 am measurement fall into
# so should be stay_id == 39214730
transfers.query("hadm_id == 26871621").sort_values("intime")

Unnamed: 0,subject_id,hadm_id,transfer_id,eventtype,careunit,intime,outtime
1801138,19529415,26871621.0,31849044,ED,Emergency Department,2166-07-19 21:05:00,2166-07-19 23:50:00
1801142,19529415,26871621.0,39214730,admit,Cardiac Vascular Intensive Care Unit (CVICU),2166-07-19 23:50:00,2166-07-20 01:20:08
1801141,19529415,26871621.0,38963722,transfer,Discharge Lounge,2166-07-20 01:20:08,2166-07-20 04:34:07
1801140,19529415,26871621.0,36123037,transfer,Cardiac Vascular Intensive Care Unit (CVICU),2166-07-20 04:34:07,2166-07-21 18:06:26
1801139,19529415,26871621.0,34433983,discharge,,2166-07-21 18:06:26,


## cleaning

In [111]:
# remove duplicates to prepare for pivoting 
# two kinds of duplicates to handle: by devices and other
resp_duplicates: pd.DataFrame = find_duplicates(resp_events)

In [112]:
# 1/ deal with devices
resp_duplicates_devices: pd.DataFrame = resp_duplicates.query("itemid == 226732").copy()
resp_duplicates_devices["device_category"] = resp_duplicates_devices["value"].apply(
    lambda x: resp_device_mapper_dict[x.strip()] if pd.notna(x) else None
    )
resp_duplicates_devices.dropna(subset="device_category",inplace=True)
resp_duplicates_devices["rank"] = resp_duplicates_devices["device_category"].apply(
    lambda x: resp_device_rank.index(x.strip()))
resp_duplicates_devices

Unnamed: 0,subject_id,hadm_id,stay_id,time,itemid,value,valueuom,device_category,rank
8494,10001884,26184834,37510196,2131-01-19 18:40:00,226732,Aerosol-cool,,Other,8
8495,10001884,26184834,37510196,2131-01-19 18:40:00,226732,Endotracheal tube,,Vent,0
8496,10001884,26184834,37510196,2131-01-19 18:40:00,226732,Face tent,,Face Mask,4
8504,10001884,26184834,37510196,2131-01-19 18:45:00,226732,Aerosol-cool,,Other,8
8505,10001884,26184834,37510196,2131-01-19 18:45:00,226732,Endotracheal tube,,Vent,0
...,...,...,...,...,...,...,...,...,...
313631762,19999442,26785317,32336619,2148-11-24 16:20:00,226732,Other,,Other,8
313642973,19999987,23865745,36195440,2145-11-04 07:30:00,226732,Face tent,,Face Mask,4
313642974,19999987,23865745,36195440,2145-11-04 07:30:00,226732,High flow neb,,Other,8
313644011,19999987,23865745,36195440,2145-11-04 09:00:00,226732,Face tent,,Face Mask,4


In [113]:
# deal with the device case - find indices to drop
top_ranked_device_indices = resp_duplicates_devices.groupby(["hadm_id", "time", "itemid"])["rank"].idxmin()
# non top-ranked categories to be dropped
lower_ranked_device_indices = resp_duplicates_devices.index.difference(top_ranked_device_indices)
# drop the designated indices
resp_events_clean = resp_events.drop(lower_ranked_device_indices)
# drop None
resp_events_clean.dropna(subset = "value", inplace=True)

In [116]:
# 2/ deal with duplicate vent reads:
setting_duplicate_indices_to_drop = find_duplicates(resp_events_clean).query("stay_id == 36123037").index
resp_events_clean.drop(setting_duplicate_indices_to_drop, inplace = True)

In [117]:
# check all duplicates are dropped
find_duplicates(resp_events_clean)

Unnamed: 0,subject_id,hadm_id,stay_id,time,itemid,value,valueuom


In [None]:
# create two columns based on item_id: 
resp_events_clean["label"] = resp_events_clean["itemid"].apply(item_id_to_label)
resp_events_clean["variable"] = resp_events_clean["itemid"].apply(lambda x: resp_mapper_dict[x])

In [132]:
resp_events_clean

Unnamed: 0,subject_id,hadm_id,stay_id,time,itemid,value,valueuom,label,variable
7,10000032,29079034,39553978,2180-07-23 22:00:00,220210,20,insp/min,Respiratory Rate,resp_rate_obs
13,10000032,29079034,39553978,2180-07-23 19:00:00,220210,16,insp/min,Respiratory Rate,resp_rate_obs
24,10000032,29079034,39553978,2180-07-23 20:00:00,220210,19,insp/min,Respiratory Rate,resp_rate_obs
62,10000032,29079034,39553978,2180-07-23 21:00:00,220210,22,insp/min,Respiratory Rate,resp_rate_obs
167,10000032,29079034,39553978,2180-07-23 14:12:00,220210,24,insp/min,Respiratory Rate,resp_rate_obs
...,...,...,...,...,...,...,...,...,...
307312672,19809456,21636156,35472925,2176-12-18 22:24:00,227581,10,bpm,BiPap bpm (S/T -Back up),resp_rate_set
307312680,19809456,21636156,35472925,2176-12-19 04:00:00,227581,10,bpm,BiPap bpm (S/T -Back up),resp_rate_set
309766680,19878911,28565267,32426428,2118-12-07 20:00:00,227581,16,bpm,BiPap bpm (S/T -Back up),resp_rate_set
309766693,19878911,28565267,32426428,2118-12-08 00:00:00,227581,16,bpm,BiPap bpm (S/T -Back up),resp_rate_set


## pivoting

In [143]:
resp_wider = resp_events_clean.pivot(
    index = ["hadm_id", "time"], 
    columns = ["variable", "label"],
    values = "value" 
)

In [147]:
resp_wider.columns.sort_index(level=0)

AttributeError: 'MultiIndex' object has no attribute 'sort_index'

In [137]:
resp_wider["lmp_set"] = resp_wider[223834].fillna(resp_wider[227287])

Unnamed: 0_level_0,variable,resp_rate_obs,ALREADY MAPPED,device_name,fio2_set,peep_set,tidal_volume_obs,minute_vent_obs,peak_inspiratory_pressure_obs,lpm_set,device_name,...,peep_obs,tracheostomy,tracheostomy,lpm_set,mode_name,peep_set,peak_inspiratory_pressure_set,flow_rate_set,pressure_control_set,resp_rate_set
Unnamed: 0_level_1,label,Respiratory Rate,O2 saturation pulseoxymetry,O2 Delivery Device(s),Inspired O2 Fraction,PEEP set,Tidal Volume (observed),Minute Volume,Peak Insp. Pressure,O2 Flow,Ventilator Type,...,Total PEEP Level,Percutaneous Tracheostomy,Open Tracheostomy,O2 Flow (additional cannula),BiPap Mode,BiPap EPAP,BiPap IPAP,BiPap O2 Flow,PCV Level,BiPap bpm (S/T -Back up)
hadm_id,time,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
20000094,2150-03-02 15:19:00,19,,,,,,,,,,...,,,,,,,,,,
20000094,2150-03-02 15:27:00,,100,,,,,,,,,...,,,,,,,,,,
20000094,2150-03-02 15:28:00,30,,,,,,,,,,...,,,,,,,,,,
20000094,2150-03-02 15:30:00,25,,,,,,,,,,...,,,,,,,,,,
20000094,2150-03-02 15:34:00,,,Nasal cannula,,,,,,4,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29999828,2180-10-29 12:00:00,19,92,,,,,,,,,...,,,,,,,,,,
29999828,2180-10-29 13:00:00,16,94,,,,,,,,,...,,,,,,,,,,
29999828,2180-10-29 14:00:00,17,96,,,,,,,,,...,,,,,,,,,,
29999828,2180-10-29 15:09:00,14,96,,,,,,,,,...,,,,,,,,,,


# `labs` table

## utils

In [16]:
def item_id_to_uom_instances_categorical(item_id: int, events: pd.DataFrame = chartevents):
    '''
    Return all the unique categories
    '''
    print(f"checking lab item {item_id}")
    assoc_events = events.loc[events["itemid"] == item_id, :]
    categories = assoc_events.value_counts("valueuom").to_dict()
    return str(categories)

In [23]:
def gen_lab_item_candidate_table(kw, blood_only = True, case = False):  
    lab_items_select = d_labitems[d_labitems["label"].str.contains(kw,case=case, na=False)]
    if blood_only:
        lab_items_select = lab_items_select[(d_labitems["fluid"] == "Blood")]
    
    lab_items_cand = (
        lab_items_select
        .join(labevents.value_counts("itemid"), on = "itemid")
        .loc[lambda df: df["count"].notna(), :]
        .sort_values("count", ascending=False)
    )
    
    # add values instances
    lab_items_cand["value_instances"] = lab_items_cand["itemid"].apply(
        lambda x: item_id_to_value_instances_numeric(x, labevents)
        )

    # add valueuom instances
    lab_items_cand["uom_instances"] = lab_items_cand["itemid"].apply(
        lambda x: item_id_to_uom_instances_categorical(x, labevents)
        )
    
    return lab_items_cand

## EDA

In [67]:
d_labitems

Unnamed: 0,itemid,label,fluid,category
0,50801,Alveolar-arterial Gradient,Blood,Blood Gas
1,50802,Base Excess,Blood,Blood Gas
2,50803,"Calculated Bicarbonate, Whole Blood",Blood,Blood Gas
3,50804,Calculated Total CO2,Blood,Blood Gas
4,50805,Carboxyhemoglobin,Blood,Blood Gas
...,...,...,...,...
1617,53150,Anti Hbs,Blood,Chemistry
1618,53151,Anti-la,Blood,Chemistry
1619,53152,HIV FINAL,Blood,Chemistry
1620,53153,HIV Screen,Blood,Chemistry


In [66]:
labevents

Unnamed: 0,labevent_id,subject_id,hadm_id,specimen_id,itemid,order_provider_id,charttime,storetime,value,valuenum,valueuom,ref_range_lower,ref_range_upper,flag,priority,comments
0,1,10000032,,45421181,51237,P28Z0X,2180-03-23 11:51:00,2180-03-23 15:15:00,1.4,1.40,,0.9,1.1,abnormal,ROUTINE,
1,2,10000032,,45421181,51274,P28Z0X,2180-03-23 11:51:00,2180-03-23 15:15:00,___,15.10,sec,9.4,12.5,abnormal,ROUTINE,VERIFIED.
2,3,10000032,,52958335,50853,P28Z0X,2180-03-23 11:51:00,2180-03-25 11:06:00,___,15.00,ng/mL,30.0,60.0,abnormal,ROUTINE,NEW ASSAY IN USE ___: DETECTS D2 AND D3 25-OH ...
3,4,10000032,,52958335,50861,P28Z0X,2180-03-23 11:51:00,2180-03-23 16:40:00,102,102.00,IU/L,0.0,40.0,abnormal,ROUTINE,
4,5,10000032,,52958335,50862,P28Z0X,2180-03-23 11:51:00,2180-03-23 16:40:00,3.3,3.30,g/dL,3.5,5.2,abnormal,ROUTINE,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118171362,118352501,19999987,23865745.0,85842100,51279,,2145-11-09 05:30:00,2145-11-09 07:06:00,3.52,3.52,m/uL,4.2,5.4,abnormal,ROUTINE,
118171363,118352502,19999987,23865745.0,85842100,51301,,2145-11-09 05:30:00,2145-11-09 07:06:00,5.7,5.70,K/uL,4.0,11.0,,ROUTINE,
118171364,118352503,19999987,,12592768,50912,P09IS0,2146-02-07 11:13:00,2146-02-07 16:26:00,1.1,1.10,mg/dL,0.4,1.1,,ROUTINE,
118171365,118352504,19999987,,12592768,50920,P09IS0,2146-02-07 11:13:00,2146-02-07 16:26:00,,,,,,,ROUTINE,"Using this patient's age, gender, and serum cr..."


In [42]:
gen_lab_item_candidate_table("saturation")

  lab_items_select = lab_items_select[(d_labitems["fluid"] == "Blood")]


checking lab item 50817


Unnamed: 0,itemid,label,fluid,category,count,value_instances,uom_instances
15,50817,Oxygen Saturation,Blood,Blood Gas,176225.0,"Max: 998.0, Min: 0.2, Mean: 84.6",{'%': 176225}


In [44]:
ph_table = gen_lab_item_candidate_table("ph")

  lab_items_select = lab_items_select[(d_labitems["fluid"] == "Blood")]


checking lab item 50970
checking lab item 51256
checking lab item 51244
checking lab item 51200
checking lab item 51146
checking lab item 50863
checking lab item 50820
checking lab item 52075
checking lab item 52073
checking lab item 51133
checking lab item 52069
checking lab item 51143
checking lab item 50856
checking lab item 50975
checking lab item 50864
checking lab item 52769
checking lab item 51245
checking lab item 50967
checking lab item 52171
checking lab item 51292
checking lab item 51145
checking lab item 50872
checking lab item 51748
checking lab item 51749
checking lab item 50966
checking lab item 51234
checking lab item 50968
checking lab item 50969
checking lab item 51232
checking lab item 51199
checking lab item 51201
checking lab item 50990
checking lab item 51241
checking lab item 52184


In [68]:
ph_table[
    ph_table["label"].str.contains("pH", case = True)
]

Unnamed: 0,itemid,label,fluid,category,count,value_instances,uom_instances
18,50820,pH,Blood,Blood Gas,564412.0,"Max: 8.92, Min: 0.0, Mean: 7.38",{'units': 564412}


In [45]:
ph_table

Unnamed: 0,itemid,label,fluid,category,count,value_instances,uom_instances
167,50970,Phosphate,Blood,Chemistry,2066052.0,"Max: 58.0, Min: 0.0, Mean: 3.55",{'mg/dL': 2066052}
441,51256,Neutrophils,Blood,Hematology,1241592.0,"Max: 100.0, Min: 0.0, Mean: 65.1",{'%': 1241592}
429,51244,Lymphocytes,Blood,Hematology,1241592.0,"Max: 100.0, Min: 0.0, Mean: 23.64",{'%': 1241592}
385,51200,Eosinophils,Blood,Hematology,1241587.0,"Max: 98.0, Min: 0.0, Mean: 2.19",{'%': 1241587}
333,51146,Basophils,Blood,Hematology,1241587.0,"Max: 63.0, Min: 0.0, Mean: 0.51",{'%': 1241587}
61,50863,Alkaline Phosphatase,Blood,Chemistry,1171421.0,"Max: 5965.0, Min: 0.0, Mean: 127.84",{'IU/L': 1171421}
18,50820,pH,Blood,Blood Gas,564412.0,"Max: 8.92, Min: 0.0, Mean: 7.38",{'units': 564412}
1168,52075,Absolute Neutrophil Count,Blood,Hematology,541658.0,"Max: 880.0, Min: 0.0, Mean: 5.67",{'K/uL': 541657}
1166,52073,Absolute Eosinophil Count,Blood,Hematology,518314.0,"Max: 56.36, Min: 0.0, Mean: 0.15",{'K/uL': 518313}
320,51133,Absolute Lymphocyte Count,Blood,Hematology,518272.0,"Max: 611.23, Min: 0.0, Mean: 1.85",{'K/uL': 518271}


# `medication_admin_continuous` table

## utils

In [50]:
emar = load_mimic_table("hosp", "emar", "pq")
emar_detail = load_mimic_table("hosp", "emar_detail", "pq")

  return pd.read_csv(f'../mimic-iv-2.2/{module}/{table}.csv.gz')


In [51]:
emar.to_parquet("../mimic-iv-2.2/hosp/emar.parquet")
emar_detail.to_parquet("../mimic-iv-2.2/hosp/emar_detail.parquet")

ArrowTypeError: ("Expected bytes, got a 'float' object", 'Conversion failed for column dose_given with type object')

In [72]:
ingredientevents = load_mimic_table("icu", "ingredientevents", "csv")

## EDA

In [81]:
inputevents.value_counts("ordercategorydescription")

ordercategorydescription
Continuous Med    3725846
Drug Push         2506912
Continuous IV     1495015
Bolus             1232089
Non Iv Meds         19031
Name: count, dtype: int64

In [87]:
d_items.dtypes

itemid               int64
label               object
abbreviation        object
linksto             object
category            object
unitname            object
param_type          object
lownormalvalue     float64
highnormalvalue    float64
dtype: object

In [88]:
d_items.set_index("itemid", inplace=True)

In [95]:
rate_label = inputevents[["itemid", "rate", "rateuom", "ordercategorydescription"]].join(d_items, on = "itemid")
rate_label

Unnamed: 0,itemid,rate,rateuom,ordercategorydescription,label,abbreviation,linksto,category,unitname,param_type,lownormalvalue,highnormalvalue
0,226452,,,Bolus,PO Intake,PO Intake,inputevents,Fluids/Intake,mL,Solution,,
1,226452,,,Bolus,PO Intake,PO Intake,inputevents,Fluids/Intake,mL,Solution,,
2,220862,100.000000,mL/hour,Continuous IV,Albumin 25%,Albumin 25%,inputevents,Blood Products/Colloids,mL,Solution,,
3,220862,100.000000,mL/hour,Continuous IV,Albumin 25%,Albumin 25%,inputevents,Blood Products/Colloids,mL,Solution,,
4,226452,,,Bolus,PO Intake,PO Intake,inputevents,Fluids/Intake,mL,Solution,,
...,...,...,...,...,...,...,...,...,...,...,...,...
8978888,225942,75.000008,mcg/hour,Continuous Med,Fentanyl (Concentrate),Fentanyl (Concentrate),inputevents,Medications,mg,Solution,,
8978889,225943,1.500000,mL/hour,Continuous Med,Solution,Solution,inputevents,Fluids/Intake,mL,Solution,,
8978890,225158,75.282310,mL/hour,Continuous IV,NaCl 0.9%,NaCl 0.9%,inputevents,Fluids/Intake,mL,Solution,,
8978891,226452,,,Bolus,PO Intake,PO Intake,inputevents,Fluids/Intake,mL,Solution,,


In [89]:
cont = inputevents.loc[
    inputevents["ordercategorydescription"].isin(["Continuous Med", "Continuous IV"]), :
].join(d_items, on = "itemid")

In [91]:
cont_label = cont.value_counts(["ordercategorydescription","label"])
cont_label

ordercategorydescription  label                      
Continuous Med            NaCl 0.9%                      779313
                          Dextrose 5%                    565131
                          Solution                       465997
                          Norepinephrine                 336000
                          Propofol                       321426
                                                          ...  
                          NaCl 0.45%                          1
Continuous IV             Enlive (Full)                       1
                          Pulmocare (1/4)                     1
                          Ensure (3/4)                        1
                          Boost Glucose Control (1/4)         1
Name: count, Length: 219, dtype: int64

In [80]:
item_id_to_label(227523)

searching for the label of item 227523


'Magnesium Sulfate (Bolus)'

In [60]:
ItemFinder("norepinephrine").candidate_table

Unnamed: 0,itemid,label,abbreviation,linksto,category,unitname,param_type,count,value_instances
305,221906,Norepinephrine,Norepinephrine,inputevents,Medications,mg,Solution,336000,Solution


In [61]:
ItemFinder("epinephrine").candidate_table

searching for the label of item 221289
searching for the param_type of item 221289
searching for the label of item 229617
searching for the param_type of item 229617


Unnamed: 0,itemid,label,abbreviation,linksto,category,unitname,param_type,count,value_instances
305,221906,Norepinephrine,Norepinephrine,inputevents,Medications,mg,Solution,336000,Solution
280,221289,Epinephrine,Epinephrine,inputevents,Medications,mg,Solution,24470,Solution
3737,229617,Epinephrine.,Epinephrine.,inputevents,Medications,mg,Solution,133,Solution


In [65]:
ItemFinder("nicardipine").candidate_table

searching for the label of item 222042
searching for the param_type of item 222042
searching for the label of item 229624
searching for the param_type of item 229624


Unnamed: 0,itemid,label,abbreviation,linksto,category,unitname,param_type,count,value_instances
310,222042,Nicardipine,Nicardipine,inputevents,Medications,mg,Solution,37307,Solution
3741,229624,Nicardipine 40mg/200,Nicardipine 40mg/200,inputevents,Medications,mg,Solution,10376,Solution


In [75]:
ItemFinder("rocuronium").candidate_table

searching for the label of item 229233
searching for the param_type of item 229233
searching for the label of item 229788
searching for the param_type of item 229788
item label: Rocuronium (Intubation); value instances: Max: nan, Min: nan, Mean: nan


Unnamed: 0,itemid,label,abbreviation,linksto,category,unitname,param_type,count,value_instances
3463,229233,Rocuronium,Rocuronium,inputevents,Medications,mg,Solution,2458.0,Solution
3869,229788,Rocuronium (Intubation),Rocuronium (Intubation),chartevents,Intubation,mg,Numeric,,"Max: nan, Min: nan, Mean: nan"


In [96]:
item_id_to_label(227523)

'Magnesium Sulfate (Bolus)'

# `dialysis` table

## dev

In [38]:
datetimeevents.value_counts("itemid")

itemid
225754    512736
224288    432149
224290    410503
224287    404563
224280    395213
           ...  
227199        21
229886        14
229887         7
229891         3
229890         2
Name: count, Length: 170, dtype: int64

In [39]:
item_id_to_label(224288)

searching for the label of item 224288


'Arterial line Insertion Date'