# init

In [1]:
import numpy as np
import pandas as pd
import polars as pl
from timeit import timeit
import os, sys
print(os.getcwd())
os.chdir('/gpfs/data/healthcare-allocate/CLIF-MIMIC/code')
print(os.getcwd())

proj_root = "/gpfs/data/healthcare-allocate/CLIF-MIMIC"
if proj_root not in sys.path:
    sys.path.append(proj_root)

from code.custom_utils import *

/gpfs/data/healthcare-allocate/CLIF-MIMIC
/gpfs/data/healthcare-allocate/CLIF-MIMIC/code


In [2]:
def load_mimic_table(module, table, file_type):
    if file_type in ["pq", "parquet"]:
        return pd.read_parquet(f'../mimic-iv-2.2/{module}/{table}.parquet')
    elif file_type == "csv":
        return pd.read_csv(f'../mimic-iv-2.2/{module}/{table}.csv.gz')

d_items = load_mimic_table("icu", "d_items", "csv")
chartevents = load_mimic_table("icu", "chartevents", "parquet")
procedureevents = load_mimic_table("icu", "procedureevents", "csv")
datetimeevents = load_mimic_table("icu", "datetimeevents", "csv")
inputevents = load_mimic_table("icu", "inputevents", "csv")
outputevents = load_mimic_table("icu", "outputevents", "csv")

## load mappings

In [3]:
def load_mapping_csv(csv_name: str):
    return pd.read_csv(f"../mapping/mimic-to-clif-mappings - {csv_name}.csv")

In [44]:
vitals_mapping = load_mapping_csv("vitals")
resp_mapping = load_mapping_csv("respiratory_support")
resp_device_mapping = load_mapping_csv("device_category")
resp_mode_mapping = load_mapping_csv("mode_category")

In [56]:
# covert to a dict for df col renaming later
def construct_mapper_dict(mapping_df: pd.DataFrame, key_col: str, value_col: str):
    return dict(zip(mapping_df[key_col], mapping_df[value_col]))

# vitals table
vital_name_mapper_dict = construct_mapper_dict(vitals_mapping, "itemid", "label = vital_name")
vital_category_mapper_dict = construct_mapper_dict(vitals_mapping, "itemid", "vital_category")
vital_site_mapper_dict = construct_mapper_dict(vitals_mapping, "itemid", "meas_site_name")

# resp support table
resp_mapper_dict = construct_mapper_dict(resp_mapping, "itemid", "variable")
resp_device_mapper_dict = construct_mapper_dict(resp_device_mapping, "device_name", "device_category")
resp_mode_mapper_dict = construct_mapper_dict(resp_mode_mapping, "mode_name", "mode_category")


In [57]:
vital_site_mapper_dict

{220179: 'not specified',
 220050: 'arterial',
 220059: 'NO MAPPING',
 225309: 'arterial',
 228152: 'NO MAPPING',
 224167: 'not specified',
 227243: 'not specified',
 229669: 'NO MAPPING',
 220180: 'not specified',
 220051: 'arterial',
 220060: 'NO MAPPING',
 225310: 'arterial',
 228151: 'NO MAPPING',
 224643: 'not specified',
 227242: 'not specified',
 229668: 'NO MAPPING',
 223934: 'NO MAPPING',
 223943: 'NO MAPPING',
 223935: 'NO MAPPING',
 223947: 'NO MAPPING',
 223936: 'NO MAPPING',
 223948: 'NO MAPPING',
 223938: 'NO MAPPING',
 223949: 'NO MAPPING',
 223940: 'NO MAPPING',
 223939: 'NO MAPPING',
 223944: 'NO MAPPING',
 223945: 'NO MAPPING',
 223942: 'NO MAPPING',
 223941: 'NO MAPPING',
 223946: 'NO MAPPING',
 228194: 'NO MAPPING',
 229770: 'NO MAPPING',
 220045: 'not specified',
 220048: 'NO MAPPING',
 220277: 'not specified',
 223770: 'NO MAPPING',
 223769: 'NO MAPPING',
 226253: 'NO MAPPING',
 229862: 'NO MAPPING',
 220210: 'not specified',
 224689: 'MAPPED ELSEWHERE',
 224690: 

# utils

In [71]:
EXCLUDED_LABELS_DEFAULT = ["NO MAPPING", "UNSURE", "MAPPED ELSEWHERE", "SPECIAL CASE"]

# find all the relevant item ids for a table
def get_relevant_item_ids(
    mapping_df: pd.DataFrame, decision_col: str, excluded_labels: list = EXCLUDED_LABELS_DEFAULT):
    '''
    decision_col:
    '''
    return mapping_df.loc[
        ~mapping_df[decision_col].isin(excluded_labels),
        "itemid"
        ].unique()

def rename_and_reorder_cols(df, rename_mapper_dict: dict, new_col_order: list):
    return (
        df.rename(columns = rename_mapper_dict)
        .reindex(columns = new_col_order)
        )

def check_duplicates(df, additional_cols: list):
    '''
    Check whether there are duplicates -- more than one populated value -- for what is supposed to be 
    unique combination of columns. That is, for the same measured variable (e.g. vital_category) at
    the same time during the same encounter, there should be only one corresponding value.
    '''
    cols_to_check = ["encounter_id", "recorded_dttm"].extend(additional_cols)
    return df[df.duplicated(subset = cols_to_check, keep = False)]

In [7]:
def item_id_to_feature_value(item_id: int, col: str = "label", df = d_items):
    '''
    Find the corresponding feature value of an item by id.
    i.e. find the label, or linksto, or item with id 226732.
    '''
    return df.loc[df["itemid"] == item_id, col].values[0]

def item_id_to_label(item_id: int) -> str:
    '''
    Helper function that returns the "label" string of an item given its item_id. 
    '''
    return item_id_to_feature_value(item_id)

def item_id_to_events_df(item_id: int, simplify: bool = False) -> pd.DataFrame:
    '''
    Return in a pandas df all the events associated with an item id.
    - simplify: whether to return the original df (False), or a simplified one 
    with some columns (particulary timestamps) renamed to support integration 
    between different events df.  
    '''
    # find whether it is chartevents, or procedure events, etc.
    linksto_table_name = item_id_to_feature_value(item_id, col = "linksto")
    # turn string into a dj object
    linksto_df: pd.DataFrame = globals()[linksto_table_name]
    events_df = linksto_df.loc[linksto_df["itemid"] == item_id, :]
    # if does not simplify, return the original column
    if not simplify:
        return events_df
    # else, if simplified:
    elif linksto_table_name == "procedureevents": # FIXME: trach is complex and need additional attention
        events_df_simplified = events_df.loc[
            :, ['subject_id', 'hadm_id', 'stay_id', 'endtime', 'itemid', 'value', 'valueuom']
        ].rename(columns = {"endtime": "time"})
        return events_df_simplified
    elif linksto_table_name == "chartevents":
        events_df_simplified = events_df.loc[
            :, ['subject_id', 'hadm_id', 'stay_id', 'charttime', 'itemid', 'value', 'valueuom']
        ].rename(columns = {"charttime": "time"})
        return events_df_simplified
        
    # FIXME: likely an issue if data struct of different events table are different 

def item_ids_list_to_events_df(item_ids: list):
    df_list = [item_id_to_events_df(item_id, simplify = True) for item_id in item_ids]
    df_merged = pd.concat(df_list) #.head().assign(
        ## linksto = lambda df: df["itemid"].apply(lambda item_id: item_id_to_feature_value(item_id, col = "linksto"))
    # )
    return df_merged 
    # FIXME: automatically add the label and linksto table source columns -- create cache?

## `ItemFinder` class

In [8]:
class ItemFinder():
    def __init__(self, kw, df = d_items, col: str = "label", 
                 case: bool = False):
        '''
        Look up an item by keyword from the `d_items` table of the `icu` module.
        - df = `d_items`
        - col = {"label", "abbr"}
        '''
        self.kw = kw 
        self.df = df
        self.col = "abbreviation" if col == "abbr" else col
        self.case = case

        # df of items that match the key words
        self.kw_items_df = df[
            df[self.col].str.contains(kw, case = case, na = False)
        ]
        # list of ids for items that match the key words
        self.kw_items_ids = self.kw_items_df["itemid"].values
        # a string list of non-duplicated events table names, e.g. ["chartevents", "procedureevents"]
        self.linksto_table_names = self.kw_items_df["linksto"].unique()
                                                                
        self.kw_chartevents = chartevents.loc[
            chartevents["itemid"].isin(self.kw_items_ids),
            :
        ]
        self.item_freq = self.generate_item_freq()

        self.candidate_table = self.make_candidate_table()

    def generate_item_freq(self):
        '''
        Iterative over each events table, find the items freq therein, and combine into one df.
        '''
        freq_df_ls = [] # a list of df's
        for table_name in self.linksto_table_names:
            # fetch the object by name, i.e. chartevents, procedureevents, etc.
            events_df = globals()[table_name]
            # a df of events associated with the key word items
            kw_events_df = events_df.loc[
                events_df["itemid"].isin(self.kw_items_ids),
                :
            ]
            # a df of item freq for one event type  
            item_freq_df = kw_events_df.value_counts("itemid")

            # check if the df is empty
            if not item_freq_df.empty:
                freq_df_ls.append(item_freq_df)
        
        # check if the list len is 0, meaning 
        if len(freq_df_ls) != 0:
            # return a df of all the item freq of all events table concat-ed together
            return pd.concat(freq_df_ls)
        else: 
            return self.kw_chartevents.value_counts("itemid")

    def make_candidate_table(self):
        '''
        # TODO
        '''
        if not self.item_freq.empty:
            return (
                self.kw_items_df
                .loc[:, ["itemid", "label", "abbreviation", "linksto", "category", "unitname", "param_type"]]
                # FIXME
                .join(self.item_freq, on = "itemid", validate = "1:1")
                .sort_values(by = "count", ascending = False) 
                .assign(
                    value_instances = lambda x: x["itemid"].apply(ItemFinder.item_id_to_value_instances)
                )
            )
        else: 
            return "No matching result found."
        
    @staticmethod
    def item_id_to_value_instances(item_id: int):
        '''
        Wrapper
        '''
        label = item_id_to_feature_value(item_id, "label")

        param_type = item_id_to_feature_value(item_id, "param_type")
        
        if param_type == "Numeric":
            val_instances = ItemFinder.item_id_to_value_instances_numeric(item_id)
        elif param_type == "Text":
            val_instances = ItemFinder.item_id_to_value_instances_categorical(item_id).to_dict()
        else:
            return param_type
        print(f"item label: {label}; value instances: {str(val_instances)}")
        return str(val_instances)

    @staticmethod
    def item_id_to_value_instances_categorical(item_id: int):
        '''
        Return all the unique categories
        '''
        assoc_chartevents = chartevents.loc[chartevents["itemid"] == item_id, :]
        categories = assoc_chartevents.value_counts("value") # a pd series
        return categories
    
    @staticmethod
    def item_id_to_value_instances_numeric(item_id: int):
        '''
        Find max, min, mean of a continuous, or numeric, item.
        '''
        valuenum_col = chartevents.loc[chartevents["itemid"] == item_id, :]["valuenum"]
        val_max, val_min, val_mean = valuenum_col.max(), valuenum_col.min(), valuenum_col.mean()
        return f"Max: {val_max}, Min: {val_min}, Mean: {val_mean}"

# `vitals` table

## utils

In [75]:
vital_col_names = ["encounter_id", "recorded_dttm", "vital_name", "vital_category", "vital_value", "meas_site_name"]

vitals_temp_site_mapper_dict = {
    'Oral': 'not specified', 'Blood': 'not specified', 'Axillary': 'not specified', 
    'Rectal': 'core', 'Esophogeal': 'core', 'Temporal': 'not specified', 'Tympanic': 'core', 'NA': "not specified"}

vitals_col_rename_mapper_dict = {
    "hadm_id": "encounter_id", 
    "time": "recorded_dttm",
    "value": "vital_value"
    }

def convert_f_to_c(temp_f):
    if isinstance(temp_f, str) or isinstance(temp_f, int):
        temp_f = float(temp_f) 
    
    if isinstance(temp_f, float):
        temp_c = (temp_f - 32) * 5 / 9
        return round(temp_c, 1) # so 39.3333 -> 39.3
    else:
        raise("wrong type")

## regular cases

### new approach

In [50]:
# find vital_items_ids
vitals_items_ids = get_relevant_item_ids(mapping_df = vitals_mapping, decision_col="meas_site_name")
vitals_events = item_ids_list_to_events_df(vitals_items_ids)
vitals_events

Unnamed: 0,subject_id,hadm_id,stay_id,time,itemid,value,valueuom
0,10000032,29079034,39553978,2180-07-23 21:01:00,220179,82,mmHg
4,10000032,29079034,39553978,2180-07-23 22:00:00,220179,85,mmHg
10,10000032,29079034,39553978,2180-07-23 19:00:00,220179,93,mmHg
21,10000032,29079034,39553978,2180-07-23 20:00:00,220179,90,mmHg
163,10000032,29079034,39553978,2180-07-23 14:11:00,220179,84,mmHg
...,...,...,...,...,...,...,...
313446049,19995595,21784060,34670930,2126-10-22 04:00:00,225312,64,mmHg
313446200,19995595,21784060,34670930,2126-10-22 06:00:00,225312,97,mmHg
313452288,19995595,21784060,34670930,2126-10-21 21:00:00,225312,62,mmHg
313452316,19995595,21784060,34670930,2126-10-21 23:00:00,225312,71,mmHg


In [58]:
vitals_events["vital_name"] = vitals_events["itemid"].apply(lambda x: vital_name_mapper_dict[x])
vitals_events["vital_category"] = vitals_events["itemid"].apply(lambda x: vital_category_mapper_dict[x])
vitals_events["meas_site_name"] = vitals_events["itemid"].apply(lambda x: vital_site_mapper_dict[x])
# FIXME: efficiency

In [68]:
vitals_events

Unnamed: 0,subject_id,hadm_id,stay_id,time,itemid,value,valueuom,vital_name,vital_category,meas_site_name
0,10000032,29079034,39553978,2180-07-23 21:01:00,220179,82,mmHg,Non Invasive Blood Pressure systolic,sbp,not specified
4,10000032,29079034,39553978,2180-07-23 22:00:00,220179,85,mmHg,Non Invasive Blood Pressure systolic,sbp,not specified
10,10000032,29079034,39553978,2180-07-23 19:00:00,220179,93,mmHg,Non Invasive Blood Pressure systolic,sbp,not specified
21,10000032,29079034,39553978,2180-07-23 20:00:00,220179,90,mmHg,Non Invasive Blood Pressure systolic,sbp,not specified
163,10000032,29079034,39553978,2180-07-23 14:11:00,220179,84,mmHg,Non Invasive Blood Pressure systolic,sbp,not specified
...,...,...,...,...,...,...,...,...,...,...
313446049,19995595,21784060,34670930,2126-10-22 04:00:00,225312,64,mmHg,ART BP Mean,map,arterial
313446200,19995595,21784060,34670930,2126-10-22 06:00:00,225312,97,mmHg,ART BP Mean,map,arterial
313452288,19995595,21784060,34670930,2126-10-21 21:00:00,225312,62,mmHg,ART BP Mean,map,arterial
313452316,19995595,21784060,34670930,2126-10-21 23:00:00,225312,71,mmHg,ART BP Mean,map,arterial


In [76]:
vitals_final = rename_and_reorder_cols(vitals_events, vitals_col_rename_mapper_dict, vital_col_names)
vitals_final

Unnamed: 0,encounter_id,recorded_dttm,vital_name,vital_category,vital_value,meas_site_name
0,29079034,2180-07-23 21:01:00,Non Invasive Blood Pressure systolic,sbp,82,not specified
4,29079034,2180-07-23 22:00:00,Non Invasive Blood Pressure systolic,sbp,85,not specified
10,29079034,2180-07-23 19:00:00,Non Invasive Blood Pressure systolic,sbp,93,not specified
21,29079034,2180-07-23 20:00:00,Non Invasive Blood Pressure systolic,sbp,90,not specified
163,29079034,2180-07-23 14:11:00,Non Invasive Blood Pressure systolic,sbp,84,not specified
...,...,...,...,...,...,...
313446049,21784060,2126-10-22 04:00:00,ART BP Mean,map,64,arterial
313446200,21784060,2126-10-22 06:00:00,ART BP Mean,map,97,arterial
313452288,21784060,2126-10-21 21:00:00,ART BP Mean,map,62,arterial
313452316,21784060,2126-10-21 23:00:00,ART BP Mean,map,71,arterial


### validation over duplicates

In [77]:
check_duplicates(vitals_final, ["vital_category", "vital_value"] )

Unnamed: 0,encounter_id,recorded_dttm,vital_name,vital_category,vital_value,meas_site_name


### old approach

In [9]:
def map_one_vitals_item(item_id: int) -> pd.DataFrame:
    '''
    create the rows in a vitals table corresponding to one item from MIMIC, based on item id.

    it is a simple procedure that does not even need reshaping.
    '''
    # label => *_name 
    vital_name = item_id_to_feature_value(item_id, "label")
    vital_category = item_id_to_feature_value(item_id, "vital_category", df = vitals_mapping)
    meas_site_name = item_id_to_feature_value(item_id, "meas_site_name", df = vitals_mapping)

    df = (
        item_id_to_events_df(item_id)
        # find all the relevant columns of the events table to be transformed
        .loc[:,["hadm_id", "charttime", "itemid", "value"]]
        .rename(columns = {
            "hadm_id": "encounter_id", 
            "charttime": "recorded_dttm",
            "value": "vital_value"
            })
        .assign(
            vital_name = vital_name,
            vital_category = vital_category,
            meas_site_name = meas_site_name
        )
        # reorder the columns
        .reindex(
            columns = ["encounter_id", "recorded_dttm", "vital_name", "vital_category", "vital_value", "meas_site_name"]
        )    
    )
    return df

class MimicMapper():
    def __init__(self) -> None:
        pass    

In [10]:
for item_id in vital_items_ids:
    # clif df for one mimic item
    vitals_df = map_one_vitals_item(item_id)
    vitals_dfs_list.append(vitals_df)

vitals_dfs = pd.concat(vitals_dfs_list)
# TODO: remove the mimic item temp c or temp f from this df

In [11]:
vitals_dfs

Unnamed: 0,encounter_id,recorded_dttm,vital_name,vital_category,vital_value,meas_site_name
0,29079034,2180-07-23 21:01:00,Non Invasive Blood Pressure systolic,sbp,82,not specified
4,29079034,2180-07-23 22:00:00,Non Invasive Blood Pressure systolic,sbp,85,not specified
10,29079034,2180-07-23 19:00:00,Non Invasive Blood Pressure systolic,sbp,93,not specified
21,29079034,2180-07-23 20:00:00,Non Invasive Blood Pressure systolic,sbp,90,not specified
163,29079034,2180-07-23 14:11:00,Non Invasive Blood Pressure systolic,sbp,84,not specified
...,...,...,...,...,...,...
313446049,21784060,2126-10-22 04:00:00,ART BP Mean,map,64,arterial
313446200,21784060,2126-10-22 06:00:00,ART BP Mean,map,97,arterial
313452288,21784060,2126-10-21 21:00:00,ART BP Mean,map,62,arterial
313452316,21784060,2126-10-21 23:00:00,ART BP Mean,map,71,arterial


## `temp_c` special case

In [41]:
temp_events = item_ids_list_to_events_df([223761, 223762, 224642])
temp_events

Unnamed: 0,subject_id,hadm_id,stay_id,time,itemid,value,valueuom
17,10000032,29079034,39553978,2180-07-23 20:00:00,223761,99.5,°F
162,10000032,29079034,39553978,2180-07-23 14:00:00,223761,98.7,°F
371,10000032,29079034,39553978,2180-07-23 17:00:00,223761,98.7,°F
550,10000980,26913865,39765666,2189-06-27 09:07:00,223761,98,°F
730,10000980,26913865,39765666,2189-06-27 12:00:00,223761,97.7,°F
...,...,...,...,...,...,...,...
313644553,19999987,23865745,36195440,2145-11-03 04:00:00,224642,Oral,
313644735,19999987,23865745,36195440,2145-11-02 23:24:00,224642,Oral,
313644811,19999987,23865745,36195440,2145-11-02 23:41:00,224642,Oral,
313644876,19999987,23865745,36195440,2145-11-04 20:00:00,224642,Oral,


In [12]:
# pivot directly
temp_wider = temp_events.pivot(
    index = ["hadm_id", "charttime"], 
    columns = "itemid",
    values = "value"
    ).reset_index()
temp_wider

itemid,hadm_id,charttime,223761,223762,224642
0,20000094,2150-03-02 15:30:00,97.5,,Oral
1,20000094,2150-03-02 17:00:00,97.5,,Oral
2,20000094,2150-03-02 20:00:00,96.3,,Axillary
3,20000094,2150-03-03 04:00:00,99,,Rectal
4,20000147,2121-08-30 22:11:00,97.6,,Oral
...,...,...,...,...,...
1920380,29999828,2180-10-29 08:00:00,99.1,,Oral
1920381,29999828,2180-10-29 09:00:00,98.2,,Oral
1920382,29999828,2180-10-29 12:00:00,99.5,,Oral
1920383,29999828,2180-10-29 14:00:00,99,,Oral


In [16]:
# map temp_site to the clif categories of meas_site_name
temp_wider["meas_site_name"] = temp_wider[224642].apply(
    lambda x: vitals_temp_site_mapper_dict[x] if pd.notna(x) else "not specified"
)

# 223761 = temp in f, 223762 = temp in c
temp_wider["vital_value"] = temp_wider[223762].fillna(
    temp_wider[223761].apply(convert_f_to_c)
    )

temp_wider['vital_name'] = temp_wider.apply(
    lambda row: "Temperature Celsius" if pd.notna(row[223762]) else "Temperature Fahrenheit", 
    axis = "columns"
    )

temp_wider["vital_category"] = "temp_c"

In [64]:
temp_final = rename_and_reorder_cols(temp_wider, vitals_col_rename_mapper_dict, vital_col_names)
temp_final

itemid,encounter_id,recorded_dttm,vital_name,vital_category,vital_value,meas_site_name
0,20000094,2150-03-02 15:30:00,Temperature Fahrenheit,temp_c,36.4,not specified
1,20000094,2150-03-02 17:00:00,Temperature Fahrenheit,temp_c,36.4,not specified
2,20000094,2150-03-02 20:00:00,Temperature Fahrenheit,temp_c,35.7,not specified
3,20000094,2150-03-03 04:00:00,Temperature Fahrenheit,temp_c,37.2,core
4,20000147,2121-08-30 22:11:00,Temperature Fahrenheit,temp_c,36.4,not specified
...,...,...,...,...,...,...
1920380,29999828,2180-10-29 08:00:00,Temperature Fahrenheit,temp_c,37.3,not specified
1920381,29999828,2180-10-29 09:00:00,Temperature Fahrenheit,temp_c,36.8,not specified
1920382,29999828,2180-10-29 12:00:00,Temperature Fahrenheit,temp_c,37.5,not specified
1920383,29999828,2180-10-29 14:00:00,Temperature Fahrenheit,temp_c,37.2,not specified


In [39]:
temp_final = (
    temp_wider
    .rename(columns=vitals_col_rename_mapper_dict)
    .reindex(columns = vital_col_names)
)

temp_final

itemid,encounter_id,recorded_dttm,vital_name,vital_category,vital_value,meas_site_name
0,20000094,2150-03-02 15:30:00,Temperature Fahrenheit,temp_c,36.4,not specified
1,20000094,2150-03-02 17:00:00,Temperature Fahrenheit,temp_c,36.4,not specified
2,20000094,2150-03-02 20:00:00,Temperature Fahrenheit,temp_c,35.7,not specified
3,20000094,2150-03-03 04:00:00,Temperature Fahrenheit,temp_c,37.2,core
4,20000147,2121-08-30 22:11:00,Temperature Fahrenheit,temp_c,36.4,not specified
...,...,...,...,...,...,...
1920380,29999828,2180-10-29 08:00:00,Temperature Fahrenheit,temp_c,37.3,not specified
1920381,29999828,2180-10-29 09:00:00,Temperature Fahrenheit,temp_c,36.8,not specified
1920382,29999828,2180-10-29 12:00:00,Temperature Fahrenheit,temp_c,37.5,not specified
1920383,29999828,2180-10-29 14:00:00,Temperature Fahrenheit,temp_c,37.2,not specified


### validation over duplicates

In [72]:
check_duplicates(temp_final, ["vital_category",	"vital_value"] )

itemid,encounter_id,recorded_dttm,vital_name,vital_category,vital_value,meas_site_name


## merge and save

In [None]:
# COMBINE:
# remove temp_c from the previous version
vitals_df_clean = vitals_dfs.query('vital_category != "temp_c"')

vitals_df_final = pd.concat([
    vitals_df_clean, temp_final
])

In [126]:
# save
vitals_df_final.to_parquet('../rclif/rclif_vitals.parquet')

In [77]:
# load
rclif_vitals = pd.read_parquet('../rclif/rclif_vitals.parquet')

# `respiratory_support` table

## utils

In [13]:
resp_item_ids = get_relevant_item_ids(mapping_df = resp_mapping, decision_col = "variable")
resp_item_ids

array([220210, 226732, 223835, 220339, 224685, 224687, 224695, 223834,
       223848, 224690, 223849, 224688, 224684, 224738, 224686, 224701,
       224696, 229314, 224691, 224700, 225448, 226237, 227287, 227577,
       227579, 227580, 227582, 224702, 227581])

In [93]:
resp_events: pd.DataFrame = item_ids_list_to_events_df(resp_item_ids)
resp_events.head()

Unnamed: 0,subject_id,hadm_id,stay_id,time,itemid,value,valueuom
7,10000032,29079034,39553978,2180-07-23 22:00:00,220210,20,insp/min
13,10000032,29079034,39553978,2180-07-23 19:00:00,220210,16,insp/min
24,10000032,29079034,39553978,2180-07-23 20:00:00,220210,19,insp/min
62,10000032,29079034,39553978,2180-07-23 21:00:00,220210,22,insp/min
167,10000032,29079034,39553978,2180-07-23 14:12:00,220210,24,insp/min


In [13]:
resp_stay_ids = resp_events.value_counts("stay_id")
resp_stay_ids

stay_id
36237605    22331
30763434    16455
32863488    13035
31879957    11395
32380519    10383
            ...  
39221076        1
31630098        1
35741569        1
38430956        1
32690681        1
Name: count, Length: 73148, dtype: int64

In [14]:
resp_columns = [
    "encounter_id", "recorded_dttm", "device_name", "device_category", 
    "mode_name", "mode_category", "tracheostomy", "fio2_set", "lpm_set",
    "tidal_volume_set", "resp_rate_set", "pressure_control_set", "pressure_support_set",
    "flow_rate_set", "peak_inspiratory_pressure_set", "inspiratory_time_set",
    "peep_set", "tidal_volume_obs", "resp_rate_obs", "plateau_pressure_obs",
    "peak_inspiratory_pressure_obs", "peep_obs", "minute_vent_obs"
    ]

In [100]:
resp_device_rank = ["Vent", "NIPPV", "CPAP", "High Flow NC", "Face Mask", "Trach Collar", "Nasal Cannula", "Room Air", "Other"]
resp_device_rank.index("Vent")

# checking duplicates given the duplicate index error
def find_resp_duplicates(stay_events: pd.DataFrame) -> pd.DataFrame:
    '''
    Find duplicates -- multiple instances of the same item at the same time -- that prohibit pivoting.
    '''
    duplicates = stay_events[stay_events.duplicated(subset=["hadm_id", "time", "itemid"], keep=False)]
    # if there is no duplicates
    if duplicates.empty:
        return pd.DataFrame()
    else:
        return duplicates.assign(
            category = duplicates["value"].apply(lambda x: resp_device_mapper_dict[x.strip()]),
            rank = lambda df: df["category"].apply(lambda x: resp_device_rank.index(x.strip()))
            )

In [123]:
def map_one_stay(stay_id, table_events: pd.DataFrame):
    '''
    find all the relevant items within one stay, and match by charttime.
    
    - table_events: a df of all the events corresponding to that table.
    '''      
    stay_events: pd.DataFrame = table_events.loc[
        table_events["stay_id"] == stay_id, :
    ]
    
    #return stay_events
    # use helper func to check for duplicates
    duplicates = find_resp_duplicates(stay_events)
    # print(duplicates)
    # if there are duplicates, first remove them from the same `stay_events` df
    if not duplicates.empty:
        top_ranked_indices = duplicates.groupby(["hadm_id", "time", "itemid"])["rank"].idxmin()
        # non top-ranked categories to be dropped
        non_top_ranked_indices = duplicates.index.difference(top_ranked_indices)
        stay_events.drop(non_top_ranked_indices, inplace = True)
    
    stay_events = stay_events.assign(
        label = lambda df: df["itemid"].apply(item_id_to_label),
        variable = lambda df: df["itemid"].apply(lambda x: resp_mapper_dict[x])
    )

    return stay_events # return this for now, TODO: incorporate more after the code is tested

    wider = stay_events.pivot(
        index = ["hadm_id", "time"], 
        columns = ["itemid"],
        values = "value" # keep this just a str, not a list, to avoid having double col index
    ).reset_index()
    
    return wider # return this for now, TODO: incorporate more when the code is tested

## EDA

In [76]:
# for building the map_one_stay function
eg_stay_id1 = 39765666  
eg_stay_id2 = 32863488 
eg_stay_id3 = 30763434 # no 2, for checking duplicates groupby
eg_stay_id4 = 36237605 # no 1 
eg_stay_id = 37750813

eg_hadm_id = 39765666

In [124]:
# eg_stay2 = map_one_stay(eg_stay_id2, resp_events)
eg_stay3 = map_one_stay(eg_stay_id3, resp_events)
# eg_stay4 = map_one_stay(eg_stay_id4, resp_events)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stay_events.drop(non_top_ranked_indices, inplace = True)


In [125]:
eg_stay3

Unnamed: 0,subject_id,hadm_id,stay_id,time,itemid,value,valueuom,label,variable
209228581,16668648,23874681,30763434,2149-10-26 07:00:00,220210,19,insp/min,Respiratory Rate,resp_rate_obs
209228665,16668648,23874681,30763434,2149-10-26 09:00:00,220210,17,insp/min,Respiratory Rate,resp_rate_obs
209228674,16668648,23874681,30763434,2149-10-26 10:00:00,220210,21,insp/min,Respiratory Rate,resp_rate_obs
209228688,16668648,23874681,30763434,2149-10-26 11:00:00,220210,17,insp/min,Respiratory Rate,resp_rate_obs
209228712,16668648,23874681,30763434,2149-10-26 12:00:00,220210,16,insp/min,Respiratory Rate,resp_rate_obs
...,...,...,...,...,...,...,...,...,...
209295505,16668648,23874681,30763434,2149-10-16 04:00:00,224700,11,cmH2O,Total PEEP Level,peep_obs
209227816,16668648,23874681,30763434,2149-10-10 09:07:00,224702,10,cmH2O,PCV Level,pressure_control_set
209228267,16668648,23874681,30763434,2149-10-13 13:11:00,224702,15,cmH2O,PCV Level,pressure_control_set
209289903,16668648,23874681,30763434,2149-10-06 03:55:00,224702,10,cmH2O,PCV Level,pressure_control_set


In [126]:
eg_wider = eg_stay3.pivot(
        index = ["hadm_id", "time"], 
        columns = ["variable", "label"],
        values = "value" # keep this just a str, not a list, to avoid having double col index
    ).reset_index()
# RESUME HERE TODO: complete all the columns first -- line up all the columns
# might want to simply find the overlapping cols?
# then map mode and device columns
# TODo: merge two columns where one is NA
eg_wider

variable,hadm_id,time,resp_rate_obs,device_name,fio2_set,peep_set,tidal_volume_obs,minute_vent_obs,peak_inspiratory_pressure_obs,lpm_set,...,resp_rate_set,tidal_volume_set,inspiratory_time_set,tidal_volume_obs,pressure_support_set,plateau_pressure_obs,mode_name,flow_rate_set,peep_obs,pressure_control_set
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Respiratory Rate,O2 Delivery Device(s),Inspired O2 Fraction,PEEP set,Tidal Volume (observed),Minute Volume,Peak Insp. Pressure,O2 Flow,...,Respiratory Rate (Set),Tidal Volume (set),Inspiratory Time,Tidal Volume (spontaneous),PSV Level,Plateau Pressure,Ventilator Mode (Hamilton),Flow Rate (L/min),Total PEEP Level,PCV Level
0,23874681,2149-09-25 21:15:00,12,,,,,,,,...,,,,,,,,,,
1,23874681,2149-09-25 22:00:00,20,,,,,,,,...,,,,,,,,,,
2,23874681,2149-09-25 23:00:00,19,,,,,,,,...,,,,,,,,,,
3,23874681,2149-09-26 00:00:00,20,,,,,,,,...,,,,,,,,,,
4,23874681,2149-09-26 01:00:00,20,,,,,,,,...,,,,,,,,,,


In [141]:
eg_wider.columns.value_counts()

variable                       label                     
device_name                    O2 Delivery Device(s)         1
                               Ventilator Type               1
fio2_set                       Inspired O2 Fraction          1
flow_rate_set                  Flow Rate (L/min)             1
hadm_id                                                      1
inspiratory_time_set           Inspiratory Time              1
lpm_set                        O2 Flow                       1
minute_vent_obs                Minute Volume                 1
mode_name                      Ventilator Mode               1
                               Ventilator Mode (Hamilton)    1
peak_inspiratory_pressure_obs  Peak Insp. Pressure           1
peep_obs                       Total PEEP Level              1
peep_set                       PEEP set                      1
plateau_pressure_obs           Plateau Pressure              1
pressure_control_set           PCV Level                    

In [140]:
multiindex_levels = list(eg_wider.columns)

# Sort the columns based on the first level of the MultiIndex
sorted_multiindex = sorted(multiindex_levels, key=lambda x: (x[0], x[1]))

# Reorder the DataFrame columns based on the sorted order
eg_wider_reordered = eg_wider[sorted_multiindex]
eg_wider_reordered3

variable,device_name,device_name,fio2_set,flow_rate_set,hadm_id,inspiratory_time_set,lpm_set,minute_vent_obs,mode_name,mode_name,...,plateau_pressure_obs,pressure_control_set,pressure_support_set,resp_rate_obs,resp_rate_obs,resp_rate_set,tidal_volume_obs,tidal_volume_obs,tidal_volume_set,time
label,O2 Delivery Device(s),Ventilator Type,Inspired O2 Fraction,Flow Rate (L/min),Unnamed: 5_level_1,Inspiratory Time,O2 Flow,Minute Volume,Ventilator Mode,Ventilator Mode (Hamilton),...,Plateau Pressure,PCV Level,PSV Level,Respiratory Rate,Respiratory Rate (Total),Respiratory Rate (Set),Tidal Volume (observed),Tidal Volume (spontaneous),Tidal Volume (set),Unnamed: 21_level_1
0,,,,,23874681,,,,,,...,,,,12,,,,,,2149-09-25 21:15:00
1,,,,,23874681,,,,,,...,,,,20,,,,,,2149-09-25 22:00:00
2,,,,,23874681,,,,,,...,,,,19,,,,,,2149-09-25 23:00:00
3,,,,,23874681,,,,,,...,,,,20,,,,,,2149-09-26 00:00:00
4,,,,,23874681,,,,,,...,,,,20,,,,,,2149-09-26 01:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3966,Trach mask,,50,,23874681,,10,,,,...,,,,17,,,,,,2149-10-31 12:00:00
3967,,,,,23874681,,,,,,...,,,,13,,,,,,2149-10-31 13:00:00
3968,,,,,23874681,,,,,,...,,,,28,,,,,,2149-10-31 14:00:00
3969,Tracheostomy tube,Hamilton,40,58.5,23874681,0.9,,11.6,,APV (cmv),...,16,,,14,25,20,399,,500,2149-10-31 15:00:00


In [135]:
type(sorted_multiindex)

list

In [139]:
dups_of_wider = eg_wider_reordered.columns[eg_wider_reordered.columns.duplicated()]
dups_of_wider
eg_wider_reordered.loc[:,eg_wider_reordered.columns.duplicated()]

MultiIndex([], names=['variable', 'label'])

In [None]:
ind = eg_wider_renamed["device_name"].dropna(subset=["device_name", "device_name"]).index
device_dups = eg_wider_renamed.loc[ind,:]
device_dups

itemid,hadm_id,time,resp_rate_obs,peep_set,lpm_set,fio2_set,device_name,mode_name,tidal_volume_set,tidal_volume_obs,...,resp_rate_obs.1,flow_rate_set,peak_inspiratory_pressure_obs,plateau_pressure_obs,peep_obs,pressure_support_set,pressure_control_set,inspiratory_time_set,device_name.1,mode_name.1
54,23874681,2149-09-27 15:00:00,18,5,,40,Hamilton,,430,386,...,18,55.4,12,11,5,,,0.9,Endotracheal tube,APV (cmv)
60,23874681,2149-09-27 20:00:00,20,5,,30,Hamilton,,430,425,...,20,46.3,15,12,5,,,0.9,Endotracheal tube,APV (cmv)
81,23874681,2149-09-28 04:00:00,20,5,,30,Hamilton,,430,413,...,20,43.1,14,12,5,,,0.9,Endotracheal tube,APV (cmv)
104,23874681,2149-09-28 08:30:00,19,5,,30,Hamilton,,430,553,...,20,39.2,13,10,7,,,0.9,Endotracheal tube,APV (cmv)
114,23874681,2149-09-28 12:00:00,14,5,,30,Hamilton,,,741,...,15,65.2,14,,,8,,,Endotracheal tube,SPONT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3952,23874681,2149-10-30 23:00:00,20,8,,40,Hamilton,,500,504,...,19,50.6,18,16,,,,0.9,Endotracheal tube,APV (cmv)
3956,23874681,2149-10-31 03:00:00,16,,,40,Hamilton,,500,543,...,20,49.4,,,,,,,Endotracheal tube,APV (cmv)
3961,23874681,2149-10-31 07:00:00,,8,,40,Hamilton,,500,463,...,21,49.7,17,15,8,,,0.9,Tracheostomy tube,APV (cmv)
3964,23874681,2149-10-31 10:00:00,20,8,,40,Hamilton,,,678,...,15,55,18,,,8,,,Tracheostomy tube,SPONT


In [None]:
# PREVIOUS: to find the common cols of different events tables
chart_events_eg = item_id_to_events_df(227287)
proc_events_eg = item_id_to_events_df(226237)

In [None]:
print(chart_events_eg.columns,"\n",proc_events_eg.columns)
chart_events_eg.columns.intersection(proc_events_eg.columns)

Index(['subject_id', 'hadm_id', 'stay_id', 'caregiver_id', 'charttime',
      dtype='object') 
 Index(['subject_id', 'hadm_id', 'stay_id', 'caregiver_id', 'starttime',
       'endtime', 'storetime', 'itemid', 'value', 'valueuom', 'location',
       'locationcategory', 'orderid', 'linkorderid', 'ordercategoryname',
       'ordercategorydescription', 'patientweight', 'isopenbag',
       'continueinnextdept', 'statusdescription', 'originalamount',
       'originalrate'],
      dtype='object')


Index(['subject_id', 'hadm_id', 'stay_id', 'caregiver_id', 'storetime',
       'itemid', 'value', 'valueuom'],
      dtype='object')

In [None]:
table_events_eg = item_ids_list_to_events_df([220210])
table_events_eg.head()

Unnamed: 0,subject_id,hadm_id,stay_id,time,itemid,value,valueuom
7,10000032,29079034,39553978,2180-07-23 22:00:00,220210,20,insp/min
13,10000032,29079034,39553978,2180-07-23 19:00:00,220210,16,insp/min
24,10000032,29079034,39553978,2180-07-23 20:00:00,220210,19,insp/min
62,10000032,29079034,39553978,2180-07-23 21:00:00,220210,22,insp/min
167,10000032,29079034,39553978,2180-07-23 14:12:00,220210,24,insp/min


In [None]:
# all the events for item: O2 Delivery Device(s)
item_id_to_events_df(226732)

Unnamed: 0,subject_id,hadm_id,stay_id,caregiver_id,charttime,storetime,itemid,value,valuenum,valueuom,warning
38,10000032,29079034,39553978,66056.0,2180-07-23 20:00:00,2180-07-23 21:13:00,226732,Nasal cannula,,,0.0
192,10000032,29079034,39553978,88981.0,2180-07-23 14:20:00,2180-07-23 14:20:00,226732,Nasal cannula,,,0.0
412,10000032,29079034,39553978,88981.0,2180-07-23 18:00:00,2180-07-23 18:24:00,226732,Nasal cannula,,,0.0
515,10000980,26913865,39765666,26402.0,2189-06-27 10:00:00,2189-06-27 10:01:00,226732,Nasal cannula,,,1.0
1039,10001217,24597018,37067082,7355.0,2157-11-21 20:00:00,2157-11-21 21:29:00,226732,Nasal cannula,,,0.0
...,...,...,...,...,...,...,...,...,...,...,...
313644011,19999987,23865745,36195440,74793.0,2145-11-04 09:00:00,2145-11-04 08:59:00,226732,Face tent,,,1.0
313644012,19999987,23865745,36195440,74793.0,2145-11-04 09:00:00,2145-11-04 08:59:00,226732,High flow neb,,,1.0
313644048,19999987,23865745,36195440,74793.0,2145-11-04 10:00:00,2145-11-04 13:57:00,226732,Nasal cannula,,,0.0
313644814,19999987,23865745,36195440,91429.0,2145-11-03 11:30:00,2145-11-03 11:59:00,226732,Endotracheal tube,,,0.0


In [None]:
resp_events = (
    chartevents.loc[chartevents["stay_id"] == 39765666, :]
    # keep only items that are in the resp category
    .loc[lambda x: x["itemid"].isin(resp_item_ids)]
    # add a column of item label
    .assign(item_label = lambda x: x["itemid"].apply(item_id_to_feature_value))
    .reindex(
        columns = ["stay_id", "charttime", "itemid", "item_label", "value", "valuenum", "valueuom", "warning"]
    )
)

In [None]:
resp_events.loc[
    resp_events["charttime"] == "2189-06-27 09:00:00",:
]

Unnamed: 0,stay_id,charttime,itemid,item_label,value,valuenum,valueuom,warning
489,39765666,2189-06-27 09:00:00,227287,O2 Flow (additional cannula),40,40.0,L/min,0.0
490,39765666,2189-06-27 09:00:00,220339,PEEP set,5,5.0,cmH2O,0.0
491,39765666,2189-06-27 09:00:00,223848,Ventilator Type,Drager,1.0,,0.0
492,39765666,2189-06-27 09:00:00,224685,Tidal Volume (observed),393,393.0,mL,0.0
493,39765666,2189-06-27 09:00:00,224686,Tidal Volume (spontaneous),528,528.0,mL,0.0
494,39765666,2189-06-27 09:00:00,224687,Minute Volume,11.2,11.2,L/min,0.0
496,39765666,2189-06-27 09:00:00,224690,Respiratory Rate (Total),27,27.0,insp/min,0.0
498,39765666,2189-06-27 09:00:00,224695,Peak Insp. Pressure,11,11.0,cmH2O,0.0
500,39765666,2189-06-27 09:00:00,224701,PSV Level,5,5.0,cmH2O,0.0
502,39765666,2189-06-27 09:00:00,224738,Inspiratory Time,1,1.0,sec,0.0


## MAPPING