In [1]:
import pandas as pd
import polars as pl
from timeit import timeit
import os, sys
print(os.getcwd())
os.chdir('/gpfs/data/healthcare-allocate/CLIF-MIMIC/code')
print(os.getcwd())

proj_root = "/gpfs/data/healthcare-allocate/CLIF-MIMIC"
if proj_root not in sys.path:
    sys.path.append(proj_root)

from code.custom_utils import *

/gpfs/data/healthcare-allocate/CLIF-MIMIC
/gpfs/data/healthcare-allocate/CLIF-MIMIC/code


# utils

In [None]:
class ItemFinder():
    def __init__(self, kw: str, df = d_items, col: str = "label", 
                 case: bool = False, clean: bool = True):
        '''
        Look up an item by keyword from the `d_items` table of the `icu` module.
        - df = `d_items`
        - col = {"label", "abbr"}
        '''
        self.kw = kw
        self.df = df
        self.col = "abbreviation" if col == "col" else col
        self.case = case
        self.clean = clean
        
        self.candidate_items = df[
            df[self.col].str.contains(kw, case = case, na = False)
        ]
        self.item_ids = self.candidate_items["itemid"].values
        self.assoc_chartevents = chartevents.loc[
            chartevents["itemid"].isin(self.item_ids),
            :
        ]
        self.item_freq = self.assoc_chartevents.value_counts("itemid")

        self.candidate_items_clean = (
            self.candidate_items
            .loc[:, ["itemid", "label", "abbreviation", "category", "unitname", "param_type"]]
            .join(self.item_freq, on = "itemid", validate = "1:1")
            .sort_values(by = "count", ascending = False)   
        )

    def create_candidate_table(self, clean = clean): # FIXME: when to display the clean version
        return self.candidate_items_clean

In [76]:
def lookup_item(kw: str, df = d_items, col: str = "label", case: bool = False, cleaner: bool = False):

    # original output from d_items
    if not cleaner:
        return out
    # cleaner output to be pasted into spreadsheet
    else:
        cleaner_out = (
            out
            .loc[:, ["itemid", "label", "abbreviation", "category", "unitname", "param_type"]]
            # .reset_index(drop = True)
        )
        full_out = (
            cleaner_out

        )
        return full_out

# `hosp`

In [2]:
d_labitems = pd.read_csv("mimic-iv-2.2/hosp/d_labitems.csv.gz")

In [4]:
# d_labitems.head(10)
contains = d_labitems["label"].str.contains("temp", case=False, na=False)
d_labitems[contains]

Unnamed: 0,itemid,label,fluid,category
23,50825,Temperature,Blood,Blood Gas


In [19]:
d_labitems["fluid"].unique()
d_labitems["category"].unique()

array(['Blood Gas', 'Chemistry', 'Hematology'], dtype=object)

In [12]:
omr = pd.read_csv("../mimic-iv-2.2/hosp/omr.csv.gz")
omr.head(10)

Unnamed: 0,subject_id,chartdate,seq_num,result_name,result_value
0,10000032,2180-04-27,1,Blood Pressure,110/65
1,10000032,2180-04-27,1,Weight (Lbs),94
2,10000032,2180-05-07,1,BMI (kg/m2),18.0
3,10000032,2180-05-07,1,Height (Inches),60
4,10000032,2180-05-07,1,Weight (Lbs),92.15
5,10000032,2180-05-07,2,Weight (Lbs),92.15
6,10000032,2180-05-07,3,Weight (Lbs),92.15
7,10000032,2180-05-07,4,Weight (Lbs),92.15
8,10000032,2180-05-07,5,Weight (Lbs),92.15
9,10000032,2180-05-07,6,Weight (Lbs),92.15


# `icu`

### `d_items`

In [2]:
# load the table
d_items = pd.read_csv("../mimic-iv-2.2/icu/d_items.csv.gz")

In [3]:
lookup_item(d_items, "Temperature", cleaner=True)

Unnamed: 0,itemid,label,abbreviation,category
0,223761,Temperature Fahrenheit,Temperature F,Routine Vital Signs
1,223762,Temperature Celsius,Temperature C,Routine Vital Signs
2,224027,Skin Temperature,Skin Temp,Skin - Assessment
3,224642,Temperature Site,Temp Site,Routine Vital Signs
4,224674,Changes in Temperature,Changes in Temperature,Toxicology
5,226329,Blood Temperature CCO (C),Blood Temp CCO (C),Routine Vital Signs
6,227054,TemperatureF_ApacheIV,TemperatureF_ApacheIV,Scores - APACHE IV (2)
7,228242,Pt. Temperature (BG) (SOFT),Pt. Temperature (BG) (SOFT),Labs
8,229236,Cerebral Temperature (C),Cerebral T (C),Hemodynamics


### `chartevents`

In [4]:
t1 = timeit(
    lambda: pd.read_csv("mimic-iv-2.2/icu/chartevents.csv.gz", nrows=100000), number = 10
)

t2 = timeit(
    lambda: pl.read_csv("mimic-iv-2.2/icu/chartevents.csv.gz", n_rows=100000), number = 10
)

print(t1,t2)

The history saving thread hit an unexpected error (OperationalError('database or disk is full')).History will not be written to the database.
1.3999643280403689 1.177795070107095


In [3]:
# resave into parquet
chartevents = load_gz_csv("mimic-iv-2.2/icu/chartevents.csv.gz")
chartevents.to_parquet('mimic-iv-2.2/icu/chartevents.parquet')  

In [4]:
# eventual, much faster load:
chartevents = pd.read_parquet('../mimic-iv-2.2/icu/chartevents.parquet')

In [43]:
def find_item_label(item_id: int) -> str:
    '''
    Return the "label" string given an item_id string 
    '''
    return d_items.loc[d_items["itemid"] == item_id, "label"].values[0]

itemid_to_meas_site_name = {
    220050: "arterial",
    220179: "not specified"
}

In [44]:
find_item_label(220050)

'Arterial Blood Pressure systolic'

In [32]:
chartevents.loc[
    chartevents["itemid"] == 220050, :]

Unnamed: 0,subject_id,hadm_id,stay_id,caregiver_id,charttime,storetime,itemid,value,valuenum,valueuom,warning
19182,10002013,23581541,39060235,6768.0,2160-05-18 20:00:00,2160-05-18 21:16:00,220050,116,116.0,mmHg,0.0
19327,10002013,23581541,39060235,6768.0,2160-05-18 21:00:00,2160-05-18 21:16:00,220050,97,97.0,mmHg,0.0
19353,10002013,23581541,39060235,6768.0,2160-05-18 22:00:00,2160-05-18 22:08:00,220050,123,123.0,mmHg,0.0
19376,10002013,23581541,39060235,6768.0,2160-05-18 22:10:00,2160-05-18 22:11:00,220050,94,94.0,mmHg,0.0
19385,10002013,23581541,39060235,6768.0,2160-05-18 23:00:00,2160-05-18 23:01:00,220050,116,116.0,mmHg,0.0
...,...,...,...,...,...,...,...,...,...,...,...
313630748,19999442,26785317,32336619,59028.0,2148-11-20 06:00:00,2148-11-20 06:26:00,220050,124,124.0,mmHg,0.0
313630778,19999442,26785317,32336619,59028.0,2148-11-20 20:00:00,2148-11-20 21:35:00,220050,104,104.0,mmHg,0.0
313630874,19999442,26785317,32336619,59028.0,2148-11-20 21:00:00,2148-11-20 21:35:00,220050,102,102.0,mmHg,0.0
313630901,19999442,26785317,32336619,59028.0,2148-11-20 22:00:00,2148-11-20 23:48:00,220050,105,105.0,mmHg,0.0


In [None]:
def map_one_item(item_id):
    '''
    create the rows in a vitals table corresponding to one item from MIMIC, based on item id.
    '''


In [47]:
item_id = 220050
vital_name = find_item_label(item_id)
vital_category = "sbp"
meas_site_name = itemid_to_meas_site_name[item_id]

vitals_sbp = (
    chartevents
    .loc[
        chartevents["itemid"] == item_id,  # :
        ["hadm_id", "charttime", "itemid", "value"]
    ]
    .rename(columns = {
        "hadm_id": "encounter_id", 
        "charttime": "recorded_dttm",
        "value": "vital_value"
        })
    .assign(
        vital_name = vital_name,
        vital_category = vital_category,
        meas_site_name = meas_site_name
    )
    .reindex(
        columns = ["encounter_id", "recorded_dttm", "vital_name", "vital_category", "vital_value", "meas_site_name"]
    )
)


In [48]:
vitals_sbp

Unnamed: 0,encounter_id,recorded_dttm,vital_name,vital_category,vital_value,meas_site_name
19182,23581541,2160-05-18 20:00:00,Arterial Blood Pressure systolic,sbp,116,arterial
19327,23581541,2160-05-18 21:00:00,Arterial Blood Pressure systolic,sbp,97,arterial
19353,23581541,2160-05-18 22:00:00,Arterial Blood Pressure systolic,sbp,123,arterial
19376,23581541,2160-05-18 22:10:00,Arterial Blood Pressure systolic,sbp,94,arterial
19385,23581541,2160-05-18 23:00:00,Arterial Blood Pressure systolic,sbp,116,arterial
...,...,...,...,...,...,...
313630748,26785317,2148-11-20 06:00:00,Arterial Blood Pressure systolic,sbp,124,arterial
313630778,26785317,2148-11-20 20:00:00,Arterial Blood Pressure systolic,sbp,104,arterial
313630874,26785317,2148-11-20 21:00:00,Arterial Blood Pressure systolic,sbp,102,arterial
313630901,26785317,2148-11-20 22:00:00,Arterial Blood Pressure systolic,sbp,105,arterial


### `d_items` counting

In [7]:
chartevents.loc[chartevents["itemid"] == 224642, "value"].unique()

array(['Oral', 'Axillary', 'Esophogeal', 'Tympanic', 'Rectal', 'Blood',
       'Temporal'], dtype=object)

In [15]:
chartevents.loc[
    (chartevents["itemid"].isin([224642,223762])) & (chartevents["stay_id"] == 39553978),
    :
]

Unnamed: 0,subject_id,hadm_id,stay_id,caregiver_id,charttime,storetime,itemid,value,valuenum,valueuom,warning
18,10000032,29079034,39553978,66056.0,2180-07-23 20:00:00,2180-07-23 19:59:00,224642,Oral,,,0.0
160,10000032,29079034,39553978,88981.0,2180-07-23 14:00:00,2180-07-23 14:18:00,224642,Oral,,,0.0
370,10000032,29079034,39553978,88981.0,2180-07-23 17:00:00,2180-07-23 17:03:00,224642,Oral,,,0.0


In [24]:
temp = chartevents.loc[
    (chartevents["itemid"].isin([224642,223762])),
    :
]

temp

Unnamed: 0,subject_id,hadm_id,stay_id,caregiver_id,charttime,storetime,itemid,value,valuenum,valueuom,warning
18,10000032,29079034,39553978,66056.0,2180-07-23 20:00:00,2180-07-23 19:59:00,224642,Oral,,,0.0
160,10000032,29079034,39553978,88981.0,2180-07-23 14:00:00,2180-07-23 14:18:00,224642,Oral,,,0.0
370,10000032,29079034,39553978,88981.0,2180-07-23 17:00:00,2180-07-23 17:03:00,224642,Oral,,,0.0
551,10000980,26913865,39765666,36518.0,2189-06-27 09:07:00,2189-06-27 09:07:00,224642,Axillary,,,0.0
733,10000980,26913865,39765666,36518.0,2189-06-27 12:00:00,2189-06-27 12:53:00,224642,Oral,,,0.0
...,...,...,...,...,...,...,...,...,...,...,...
313644553,19999987,23865745,36195440,86756.0,2145-11-03 04:00:00,2145-11-03 04:24:00,224642,Oral,,,0.0
313644735,19999987,23865745,36195440,90295.0,2145-11-02 23:24:00,2145-11-02 23:24:00,224642,Oral,,,0.0
313644811,19999987,23865745,36195440,90295.0,2145-11-02 23:41:00,2145-11-02 23:41:00,224642,Oral,,,0.0
313644876,19999987,23865745,36195440,91879.0,2145-11-04 20:00:00,2145-11-04 19:38:00,224642,Oral,,,0.0


In [25]:
temp.loc[
    chartevents["itemid"] == 223762,
    :
]

Unnamed: 0,subject_id,hadm_id,stay_id,caregiver_id,charttime,storetime,itemid,value,valuenum,valueuom,warning
8521,10001884,26184834,37510196,36518.0,2131-01-12 07:00:00,2131-01-12 08:07:00,223762,36.7,36.7,°C,0.0
8532,10001884,26184834,37510196,36518.0,2131-01-12 08:00:00,2131-01-12 08:07:00,223762,36.7,36.7,°C,0.0
8740,10001884,26184834,37510196,36518.0,2131-01-12 09:00:00,2131-01-12 09:06:00,223762,37,37.0,°C,0.0
8747,10001884,26184834,37510196,36518.0,2131-01-12 10:00:00,2131-01-12 10:21:00,223762,36.9,36.9,°C,0.0
8793,10001884,26184834,37510196,36518.0,2131-01-12 11:00:00,2131-01-12 11:26:00,223762,36.9,36.9,°C,0.0
...,...,...,...,...,...,...,...,...,...,...,...
313599244,19998843,24842066,30988867,64147.0,2187-02-07 14:00:00,2187-02-07 14:36:00,223762,38.7,38.7,°C,0.0
313599278,19998843,24842066,30988867,64147.0,2187-02-07 15:00:00,2187-02-07 15:12:00,223762,38.6,38.6,°C,0.0
313599292,19998843,24842066,30988867,64147.0,2187-02-07 16:00:00,2187-02-07 16:18:00,223762,38.5,38.5,°C,0.0
313599369,19998843,24842066,30988867,64147.0,2187-02-07 17:00:00,2187-02-07 17:10:00,223762,38.5,38.5,°C,0.0


# `vitals`

## `respiratory_rate`

In [80]:
lookup_item("respiratory rate", cleaner=True)

Unnamed: 0,itemid,label,abbreviation,category,unitname,param_type,count
28,220210,Respiratory Rate,RR,Respiratory,insp/min,Numeric,6393762
800,224689,Respiratory Rate (spontaneous),Respiratory Rate (spontaneous),Respiratory,insp/min,Numeric,592180
801,224690,Respiratory Rate (Total),Respiratory Rate (Total),Respiratory,insp/min,Numeric,550398
799,224688,Respiratory Rate (Set),Respiratory Rate (Set),Respiratory,insp/min,Numeric,342289


In [None]:
chartevents

## `temp_c`

In [75]:
out = lookup_item("temperature", cleaner=True)
out

Unnamed: 0,itemid,label,abbreviation,category,unitname,param_type,count
767,224642,Temperature Site,Temp Site,Routine Vital Signs,,Text,1842387.0
337,223761,Temperature Fahrenheit,Temperature F,Routine Vital Signs,°F,Numeric,1515962.0
505,224027,Skin Temperature,Skin Temp,Skin - Assessment,,Text,952619.0
338,223762,Temperature Celsius,Temperature C,Routine Vital Signs,°C,Numeric,264628.0
1814,226329,Blood Temperature CCO (C),Blood Temp CCO (C),Routine Vital Signs,°C,Numeric,151805.0
3466,229236,Cerebral Temperature (C),Cerebral T (C),Hemodynamics,°C,Numeric,2469.0
790,224674,Changes in Temperature,Changes in Temperature,Toxicology,,Text,1972.0
2097,227054,TemperatureF_ApacheIV,TemperatureF_ApacheIV,Scores - APACHE IV (2),°F,Numeric,7.0
2776,228242,Pt. Temperature (BG) (SOFT),Pt. Temperature (BG) (SOFT),Labs,,Numeric,


In [73]:
out.sort_values(by = "count", ascending = False)

Unnamed: 0,itemid,label,abbreviation,category,unitname,param_type,count
767,224642,Temperature Site,Temp Site,Routine Vital Signs,,Text,1842387.0
337,223761,Temperature Fahrenheit,Temperature F,Routine Vital Signs,°F,Numeric,1515962.0
505,224027,Skin Temperature,Skin Temp,Skin - Assessment,,Text,952619.0
338,223762,Temperature Celsius,Temperature C,Routine Vital Signs,°C,Numeric,264628.0
1814,226329,Blood Temperature CCO (C),Blood Temp CCO (C),Routine Vital Signs,°C,Numeric,151805.0
3466,229236,Cerebral Temperature (C),Cerebral T (C),Hemodynamics,°C,Numeric,2469.0
790,224674,Changes in Temperature,Changes in Temperature,Toxicology,,Text,1972.0
2097,227054,TemperatureF_ApacheIV,TemperatureF_ApacheIV,Scores - APACHE IV (2),°F,Numeric,7.0
2776,228242,Pt. Temperature (BG) (SOFT),Pt. Temperature (BG) (SOFT),Labs,,Numeric,


In [51]:
ids = out["itemid"].values

out2 = chartevents.loc[
    chartevents["itemid"].isin(ids),
    :
]

out2

Unnamed: 0,subject_id,hadm_id,stay_id,caregiver_id,charttime,storetime,itemid,value,valuenum,valueuom,warning
7,10000032,29079034,39553978,47007.0,2180-07-23 22:00:00,2180-07-23 22:15:00,220210,20,20.0,insp/min,0.0
13,10000032,29079034,39553978,66056.0,2180-07-23 19:00:00,2180-07-23 19:59:00,220210,16,16.0,insp/min,0.0
24,10000032,29079034,39553978,66056.0,2180-07-23 20:00:00,2180-07-23 21:01:00,220210,19,19.0,insp/min,0.0
62,10000032,29079034,39553978,66056.0,2180-07-23 21:00:00,2180-07-23 21:01:00,220210,22,22.0,insp/min,0.0
167,10000032,29079034,39553978,88981.0,2180-07-23 14:12:00,2180-07-23 14:17:00,220210,24,24.0,insp/min,0.0
...,...,...,...,...,...,...,...,...,...,...,...
313644828,19999987,23865745,36195440,91429.0,2145-11-03 11:30:00,2145-11-03 12:01:00,224689,0,0.0,insp/min,0.0
313644829,19999987,23865745,36195440,91429.0,2145-11-03 11:30:00,2145-11-03 12:01:00,224690,20,20.0,insp/min,0.0
313644856,19999987,23865745,36195440,91879.0,2145-11-04 19:00:00,2145-11-04 19:30:00,220210,20,20.0,insp/min,0.0
313644879,19999987,23865745,36195440,91879.0,2145-11-04 20:00:00,2145-11-04 20:02:00,220210,23,23.0,insp/min,0.0


In [67]:
out3 = out2.value_counts("itemid")

out.join(out3, on = "itemid", validate = "1:1").sort_values(by=["count"])

ValueError: columns overlap but no suffix specified: Index(['count'], dtype='object')

In [62]:
out3

itemid
220210    6393762
224689     592180
224690     550398
224688     342289
Name: count, dtype: int64