In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta

# load in the csv files as dataframes
BASE_DIR = os.path.expanduser("~/Fluid-Solutions-ML/data/raw/")
items_df = pd.read_csv(os.path.join(BASE_DIR, "d_items.csv"))
chart_df = pd.read_csv(os.path.join(BASE_DIR, "chartevents.csv"))
fluid_input_df = pd.read_csv(os.path.join(BASE_DIR, "inputevents.csv"))
fluid_output_df = pd.read_csv(os.path.join(BASE_DIR, "outputevents.csv"))

- **d_items.csv** -> stores a list of all events/items and their ids. Prolly used as a util table for relational mapping
- **chartevents.csv** -> stores all chart value items. This is where we will get the vitals data from
- **inputevents.csv** -> Stores all things that go into a patient, mainly medications and fluids. We only care about fluids, therefore, we only care about the mL rows
- **outputevents.csv** -> stores all fluid outputs

In [117]:
# convert the time-related columns to datetime objects so I can apply some logic on them later
def convert_timestamps(dataframe, time_columns):
    for col in time_columns:
        if col in dataframe.columns:
            dataframe[col] = pd.to_datetime(dataframe[col], errors='coerce')
    return dataframe

chart_df = convert_timestamps(chart_df, ['charttime', 'storetime'])
fluid_input_df = convert_timestamps(fluid_input_df, ['starttime', 'endtime', 'storetime'])
fluid_output_df = convert_timestamps(fluid_output_df, ['charttime', 'charttime'])

In order to group the data, we first need ot figure out which items (corresponding to the chart event) we want to consider to use for the vitals. The hard part about this is that each of the vitals signs we want all have multiple itemids which could be used. The filtered list of itemids and their label are in item_names.


- cvp -> 220074
- pulse pressure -> sys-dias
- heart rate -> 220045
- spo2 -> 228232 (?)
- systolic bp -> 227243, 220179, 224167
- diastolic bp -> 227242, 220180, 224643
- non invasive mean bp -> 220181
- systolic art pressure -> 225309, 220050
- diastolic art pressure -> 220051, 225310
- mean art pressure (map) -> 220052, 225312

In [None]:
vitals_item_ids = {
    220074: "cvp",
    220045: "heart_rate",
    228232: "spo2",
    220277: "spo2",
    227243: "systolic_bp",
    224167: "manual_blood_pressure_systolic_left",
    224643: "manual_blood_pressure_diastolic_left",
    227242: "diastolic_bp",
    220179: "non_invasive_systolic_bp",
    220180: "non_invasive_diastolic_bp",
    220181: "non_invasive_mean_bp",
    225309: "systolic_arterial_pressure",
    220050: "systolic_arterial_pressure",
    225310: "diastolic_arterial_pressure",
    220051: "diastolic_arterial_pressure",
    220052: "mean_arterial_pressure",
    225312: "mean_arterial_pressure",
    220210: "respiratory_rate"
}

# vitals_item_ids.keys()

### Data Grouping:
1. Loop through the records in input and output events
2. For each record, get the associated timestamp
3. Find the chart events for the patient assocaited with the record 
4. Find the chart events with the closeest timestamp to the record
5. Get the specific vitals/data points we watn from these chart events and store them in a dataframes


**Because we cannot really train the model right now with ppv values (there are not enough consecutive readings to get the min and max pp of a respiratory cycle), we will haev to train the model using pp values for now**

In [119]:
# for now, I'm just going to focus on the fluid input events:

# cleanup the data in the fluid input table
fluid_input_df = fluid_input_df.dropna(subset=['itemid', 'amount', 'starttime', 'endtime'])

# drop all rows when the administered volume is 0 and make sure we are only looking at volumes/fluids
fluid_input_df = fluid_input_df[
    (fluid_input_df['amount'] > 0) &
    (fluid_input_df['amountuom'].astype(str) == "ml")
]

# cleanup the data in the chart evens table 
chart_df = chart_df.dropna(subset=['itemid', 'valueuom', 'charttime', 'valuenum', 'value'])

In [120]:
# Test code to make sure what I am iterating through works
# timestamp = "2132-12-16 06:23:00" # for now use starttime becuase that should be closer to the chartevent time
# patient_stay_id = str(20626031)
# training_label = "low" # if they administered fluid, then the patients blood volume status is low

# # Find the chart events (vitals) associated with the patient
# patient_chart_events = chart_df[
#     (chart_df['hadm_id'].astype(str) == patient_stay_id) &
#     (chart_df['charttime'] <= timestamp) &
#     (chart_df['charttime'] >= "2132-12-16 06:00:00") &
#     (chart_df['itemid'].astype(int).isin(vitals_item_ids))
# ]

# patient_chart_events

In [None]:
training_data_rows = []

# iterate through each record
for row in fluid_input_df.itertuples():
    new_row = {}
    timestamp = row.starttime # for now use starttime becuase that should be closer to the chartevent time
    patient_stay_id = str(row.hadm_id) # this is an id which is given to a patient per each stay in the ICU

    # Find the chart events (vitals) associated with the patient
    patient_chart_events = chart_df[
        (chart_df['hadm_id'].astype(str) == patient_stay_id) &
        (chart_df['charttime'] <= timestamp) &
        (chart_df['charttime'] > timestamp-timedelta(hours=1)) &
        (chart_df['itemid'].astype(int).isin(vitals_item_ids.keys()))
    ]

    # TODO: add each of the data values in the patient_chart_events to a dictionary
    # which will be converted into a dataframe

    if not patient_chart_events.empty:
        for row in patient_chart_events.itertuples():
            new_row[vitals_item_ids[row.itemid]] = row.valuenum

        new_row['label'] = "low"
        new_row['timestamp'] = timestamp.to_datetime64()
        new_row['hadm_id'] = patient_stay_id

        training_data_rows.append(new_row)

    # this is some weird goruping stuff chatGPT gave me, try it out later
    # if not patient_chart_events.empty:
    #     aggregated_features = patient_chart_events.groupby('itemid')['valuenum'].agg(['mean', 'std', 'min', 'max']).unstack()
    # else:
    #     aggregated_features = pd.Series(dtype="float64")

    # Convert to dictionary for easier merging
    # patient_record = {"hadm_id": patient_stay_id, "label": "low"}
    # patient_record.update(aggregated_features.to_dict())


training_data_set = pd.DataFrame(training_data_rows)
training_data_set.to_csv("~/Fluid-Solutions-ML/data/processed/training_data_fluid_in.csv")

# patient_chart_events
# aggregated_features
training_data_set

      itemid
mean  220045     88.0
      220050    130.0
      220051     42.0
      220052     68.0
      220074     10.0
      220210     21.0
      220277     94.0
std   220045      NaN
      220050      NaN
      220051      NaN
      220052      NaN
      220074      NaN
      220210      NaN
      220277      NaN
min   220045     88.0
      220050    130.0
      220051     42.0
      220052     68.0
      220074     10.0
      220210     21.0
      220277     94.0
max   220045     88.0
      220050    130.0
      220051     42.0
      220052     68.0
      220074     10.0
      220210     21.0
      220277     94.0
dtype: float64

In [None]:
training_data = pd.read_csv("~/Fluid-Solutions-ML/data/processed/training_data_fluid_in.csv")

drop_rows = ['diastolic_arterial_pressure', 'cvp', 'systolic_arterial_pressure', 'respiratory_rate',
             'mean_arterial_pressure', 'heart_rate', 'spo2']

drop_columns = ['non_invasive_systolic_bp', 'non_invasive_mean_bp', 'diastolic_bp',
                'systolic_bp', 'manual_blood_pressure_diastolic_left',
                'manual_blood_pressure_systolic_left']

# NOTE: here I am dropping the noninvasive bp and other columns, however, if we desperately need
# more data then we can use these columns to repalce the NAN values in the primary columns.
training_data = training_data.dropna(subset=drop_rows).drop(columns=drop_columns)
training_data.to_csv("~/Fluid-Solutions-ML/data/processed/MIMIC_IV_fluid_in_training_data.csv")

training_data


Unnamed: 0.1,Unnamed: 0,diastolic_arterial_pressure,cvp,systolic_arterial_pressure,respiratory_rate,mean_arterial_pressure,heart_rate,spo2,label,timestamp,hadm_id,non_invasive_diastolic_bp
0,0,42.0,10.0,130.0,21.0,68.0,88.0,94.0,low,2132-12-16 06:23:00,20626031,
1,1,40.0,11.0,111.0,24.0,57.0,79.0,95.0,low,2132-12-15 18:58:00,20626031,
3,3,48.0,10.0,127.0,16.0,71.0,68.0,100.0,low,2132-12-15 16:07:00,20626031,
4,4,52.0,2.0,141.0,15.0,81.0,79.0,100.0,low,2132-12-15 14:34:00,20626031,
5,5,40.0,11.0,111.0,24.0,57.0,79.0,95.0,low,2132-12-15 19:12:00,20626031,
...,...,...,...,...,...,...,...,...,...,...,...,...
10313,10313,57.0,8.0,132.0,20.0,83.0,89.0,100.0,low,2134-08-05 23:00:00,28861356,60.0
10318,10318,49.0,1.0,131.0,30.0,72.0,83.0,100.0,low,2134-08-06 10:42:00,28861356,
10321,10321,49.0,1.0,131.0,30.0,72.0,83.0,100.0,low,2134-08-06 10:44:00,28861356,
10328,10328,40.0,5.0,96.0,37.0,57.0,88.0,100.0,low,2134-08-05 21:31:00,28861356,51.0


In [None]:
# Attempt to do the output stuff here, however, we have to be careful that we are not assuming
# the chartevents (from above) are more closely associated with an output event rather than
# an input event. 

