In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta

# load in the csv files as dataframes
BASE_DIR = os.path.expanduser("~/Fluid-Solutions-ML/data/raw/")
items_df = pd.read_csv(os.path.join(BASE_DIR, "d_items.csv"))
chart_df = pd.read_csv(os.path.join(BASE_DIR, "chartevents.csv"))
fluid_input_df = pd.read_csv(os.path.join(BASE_DIR, "inputevents.csv"))
fluid_output_df = pd.read_csv(os.path.join(BASE_DIR, "outputevents.csv"))

- **d_items.csv** -> stores a list of all events/items and their ids. Prolly used as a util table for relational mapping
- **chartevents.csv** -> stores all chart value items. This is where we will get the vitals data from
- **inputevents.csv** -> Stores all things that go into a patient, mainly medications and fluids. We only care about fluids, therefore, we only care about the mL rows
- **outputevents.csv** -> stores all fluid outputs

In [2]:
# convert the time-related columns to datetime objects so I can apply some logic on them later
def convert_timestamps(dataframe, time_columns):
    for col in time_columns:
        if col in dataframe.columns:
            dataframe[col] = pd.to_datetime(dataframe[col], errors='coerce')
    return dataframe

chart_df = convert_timestamps(chart_df, ['charttime', 'storetime'])
fluid_input_df = convert_timestamps(fluid_input_df, ['starttime', 'endtime', 'storetime'])
fluid_output_df = convert_timestamps(fluid_output_df, ['charttime', 'charttime'])

In order to group the data, we first need ot figure out which items (corresponding to the chart event) we want to consider to use for the vitals. The hard part about this is that each of the vitals signs we want all have multiple itemids which could be used. The filtered list of itemids and their label are in item_names.


- cvp -> 220074
- pulse pressure -> sys-dias
- heart rate -> 220045
- spo2 -> 228232 (?)
- systolic bp -> 227243, 220179, 224167
- diastolic bp -> 227242, 220180, 224643
- non invasive mean bp -> 220181
- systolic art pressure -> 225309, 220050
- diastolic art pressure -> 220051, 225310
- mean art pressure (map) -> 220052, 225312

In [3]:
vitals_item_ids = {
    220074: "cvp",
    220045: "heart_rate",
    228232: "spo2",
    220277: "spo2",
    227243: "systolic_bp",
    224167: "manual_blood_pressure_systolic_left",
    224643: "manual_blood_pressure_diastolic_left",
    227242: "diastolic_bp",
    220179: "non_invasive_systolic_bp",
    220180: "non_invasive_diastolic_bp",
    220181: "non_invasive_mean_bp",
    225309: "systolic_arterial_pressure",
    220050: "systolic_arterial_pressure",
    225310: "diastolic_arterial_pressure",
    220051: "diastolic_arterial_pressure",
    220052: "mean_arterial_pressure",
    225312: "mean_arterial_pressure",
    220210: "respiratory_rate"
}

# vitals_item_ids.keys()

### Data Grouping:
1. Loop through the records in input and output events
2. For each record, get the associated timestamp
3. Find the chart events for the patient assocaited with the record 
4. Find the chart events with the closeest timestamp to the record
5. Get the specific vitals/data points we watn from these chart events and store them in a dataframes


**Because we cannot really train the model right now with ppv values (there are not enough consecutive readings to get the min and max pp of a respiratory cycle), we will haev to train the model using pp values for now**

In [4]:
# for now, I'm just going to focus on the fluid input events:

# cleanup the data in the fluid input table
fluid_input_df = fluid_input_df.dropna(subset=['itemid', 'amount', 'starttime', 'endtime'])

# drop all rows when the administered volume is 0 and make sure we are only looking at volumes/fluids
fluid_input_df = fluid_input_df[
    (fluid_input_df['amount'] > 0) &
    (fluid_input_df['amountuom'].astype(str) == "ml")
]

# cleanup the data in the chart evens table 
chart_df = chart_df.dropna(subset=['itemid', 'valueuom', 'charttime', 'valuenum', 'value'])

fluid_output_df = fluid_output_df[
    (fluid_output_df['value'] > 0) &
    (fluid_output_df['valueuom'].astype(str) == "ml")
]

In [51]:
# Test code to make sure what I am iterating through works
timestamp = "2113-08-05 12:48:00" # for now use starttime becuase that should be closer to the chartevent time
patient_stay_id = str(21606243)
training_label = "low" # if they administered fluid, then the patients blood volume status is low

# Find the chart events (vitals) associated with the patient
patient_chart_events = chart_df[
    (chart_df['hadm_id'].astype(str) == patient_stay_id) & 
    (chart_df['charttime'] <= timestamp) &
    (chart_df['charttime'] >= "2113-08-05 11:48:00") &
    (chart_df['itemid'].astype(int).isin(vitals_item_ids))
]

patient_chart_events

Unnamed: 0,subject_id,hadm_id,stay_id,caregiver_id,charttime,storetime,itemid,value,valuenum,valueuom,warning
97952,10031404,21606243,35544374,96611.0,2113-08-05 12:00:00,2113-08-05 12:48:00,220277,96,96.0,%,0.0
97956,10031404,21606243,35544374,78093.0,2113-08-05 12:00:00,2113-08-05 12:43:00,220045,116,116.0,bpm,0.0
97957,10031404,21606243,35544374,78093.0,2113-08-05 12:00:00,2113-08-05 12:43:00,220210,22,22.0,insp/min,0.0
98502,10031404,21606243,35544374,96611.0,2113-08-05 12:47:00,2113-08-05 12:51:00,220179,135,135.0,mmHg,0.0
98503,10031404,21606243,35544374,96611.0,2113-08-05 12:47:00,2113-08-05 12:51:00,220180,62,62.0,mmHg,0.0
98504,10031404,21606243,35544374,96611.0,2113-08-05 12:47:00,2113-08-05 12:51:00,220181,81,81.0,mmHg,0.0


In [None]:
training_data_rows = []

# iterate through each record
for row in fluid_input_df.itertuples():
    new_row = {}
    timestamp = row.starttime # for now use starttime becuase that should be closer to the chartevent time
    patient_stay_id = str(row.hadm_id) # this is an id which is given to a patient per each stay in the ICU

    # Find the chart events (vitals) associated with the patient
    patient_chart_events = chart_df[
        (chart_df['hadm_id'].astype(str) == patient_stay_id) &
        (chart_df['charttime'] <= timestamp) &
        (chart_df['charttime'] > timestamp-timedelta(hours=1)) &
        (chart_df['itemid'].astype(int).isin(vitals_item_ids.keys()))
    ]

    if not patient_chart_events.empty:
        for time in patient_chart_events['charttime'].unique():
            datapoint = {}
            rows_assocaited_with_timestamp = patient_chart_events[patient_chart_events['charttime'] == time]
            if len(rows_assocaited_with_timestamp) >= 7:
                for row in rows_assocaited_with_timestamp.itertuples():
                    datapoint[vitals_item_ids[row.itemid]] = row.valuenum
                datapoint['timestamp'] = time
                datapoint['hadm_id'] = patient_stay_id
                datapoint['label'] = 'low'
                training_data_rows.append(datapoint)
        
        # for datapoint in patient_chart_events.itertuples():
        #     new_row[vitals_item_ids[datapoint.itemid]] = datapoint.valuenum

        # new_row['label'] = "low"
        # new_row['timestamp'] = timestamp.to_datetime64()
        # new_row['hadm_id'] = patient_stay_id

        # training_data_rows.append(new_row)

    # this is some weird goruping stuff chatGPT gave me, try it out later
    # if not patient_chart_events.empty:
    #     aggregated_features = patient_chart_events.groupby('itemid')['valuenum'].agg(['mean', 'std', 'min', 'max']).unstack()
    # else:
    #     aggregated_features = pd.Series(dtype="float64")

    # Convert to dictionary for easier merging
    # patient_record = {"hadm_id": patient_stay_id, "label": "low"}
    # patient_record.update(aggregated_features.to_dict())


training_data_set = pd.DataFrame(training_data_rows)
training_data_set.to_csv("~/Fluid-Solutions-ML/data/processed/training_data_fluid_in.csv")

# patient_chart_events
# aggregated_features
training_data_set

Unnamed: 0,diastolic_arterial_pressure,cvp,systolic_arterial_pressure,respiratory_rate,mean_arterial_pressure,heart_rate,spo2,timestamp,hadm_id,label,non_invasive_mean_bp,non_invasive_systolic_bp,non_invasive_diastolic_bp
0,42.0,10.0,130.0,21.0,68.0,88.0,94.0,2132-12-16 06:00:00,20626031,low,,,
1,42.0,8.0,116.0,16.0,62.0,63.0,100.0,2132-12-15 18:00:00,20626031,low,,,
2,57.0,16.0,133.0,9.0,80.0,74.0,100.0,2132-12-15 18:15:00,20626031,low,,,
3,67.0,13.0,172.0,15.0,96.0,84.0,100.0,2132-12-15 18:20:00,20626031,low,,,
4,59.0,13.0,143.0,17.0,80.0,90.0,98.0,2132-12-15 18:25:00,20626031,low,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2306,40.0,3.0,83.0,20.0,55.0,89.0,100.0,2134-08-05 22:00:00,28861356,low,55.0,95.0,44.0
2307,40.0,4.0,89.0,24.0,56.0,86.0,100.0,2134-08-05 21:45:00,28861356,low,,,
2308,,9.0,,24.0,,96.0,95.0,2153-03-28 04:00:00,29279905,low,65.0,93.0,59.0
2309,,9.0,,24.0,,96.0,95.0,2153-03-28 04:00:00,29279905,low,65.0,93.0,59.0


In [12]:
training_data = pd.read_csv("~/Fluid-Solutions-ML/data/processed/training_data_fluid_in.csv")

drop_rows = ['diastolic_arterial_pressure', 'cvp', 'systolic_arterial_pressure', 'respiratory_rate',
             'mean_arterial_pressure', 'heart_rate', 'spo2']

drop_columns = ['non_invasive_systolic_bp', 'non_invasive_mean_bp','non_invasive_diastolic_bp']

# NOTE: here I am dropping the noninvasive bp and other columns, however, if we desperately need
# more data then we can use these columns to repalce the NAN values in the primary columns.
training_data = training_data.dropna(subset=drop_rows).drop(columns=drop_columns)

training_data

Unnamed: 0.1,Unnamed: 0,diastolic_arterial_pressure,cvp,systolic_arterial_pressure,respiratory_rate,mean_arterial_pressure,heart_rate,spo2,timestamp,hadm_id,label
0,0,42.0,10.0,130.0,21.0,68.0,88.0,94.0,2132-12-16 06:00:00,20626031,low
1,1,42.0,8.0,116.0,16.0,62.0,63.0,100.0,2132-12-15 18:00:00,20626031,low
2,2,57.0,16.0,133.0,9.0,80.0,74.0,100.0,2132-12-15 18:15:00,20626031,low
3,3,67.0,13.0,172.0,15.0,96.0,84.0,100.0,2132-12-15 18:20:00,20626031,low
4,4,59.0,13.0,143.0,17.0,80.0,90.0,98.0,2132-12-15 18:25:00,20626031,low
...,...,...,...,...,...,...,...,...,...,...,...
2301,2301,61.0,8.0,146.0,18.0,89.0,88.0,100.0,2134-08-05 23:00:00,28861356,low
2302,2302,49.0,1.0,131.0,30.0,72.0,83.0,100.0,2134-08-06 10:00:00,28861356,low
2303,2303,49.0,1.0,131.0,30.0,72.0,83.0,100.0,2134-08-06 10:00:00,28861356,low
2306,2306,40.0,3.0,83.0,20.0,55.0,89.0,100.0,2134-08-05 22:00:00,28861356,low


In [13]:
len(training_data['timestamp'].unique())

700

this is random stuff because the data I have makes absolutely no sense

In [6]:
# Now lets do the same with the output events to create the other training data set
training_data_fluid_out_rows = []

for row in fluid_output_df.itertuples():
    new_row = {}
    hadm_id = str(row.hadm_id)
    timestamp = row.charttime

    patient_chart_events = chart_df[
        (chart_df['hadm_id'].astype(str) == hadm_id) &
        (chart_df['charttime'] <= timestamp) &
        (chart_df['charttime'] > timestamp-timedelta(hours=1)) &
        (chart_df['itemid'].astype(int).isin(vitals_item_ids.keys()))
    ]

    if not patient_chart_events.empty:
        for time in patient_chart_events['charttime'].unique():
            datapoint = {}
            rows_assocaited_with_timestamp = patient_chart_events[patient_chart_events['charttime'] == time]
            if len(rows_assocaited_with_timestamp) >= 7:
                for row in rows_assocaited_with_timestamp.itertuples():
                    datapoint[vitals_item_ids[row.itemid]] = row.valuenum
                datapoint['timestamp'] = time
                datapoint['hadm_id'] = hadm_id
                datapoint['label'] = 'high'
                training_data_fluid_out_rows.append(datapoint)

fluid_output_training_df = pd.DataFrame(training_data_fluid_out_rows)
fluid_output_training_df

Unnamed: 0,mean_arterial_pressure,spo2,heart_rate,cvp,respiratory_rate,diastolic_arterial_pressure,systolic_arterial_pressure,timestamp,hadm_id,label,non_invasive_systolic_bp,non_invasive_diastolic_bp,non_invasive_mean_bp
0,60.0,96.0,55.0,11.0,14.0,44.0,92.0,2123-02-23 19:00:00,22205327,high,,,
1,75.0,97.0,75.0,12.0,14.0,53.0,119.0,2123-02-24 06:00:00,22205327,high,,,
2,68.0,96.0,59.0,11.0,13.0,50.0,101.0,2123-02-23 20:00:00,22205327,high,,,
3,103.0,95.0,116.0,16.0,13.0,88.0,138.0,2196-02-25 22:00:00,24181354,high,,,
4,104.0,96.0,110.0,17.0,14.0,83.0,140.0,2196-02-25 21:00:00,24181354,high,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2083,72.0,97.0,108.0,13.0,22.0,59.0,98.0,2116-12-05 01:00:00,28998349,high,,,
2084,77.0,95.0,106.0,12.0,22.0,61.0,107.0,2116-12-04 23:00:00,28998349,high,,,
2085,70.0,95.0,105.0,11.0,22.0,55.0,98.0,2116-12-04 22:30:00,28998349,high,,,
2086,77.0,95.0,106.0,12.0,22.0,61.0,107.0,2116-12-04 23:00:00,28998349,high,,,


In [7]:
drop_rows = ['diastolic_arterial_pressure', 'cvp', 'systolic_arterial_pressure', 'respiratory_rate',
             'mean_arterial_pressure', 'heart_rate', 'spo2']

drop_columns = ['non_invasive_systolic_bp', 'non_invasive_mean_bp','non_invasive_diastolic_bp']

# NOTE: here I am dropping the noninvasive bp and other columns, however, if we desperately need
# more data then we can use these columns to repalce the NAN values in the primary columns.
fluid_output_training_df = fluid_output_training_df.dropna(subset=drop_rows).drop(columns=drop_columns)

fluid_output_training_df

Unnamed: 0,mean_arterial_pressure,spo2,heart_rate,cvp,respiratory_rate,diastolic_arterial_pressure,systolic_arterial_pressure,timestamp,hadm_id,label
0,60.0,96.0,55.0,11.0,14.0,44.0,92.0,2123-02-23 19:00:00,22205327,high
1,75.0,97.0,75.0,12.0,14.0,53.0,119.0,2123-02-24 06:00:00,22205327,high
2,68.0,96.0,59.0,11.0,13.0,50.0,101.0,2123-02-23 20:00:00,22205327,high
3,103.0,95.0,116.0,16.0,13.0,88.0,138.0,2196-02-25 22:00:00,24181354,high
4,104.0,96.0,110.0,17.0,14.0,83.0,140.0,2196-02-25 21:00:00,24181354,high
...,...,...,...,...,...,...,...,...,...,...
2083,72.0,97.0,108.0,13.0,22.0,59.0,98.0,2116-12-05 01:00:00,28998349,high
2084,77.0,95.0,106.0,12.0,22.0,61.0,107.0,2116-12-04 23:00:00,28998349,high
2085,70.0,95.0,105.0,11.0,22.0,55.0,98.0,2116-12-04 22:30:00,28998349,high
2086,77.0,95.0,106.0,12.0,22.0,61.0,107.0,2116-12-04 23:00:00,28998349,high


In [8]:
len(fluid_output_training_df['timestamp'].unique())

936

In [None]:
training_data = pd.read_csv("~/Fluid-Solutions-ML/data/processed/training_data_fluid_in.csv")

drop_rows = ['diastolic_arterial_pressure', 'cvp', 'systolic_arterial_pressure', 'respiratory_rate',
             'mean_arterial_pressure', 'heart_rate', 'spo2']

drop_columns = ['non_invasive_systolic_bp', 'non_invasive_mean_bp', 'diastolic_bp',
                'systolic_bp', 'manual_blood_pressure_diastolic_left',
                'manual_blood_pressure_systolic_left', 'non_invasive_diastolic_bp']

# NOTE: here I am dropping the noninvasive bp and other columns, however, if we desperately need
# more data then we can use these columns to repalce the NAN values in the primary columns.
training_data = training_data.dropna(subset=drop_rows).drop(columns=drop_columns)
training_data.to_csv("~/Fluid-Solutions-ML/data/processed/MIMIC_IV_fluid_in_training_data.csv")

training_data

Unnamed: 0.1,Unnamed: 0,diastolic_arterial_pressure,cvp,systolic_arterial_pressure,respiratory_rate,mean_arterial_pressure,heart_rate,spo2,label,timestamp,hadm_id,non_invasive_diastolic_bp
0,0,42.0,10.0,130.0,21.0,68.0,88.0,94.0,low,2132-12-16 06:23:00,20626031,
1,1,40.0,11.0,111.0,24.0,57.0,79.0,95.0,low,2132-12-15 18:58:00,20626031,
3,3,48.0,10.0,127.0,16.0,71.0,68.0,100.0,low,2132-12-15 16:07:00,20626031,
4,4,52.0,2.0,141.0,15.0,81.0,79.0,100.0,low,2132-12-15 14:34:00,20626031,
5,5,40.0,11.0,111.0,24.0,57.0,79.0,95.0,low,2132-12-15 19:12:00,20626031,
...,...,...,...,...,...,...,...,...,...,...,...,...
10313,10313,57.0,8.0,132.0,20.0,83.0,89.0,100.0,low,2134-08-05 23:00:00,28861356,60.0
10318,10318,49.0,1.0,131.0,30.0,72.0,83.0,100.0,low,2134-08-06 10:42:00,28861356,
10321,10321,49.0,1.0,131.0,30.0,72.0,83.0,100.0,low,2134-08-06 10:44:00,28861356,
10328,10328,40.0,5.0,96.0,37.0,57.0,88.0,100.0,low,2134-08-05 21:31:00,28861356,51.0


In [47]:
# Attempt to do the output stuff here, however, we have to be careful that we are not assuming
# the chartevents (from above) are more closely associated with an output event rather than
# an input event. 

singular_patient_output = fluid_output_df[
    fluid_output_df['hadm_id'] == 20626031
]

singular_patient_output

singualr_patient_output_vitals = chart_df[
    (chart_df['hadm_id'] == 20626031) &
    (chart_df['charttime'] <= "2132-12-16 18:00:00") &
    (chart_df['charttime'] > "2132-12-16 17:00:00") &
    (chart_df['itemid'].astype(int).isin(vitals_item_ids.keys()))
]

singualr_patient_output_vitals

Unnamed: 0,subject_id,hadm_id,stay_id,caregiver_id,charttime,storetime,itemid,value,valuenum,valueuom,warning
671,10005817,20626031,32604416,79166.0,2132-12-16 18:00:00,2132-12-16 18:10:00,220277,97,97.0,%,0.0
674,10005817,20626031,32604416,79166.0,2132-12-16 18:00:00,2132-12-16 18:10:00,220045,63,63.0,bpm,0.0
677,10005817,20626031,32604416,79166.0,2132-12-16 18:00:00,2132-12-16 18:10:00,220180,43,43.0,mmHg,0.0
682,10005817,20626031,32604416,79166.0,2132-12-16 18:00:00,2132-12-16 18:10:00,220181,56,56.0,mmHg,0.0
683,10005817,20626031,32604416,79166.0,2132-12-16 18:00:00,2132-12-16 18:10:00,220210,14,14.0,insp/min,0.0
684,10005817,20626031,32604416,79166.0,2132-12-16 18:00:00,2132-12-16 18:10:00,220179,107,107.0,mmHg,0.0


### TODO for the Fluid output table:
1. Go through the itemids in the output table and figure out which ones we want to use as acceptable fluid output events. 

In [None]:
# Now lets do the same with the output events to create the other training data set
training_data_fluid_out_rows = []

for row in fluid_output_df.itertuples():
    new_row = {}
    hadm_id = str(row.hadm_id)
    timestamp = row.charttime

    patient_chart_events = chart_df[
        (chart_df['hadm_id'].astype(str) == hadm_id) &
        (chart_df['charttime'] <= timestamp) &
        (chart_df['charttime'] > timestamp-timedelta(hours=1)) &
        (chart_df['itemid'].astype(int).isin(vitals_item_ids.keys()))
    ]

    if not patient_chart_events.empty:
        for datapoint in patient_chart_events.itertuples():
            new_row[vitals_item_ids[datapoint.itemid]] = datapoint.value

        new_row['label'] = "high"
        new_row['timestamp'] = timestamp.to_datetime64()
        new_row['hadm_id'] = hadm_id

        training_data_fluid_out_rows.append(new_row)

fluid_output_training_df = pd.DataFrame(training_data_fluid_out_rows)
fluid_output_training_df


Unnamed: 0,mean_arterial_pressure,systolic_arterial_pressure,respiratory_rate,heart_rate,diastolic_arterial_pressure,spo2,label,timestamp,hadm_id,non_invasive_mean_bp,non_invasive_systolic_bp,non_invasive_diastolic_bp,cvp,manual_blood_pressure_diastolic_left,manual_blood_pressure_systolic_left,diastolic_bp,systolic_bp
0,73,106,18,93,51,99,low,2156-05-15 18:00:00,23473524,,,,,,,,
1,65,96,14,92,45,98,low,2156-05-15 12:00:00,23473524,61,91,53,,,,,
2,74,108,16,90,50,100,low,2156-05-15 13:00:00,23473524,,,,,,,,
3,113,140,17,90,100,99,low,2156-05-15 08:00:00,23473524,,,,,,,,
4,92,128,14,94,71,99,low,2156-05-15 14:00:00,23473524,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8916,,,22,81,,98,low,2178-07-07 11:00:00,29281842,93,134,81,,,,,
8917,,,22,79,,99,low,2178-07-07 15:00:00,29281842,,,,,,,,
8918,,,23,87,,98,low,2178-07-07 09:00:00,29281842,83,127,71,,,,,
8919,,,16,70,,99,low,2178-07-07 18:00:00,29281842,87,132,75,,,,,


In [15]:
# post process the dataframe here
drop_rows = ['diastolic_arterial_pressure', 'cvp', 'systolic_arterial_pressure', 'respiratory_rate',
             'mean_arterial_pressure', 'heart_rate', 'spo2']

drop_columns = ['non_invasive_systolic_bp', 'non_invasive_mean_bp', 'diastolic_bp',
                'systolic_bp', 'manual_blood_pressure_diastolic_left',
                'manual_blood_pressure_systolic_left', 'non_invasive_diastolic_bp']

# NOTE: here I am dropping the noninvasive bp and other columns, however, if we desperately need
# more data then we can use these columns to repalce the NAN values in the primary columns.
fluid_output_training_df = fluid_output_training_df.dropna(subset=drop_rows).drop(columns=drop_columns)
fluid_output_training_df

Unnamed: 0,mean_arterial_pressure,systolic_arterial_pressure,respiratory_rate,heart_rate,diastolic_arterial_pressure,spo2,label,timestamp,hadm_id,cvp
15,60,92,14,55,44,96,low,2123-02-23 19:00:00,22205327,11
17,75,119,14,75,53,97,low,2123-02-24 06:00:00,22205327,12
19,68,101,13,59,50,96,low,2123-02-23 20:00:00,22205327,11
55,103,138,13,116,88,95,low,2196-02-25 22:00:00,24181354,16
61,105,133,14,110,92,98,low,2196-02-25 21:00:00,24181354,18
...,...,...,...,...,...,...,...,...,...,...
8894,76,105,22,104,61,96,low,2116-12-05 02:00:00,28998349,13
8896,69,94,22,106,56,96,low,2116-12-04 22:00:00,28998349,13
8897,72,98,22,108,59,97,low,2116-12-05 01:00:00,28998349,13
8900,70,98,22,105,55,95,low,2116-12-04 23:00:00,28998349,11


In [16]:
# convert the fluid output dataframe into a csv file
fluid_output_training_df.to_csv("~/Fluid-Solutions-ML/data/processed/fluid_output_training_data.csv")

In [7]:
# Load the datasets
unique_output = pd.read_csv("~/Fluid-Solutions-ML/data/processed/fluid_output_training_data.csv")
unique_input = pd.read_csv("~/Fluid-Solutions-ML/data/processed/MIMIC_IV_fluid_in_training_data.csv")

# Convert timestamps to sets of strings (sets are much faster for lookups)
output_timestamps = set(map(str, unique_output['timestamp'].unique()))
input_timestamps = set(map(str, unique_input['timestamp'].unique()))

# Find timestamps unique to each dataset
timestamps_only_in_output = output_timestamps - input_timestamps
timestamps_only_in_input = input_timestamps - output_timestamps

# Count total unique datapoints
total_unique_datapoints = len(timestamps_only_in_output) + len(timestamps_only_in_input)

print(f"Timestamps only in output: {len(timestamps_only_in_output)}")
print(f"Timestamps only in input: {len(timestamps_only_in_input)}")
print(f"Total unique timestamps: {total_unique_datapoints}")

Timestamps only in output: 804
Timestamps only in input: 1195
Total unique timestamps: 1999


### Normal BVS Grouping
1. Get a list of all patient stay events
2. Create a list of input and output event timestamps
3. Loop through patient stay events.
    - For each patient, get a list of chart events and fluid input/output
    - Loop through the chart events associated with the current patient
        - if the charttime is too close to the fluid event, then skip, else, create a dictionary for the values and append it to a list
4. Create a dataframe out of the list of dictionarys created above
5. post-process the dataframe (check to make sure none of the datapoints overlap)

In [33]:
def is_too_close(curr_time, table, table_name):
    if table.empty:
        return False 
    
    if table_name == "fluid_in":
        for row in table.itertuples():
            if abs(row.starttime - curr_time) <= timedelta(hours=2):
                return True
                
    if table_name == "fluid_out":
        for row in table.itertuples():
            if abs(row.charttime - curr_time) <= timedelta(hours=2):
                return True

    return False

# Lets try to do some stuff for the normal blood volume status
patient_stays = pd.read_csv("~/Fluid-Solutions-ML/data/raw/icustays.csv")

patient_stays.dropna(inplace=True)

normal_data_rows = []

for patient in patient_stays.itertuples():
    curr_hadmi_id = str(patient.hadm_id)

    if pd.isna(curr_hadmi_id):
        continue

    # get the vitals associated with the patiet
    current_patient_chart_events = chart_df[
        (chart_df['hadm_id'].astype(str) == curr_hadmi_id) & 
        (chart_df['itemid'].astype(int).isin(list(vitals_item_ids.keys())))
    ]

    if current_patient_chart_events.empty:
        continue

    # get the fluid administration values for the patient
    fluid_in_events = fluid_input_df[
        (fluid_input_df['hadm_id'].astype(str) == curr_hadmi_id)
    ]
    
    # get the dialysis values for the patient
    fluid_out_events = fluid_output_df[
        (fluid_output_df['hadm_id'].astype(str) == curr_hadmi_id)
    ]

    # Get unique charttimes to evaluate
    unique_charttimes = current_patient_chart_events['charttime'].unique()

    for chart_time in unique_charttimes:
        
        if is_too_close(chart_time, fluid_in_events, 'fluid_in') or is_too_close(chart_time, fluid_out_events, 'fluid_out'):
            continue

        # Get all vitals within a 2-hour window of this time
        time_window_events = current_patient_chart_events[
            (current_patient_chart_events['charttime'] <= chart_time) &
            (current_patient_chart_events['charttime'] > chart_time - timedelta(hours=2))
        ]

        for time in time_window_events['charttime'].unique():
            datapoint = {}
            rows_assocaited_with_timestamp = time_window_events[time_window_events['charttime'] == time]
            if len(rows_assocaited_with_timestamp) >= 7:
                for row in rows_assocaited_with_timestamp.itertuples():
                    datapoint[vitals_item_ids[row.itemid]] = row.valuenum

                normal_data_rows.append(datapoint)
    break

# normal_data_rows
time_window_events




Unnamed: 0,subject_id,hadm_id,stay_id,caregiver_id,charttime,storetime,itemid,value,valuenum,valueuom,warning
599601,10018328,23786647,31269608,75345.0,2154-04-30 17:00:00,2154-04-30 18:24:00,220045,62,62.0,bpm,0.0
599602,10018328,23786647,31269608,75345.0,2154-04-30 17:00:00,2154-04-30 18:24:00,220210,22,22.0,insp/min,0.0
599603,10018328,23786647,31269608,75345.0,2154-04-30 17:00:00,2154-04-30 18:24:00,220277,98,98.0,%,0.0
599604,10018328,23786647,31269608,75345.0,2154-04-30 18:00:00,2154-04-30 18:24:00,220210,15,15.0,insp/min,0.0
599605,10018328,23786647,31269608,75345.0,2154-04-30 18:00:00,2154-04-30 18:24:00,220277,96,96.0,%,0.0
599606,10018328,23786647,31269608,75345.0,2154-04-30 18:00:00,2154-04-30 18:24:00,220045,75,75.0,bpm,0.0
600268,10018328,23786647,31269608,75345.0,2154-04-30 17:02:00,2154-04-30 18:26:00,220179,146,146.0,mmHg,0.0
600269,10018328,23786647,31269608,75345.0,2154-04-30 17:02:00,2154-04-30 18:26:00,220180,72,72.0,mmHg,0.0
600270,10018328,23786647,31269608,75345.0,2154-04-30 17:02:00,2154-04-30 18:24:00,220181,93,93.0,mmHg,0.0
600271,10018328,23786647,31269608,75345.0,2154-04-30 18:02:00,2154-04-30 18:24:00,220181,90,90.0,mmHg,0.0


In [5]:
for id in vitals_item_ids.keys():
    num_rows = chart_df[
        chart_df['itemid'].astype(str) == str(id)
    ]

    print(f"{vitals_item_ids[id]}", len(num_rows))

cvp 1386
heart_rate 13913
spo2 0
spo2 13540
systolic_bp 5
manual_blood_pressure_systolic_left 3
manual_blood_pressure_diastolic_left 3
diastolic_bp 5
non_invasive_systolic_bp 8347
non_invasive_diastolic_bp 8349
non_invasive_mean_bp 8342
systolic_arterial_pressure 486
systolic_arterial_pressure 5525
diastolic_arterial_pressure 486
diastolic_arterial_pressure 5524
mean_arterial_pressure 5560
mean_arterial_pressure 488
respiratory_rate 13913
