In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta

# Read each raw CSV export used by this notebook into its own dataframe.
BASE_DIR = os.path.expanduser("~/Fluid-Solutions-ML/data/raw/")


def _read_raw_csv(file_name):
    """Load one CSV file located directly under BASE_DIR."""
    return pd.read_csv(os.path.join(BASE_DIR, file_name))


items_df = _read_raw_csv("d_items.csv")
chart_df = _read_raw_csv("chartevents.csv")
fluid_input_df = _read_raw_csv("inputevents.csv")
fluid_output_df = _read_raw_csv("outputevents.csv")
patient_stays = _read_raw_csv("icustays.csv")

In [3]:
# convert the time-related columns to datetime objects so I can apply some logic on them later
def convert_timestamps(dataframe, time_columns):
    """Parse the named columns of ``dataframe`` as datetimes, in place.

    Args:
        dataframe: Frame whose columns are overwritten with the parsed values.
        time_columns: Column names to parse; names that are not present in
            ``dataframe`` are silently ignored.

    Returns:
        The same ``dataframe`` object that was passed in.
    """
    present_columns = [name for name in time_columns if name in dataframe.columns]
    for name in present_columns:
        # errors='coerce' turns unparseable entries into NaT instead of raising
        dataframe[name] = pd.to_datetime(dataframe[name], errors='coerce')
    return dataframe

# Parse the timestamp columns of the three event tables. convert_timestamps
# skips any listed name that a table does not have, so extra names are harmless.
chart_df = convert_timestamps(chart_df, ['charttime', 'storetime'])
fluid_input_df = convert_timestamps(fluid_input_df, ['starttime', 'endtime', 'storetime'])
# 'charttime' was previously listed twice for the output events; the second
# entry is presumably meant to be 'storetime', mirroring the chart_df call.
fluid_output_df = convert_timestamps(fluid_output_df, ['charttime', 'storetime'])

In [4]:
# Maps each chart item id to the feature (vital) name it is stored under.
# Several item ids feed the same vital, so the table is written grouped by vital.
vitals_item_ids = {
    item_id: vital_name
    for vital_name, item_ids in (
        ("heart_rate", (220045,)),
        ("spo2", (220277,)),
        ("systolic_arterial_pressure", (225309, 220050)),
        ("diastolic_arterial_pressure", (225310, 220051)),
        ("mean_arterial_pressure", (220052, 225312)),
        ("respiratory_rate", (220210,)),
        ("temperature", (223762,)),
    )
    for item_id in item_ids
}

# vitals_item_ids.keys()

In [5]:
# For now the focus is on the fluid input events; the other tables get a lighter cleanup.

# Fluid inputs: require an item id, an amount and both times, then keep only
# rows with a positive amount recorded in ml.
fluid_input_df = fluid_input_df.dropna(subset=['itemid', 'amount', 'starttime', 'endtime'])
input_has_volume = fluid_input_df['amount'] > 0
input_is_ml = fluid_input_df['amountuom'].astype(str) == "ml"
fluid_input_df = fluid_input_df[input_has_volume & input_is_ml]

# Chart events: drop rows missing the item id, the chart time or the numeric value.
chart_df = chart_df.dropna(subset=['itemid', 'charttime', 'valuenum'])

# Fluid outputs: keep only rows with a positive value recorded in ml.
output_has_volume = fluid_output_df['value'] > 0
output_is_ml = fluid_output_df['valueuom'].astype(str) == "ml"
fluid_output_df = fluid_output_df[output_has_volume & output_is_ml]

In [None]:
# Build one feature row per (patient, chart time) and label it by the fluid
# event it precedes: "low" before a fluid input event, "high" before a fluid
# output event, and "normal" for chart times away from every labeled row.

# lists of row dictionaries that are turned into dataframes at the end
fluid_input_rows = []
fluid_output_rows = []
normal_rows = []

# every hadm_id found in the ICU stays table; each one is processed separately
patient_ids = patient_stays["hadm_id"].unique()

# minimum number of vitals chart rows that must share a chart time to form a row
required_vitals_num = 7

# vitals charted at most this long before a fluid event are attributed to it
LOOKBACK_WINDOW = timedelta(minutes=30)

# chart times strictly closer than this to a labeled row are excluded from "normal"
NORMAL_EXCLUSION_MARGIN = timedelta(minutes=2)

total_time = []


def _rows_per_charttime(chart_events, patient_id, label):
    """Turn vitals chart events into one feature dictionary per chart time.

    Args:
        chart_events: Vitals chart events of a single patient.
        patient_id: Identifier stored under 'hadm_id' in every produced row.
        label: Value stored under 'label' in every produced row.

    Returns:
        A list of dictionaries keyed by vital name plus 'timestamp', 'hadm_id'
        and 'label'. Chart times with fewer than ``required_vitals_num`` event
        rows are skipped.

    NOTE(review): the threshold counts event rows, not distinct vitals. Several
    item ids in ``vitals_item_ids`` share one vital name, so a kept row can
    still lack a vital (it becomes NaN in the dataframe).
    """
    rows = []
    for chart_time in chart_events['charttime'].unique():
        events_at_time = chart_events[chart_events['charttime'] == chart_time]

        # not enough vitals recorded at this chart time
        if len(events_at_time) < required_vitals_num:
            continue

        # when two events map to the same vital name, the later one wins
        new_row = {
            vitals_item_ids[event.itemid]: event.valuenum
            for event in events_at_time.itertuples()
        }
        new_row['timestamp'] = chart_time
        new_row['hadm_id'] = patient_id
        new_row['label'] = label
        rows.append(new_row)
    return rows


def _rows_before_events(patient_chart_events, event_times, patient_id, label):
    """Collect labeled rows from the vitals charted within LOOKBACK_WINDOW before each event.

    NOTE(review): every event is handled independently, so a chart time that
    falls inside the window of several events is emitted once per event.
    """
    rows = []
    charttimes = patient_chart_events['charttime']
    for event_time in event_times:
        in_window = patient_chart_events[
            (charttimes <= event_time) &
            (charttimes > event_time - LOOKBACK_WINDOW)
        ]
        # an empty window simply contributes no rows
        rows.extend(_rows_per_charttime(in_window, patient_id, label))
    return rows


# The string casts and the vitals filter do not depend on the patient, so they
# are computed once here instead of once per patient inside the loop.
input_hadm_ids = fluid_input_df['hadm_id'].astype(str)
output_hadm_ids = fluid_output_df['hadm_id'].astype(str)
vitals_chart_df = chart_df[chart_df['itemid'].astype(int).isin(vitals_item_ids.keys())]
vitals_hadm_ids = vitals_chart_df['hadm_id'].astype(str)

# process each patient iteratively
for patient_id in patient_ids:
    patient_key = str(patient_id)

    # fluid input events, fluid output events and vitals chart events of this patient
    patient_input_events = fluid_input_df[input_hadm_ids == patient_key]
    patient_output_events = fluid_output_df[output_hadm_ids == patient_key]
    patient_chart_events = vitals_chart_df[vitals_hadm_ids == patient_key]

    # === Process Fluid Input Events (labeled low) ===
    low_rows = _rows_before_events(
        patient_chart_events, patient_input_events['starttime'], patient_id, "low"
    )
    fluid_input_rows.extend(low_rows)

    # === Process Fluid Output Events (labeled high) ===
    high_rows = _rows_before_events(
        patient_chart_events, patient_output_events['charttime'], patient_id, "high"
    )
    fluid_output_rows.extend(high_rows)

    # chart times already used by a labeled row, in the order the rows were produced
    timestamps = [labeled_row['timestamp'] for labeled_row in low_rows + high_rows]

    # === Process Normal Fluid (labeled normal) ===
    if timestamps:
        # instead of matching the timestamps exactly, drop every chart event that
        # lies strictly within +/- NORMAL_EXCLUSION_MARGIN of a labeled chart time
        mask = pd.Series(True, index=patient_chart_events.index)
        for labeled_time in timestamps:
            mask &= ~(
                (patient_chart_events['charttime'] > labeled_time - NORMAL_EXCLUSION_MARGIN) &
                (patient_chart_events['charttime'] < labeled_time + NORMAL_EXCLUSION_MARGIN)
            )
        normal_fluid_status_chart_events = patient_chart_events[mask]
    else:
        normal_fluid_status_chart_events = patient_chart_events

    # if none are found, skip (total_time is not extended for this patient)
    if normal_fluid_status_chart_events.empty:
        continue

    normal_rows.extend(
        _rows_per_charttime(normal_fluid_status_chart_events, patient_id, "normal")
    )

    total_time.append(timestamps)

# The variable names keep their original spelling because later cells refer to them.
fluid_input_trianing_data = pd.DataFrame(fluid_input_rows)
fluid_output_trianing_data = pd.DataFrame(fluid_output_rows)
normal_fluid_trianing_data = pd.DataFrame(normal_rows)

fluid_input_trianing_data.to_csv("~/Fluid-Solutions-ML/data/processed/input_testing.csv")
fluid_output_trianing_data.to_csv("~/Fluid-Solutions-ML/data/processed/output_testing.csv")
normal_fluid_trianing_data.to_csv("~/Fluid-Solutions-ML/data/processed/normal_testing.csv")

In [7]:
# keep only the fluid input rows that have a value in every column
test_input = fluid_input_trianing_data.dropna(axis=0, how='any')
print(test_input.shape[0])
test_input

416


Unnamed: 0,diastolic_arterial_pressure,systolic_arterial_pressure,mean_arterial_pressure,respiratory_rate,heart_rate,spo2,timestamp,hadm_id,label,temperature
8,70.0,111.0,85.0,18.0,101.0,100.0,2116-12-08 02:00:00,28998349,low,37.0
9,75.0,117.0,90.0,13.0,110.0,97.0,2116-12-08 08:00:00,28998349,low,36.2
10,64.0,96.0,73.0,19.0,123.0,98.0,2116-12-07 22:00:00,28998349,low,38.7
11,64.0,106.0,78.0,13.0,117.0,97.0,2116-12-08 11:37:00,28998349,low,36.9
12,60.0,91.0,69.0,18.0,117.0,98.0,2116-12-07 23:00:00,28998349,low,38.2
...,...,...,...,...,...,...,...,...,...,...
462,48.0,100.0,63.0,24.0,108.0,98.0,2117-03-24 06:00:00,24540843,low,38.7
463,59.0,103.0,73.0,26.0,126.0,97.0,2117-03-24 00:00:00,24540843,low,38.4
464,57.0,112.0,73.0,29.0,126.0,97.0,2117-03-22 01:00:00,24540843,low,37.9
465,58.0,117.0,76.0,24.0,97.0,99.0,2117-03-24 02:00:00,24540843,low,38.5


In [8]:
# keep only the fluid output rows that have a value in every column
test_output = fluid_output_trianing_data.dropna(axis=0, how='any')
print(test_output.shape[0])
test_output

387


Unnamed: 0,diastolic_arterial_pressure,mean_arterial_pressure,respiratory_rate,systolic_arterial_pressure,heart_rate,spo2,timestamp,hadm_id,label,temperature
25,72.0,87.0,15.0,116.0,112.0,99.0,2116-12-08 09:00:00,28998349,high,36.4
26,75.0,90.0,13.0,117.0,110.0,97.0,2116-12-08 08:00:00,28998349,high,36.2
27,64.0,78.0,13.0,106.0,117.0,97.0,2116-12-08 11:37:00,28998349,high,36.9
28,64.0,78.0,13.0,106.0,117.0,97.0,2116-12-08 11:37:00,28998349,high,36.9
29,64.0,73.0,19.0,96.0,123.0,98.0,2116-12-07 22:00:00,28998349,high,38.7
...,...,...,...,...,...,...,...,...,...,...
463,55.0,71.0,26.0,112.0,109.0,97.0,2117-03-23 06:00:00,24540843,high,38.2
464,50.0,65.0,26.0,103.0,89.0,97.0,2117-03-23 02:00:00,24540843,high,38.0
465,55.0,69.0,30.0,98.0,117.0,97.0,2117-03-21 17:00:00,24540843,high,37.8
466,51.0,64.0,30.0,99.0,127.0,97.0,2117-03-21 15:00:00,24540843,high,38.1


In [9]:
# keep only the normal rows that have a value in every column
test_normal = normal_fluid_trianing_data.dropna(axis=0, how='any')
print(test_normal.shape[0])
test_normal

27


Unnamed: 0,respiratory_rate,mean_arterial_pressure,heart_rate,spo2,diastolic_arterial_pressure,systolic_arterial_pressure,timestamp,hadm_id,label,temperature
3,24.0,69.0,82.0,100.0,49.0,111.0,2146-06-23 01:00:00,22987108,normal,32.6
4,24.0,75.0,92.0,98.0,54.0,121.0,2146-06-23 09:00:00,22987108,normal,35.8
5,24.0,75.0,91.0,98.0,56.0,120.0,2146-06-23 10:00:00,22987108,normal,35.8
6,24.0,65.0,81.0,96.0,50.0,102.0,2146-06-23 16:00:00,22987108,normal,35.6
7,24.0,78.0,73.0,98.0,58.0,122.0,2146-06-23 18:00:00,22987108,normal,35.3
8,24.0,74.0,77.0,98.0,54.0,120.0,2146-06-23 21:00:00,22987108,normal,37.2
9,35.0,70.0,76.0,97.0,50.0,120.0,2185-01-19 07:00:00,27617929,normal,37.3
10,0.0,68.0,82.0,93.0,50.0,116.0,2185-01-19 10:00:00,27617929,normal,37.7
11,35.0,66.0,82.0,94.0,48.0,111.0,2185-01-19 12:00:00,27617929,normal,38.0
12,29.0,75.0,74.0,96.0,55.0,127.0,2185-01-20 16:00:00,27617929,normal,37.3


In [178]:
# copies of the three NaN-free data sets without their label column
removed_columns_input = test_input.drop(columns=['label'])
removed_columns_output = test_output.drop(columns=['label'])
removed_columns_normal = test_normal.drop(columns=['label'])

# every (hadm_id, timestamp) pair that occurs among the fluid input rows
set_of_id_time_keys_fluid_input = {
    (input_row.hadm_id, input_row.timestamp)
    for input_row in removed_columns_input.itertuples()
}

# number of fluid output rows whose (hadm_id, timestamp) pair also occurs among the input rows
count = sum(
    (output_row.hadm_id, output_row.timestamp) in set_of_id_time_keys_fluid_input
    for output_row in removed_columns_output.itertuples()
)

print("Number of fluid input event rows that are the same as fluid output even rows: ", count)

Number of fluid input event rows that are the same as fluid output even rows:  256


In [179]:
# combined number of rows across the three NaN-free data sets
print("Total number of datapoints: ", sum(len(frame) for frame in (test_input, test_output, test_normal)))

Total number of datapoints:  1101


In [None]:
# number of distinct chart times in the cleaned chart events table
unique_timestamps = chart_df['charttime'].unique()
print(unique_timestamps.shape[0])

# column dtypes, non-null counts and memory usage of the chart events table
chart_df.info()

24081
<class 'pandas.core.frame.DataFrame'>
Index: 162571 entries, 1 to 668745
Data columns (total 11 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   subject_id    162571 non-null  int64         
 1   hadm_id       162571 non-null  int64         
 2   stay_id       162571 non-null  int64         
 3   caregiver_id  139018 non-null  float64       
 4   charttime     162571 non-null  datetime64[ns]
 5   storetime     162571 non-null  datetime64[ns]
 6   itemid        162571 non-null  int64         
 7   value         162571 non-null  object        
 8   valuenum      162571 non-null  float64       
 9   valueuom      162571 non-null  object        
dtypes: datetime64[ns](2), float64(3), int64(4), object(2)
memory usage: 14.9+ MB
935


In [None]:
# initalize lists that will be used to hold dictionarys in order to later create a dataframe
fluid_input_rows = []
fluid_output_rows = []
normal_rows = []

# get a list of all patient ids
patient_ids = patient_stays["hadm_id"].unique()

# set the required number of vitals needed for a row
required_vitals_num = 7

total_time = []

# process each patient iteratively
for patient_id in patient_ids:

    # initalize a new array each iteration to hold the vitals timestamps for processing the normal events
    timestamps = []

    # get the fluid input events assocaited with the patient
    patient_input_events = fluid_input_df[
        fluid_input_df['hadm_id'].astype(str) == str(patient_id)
    ]

    # get the fluid output events (dialysis) assocaited with the patient
    patient_output_events = fluid_output_df[
        fluid_output_df['hadm_id'].astype(str) == str(patient_id)
    ]

    # get all chart events (vitals) associated with the patient (only chart events we will use as features in the model)
    patient_chart_events = chart_df[
        (chart_df['hadm_id'].astype(str) == str(patient_id)) &
        (chart_df['itemid'].astype(int).isin(vitals_item_ids.keys()))
    ]

    # === Process Fluid Inpout Events (Labeled low) ===
    for input_event in patient_input_events.itertuples():
        curr_row_timestamp = input_event.starttime

        # look at the chart events 30 minutes prior to the fluid input event
        close_chart_events = patient_chart_events[
            (patient_chart_events['charttime'] <= curr_row_timestamp) &
            (patient_chart_events['charttime'] > curr_row_timestamp-timedelta(minutes=30))
        ]

        # skip if there are none
        if close_chart_events.empty:
            continue

        # go through each unique charttime found 
        for time in close_chart_events['charttime'].unique():
            new_row = {}

            # TODO: implment the window of 5 minutes here. Also probably want to introbute a hashtable
            # to keep track of which chart rows we have already used. 
            chart_event_rows_per_timestamp = close_chart_events[close_chart_events['charttime'] == time]
            
            # if there are not enough vitals found, skip 
            if len(chart_event_rows_per_timestamp) < required_vitals_num:
                continue

            for row in chart_event_rows_per_timestamp.itertuples():
                new_row[vitals_item_ids[row.itemid]] = row.valuenum

            # add the timestamp, patient identifier, and label to the row
            new_row['timestamp'] = time
            new_row['hadm_id'] = patient_id
            new_row['label'] = "low"
        
            # store the timestamp for later processing and the data associated with the row
            timestamps.append(time)
            fluid_input_rows.append(new_row)