In [225]:
import os
import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt
# import seaborn as sns
from datetime import datetime, timedelta

# load in the csv files as dataframes
BASE_DIR = os.path.expanduser("~/Fluid-Solutions-ML/data/raw/")
items_df = pd.read_csv(os.path.join(BASE_DIR, "d_items.csv"))
chart_df = pd.read_csv(os.path.join(BASE_DIR, "chartevents.csv"))
fluid_input_df = pd.read_csv(os.path.join(BASE_DIR, "inputevents.csv"))
fluid_output_df = pd.read_csv(os.path.join(BASE_DIR, "outputevents.csv"))
patient_stays = pd.read_csv(os.path.join(BASE_DIR, "icustays.csv"))

In [226]:
# convert the time-related columns to datetime objects so I can apply some logic on them later
def convert_timestamps(dataframe, time_columns):
    for col in time_columns:
        if col in dataframe.columns:
            # dataframe[col] = pd.to_datetime(dataframe[col], errors='coerce').dt.round("min")
            dataframe[col] = pd.to_datetime(dataframe[col], errors='coerce')
    return dataframe

chart_df = convert_timestamps(chart_df, ['charttime', 'storetime'])
fluid_input_df = convert_timestamps(fluid_input_df, ['starttime', 'endtime', 'storetime'])
fluid_output_df = convert_timestamps(fluid_output_df, ['charttime', 'charttime'])

In [227]:
vitals_item_ids = {
    220045: "heart_rate",
    220277: "spo2",
    225309: "systolic_arterial_pressure",
    220050: "systolic_arterial_pressure",
    225310: "diastolic_arterial_pressure",
    220051: "diastolic_arterial_pressure",
    220052: "mean_arterial_pressure",
    225312: "mean_arterial_pressure",
    220210: "respiratory_rate",
    # 223762: "temperature"
}

# vitals_item_ids.keys()

In [228]:
# for now, I'm just going to focus on the fluid input events:

# cleanup the data in the fluid input table
fluid_input_df = fluid_input_df.dropna(subset=['itemid', 'amount', 'starttime', 'endtime'])

# drop all rows when the administered volume is 0 and make sure we are only looking at volumes/fluids
fluid_input_df = fluid_input_df[
    (fluid_input_df['amount'] > 0) &
    (fluid_input_df['amountuom'].astype(str) == "ml")
]

# cleanup the data in the chart evens table 
chart_df = chart_df.dropna(subset=['itemid', 'charttime', 'valuenum'])

fluid_output_df = fluid_output_df[
    (fluid_output_df['value'] > 0) &
    (fluid_output_df['valueuom'].astype(str) == "ml")
]

In [229]:
# set the required number of vitals needed for a row
required_vitals_num = 6

def process_event(curr_row_timestamp, patient_chart_events, patient_id, label, patient_timestamps, in_out_rows, time_diff):
    close_chart_events = patient_chart_events[
        # (patient_chart_events['charttime'] <= curr_row_timestamp) & 
        # (patient_chart_events['charttime'] > curr_row_timestamp - timedelta(minutes=time_diff))
        (patient_chart_events['charttime'].between(curr_row_timestamp - timedelta(minutes=time_diff), curr_row_timestamp))
    ]

    if close_chart_events.empty:
        return

    for time in close_chart_events['charttime'].unique():
        if label == "high" and time in patient_timestamps:
            continue  # Skip if already processed for high-labeled events

        new_row = {}
        chart_event_rows_per_timestamp = close_chart_events[close_chart_events['charttime'] == time]

        if len(chart_event_rows_per_timestamp) < required_vitals_num:
            continue

        if time in patient_timestamps:
            continue

        for row in chart_event_rows_per_timestamp.itertuples():
            new_row[vitals_item_ids[row.itemid]] = row.valuenum

        new_row['timestamp'] = time
        new_row['hadm_id'] = patient_id
        new_row['label'] = label

        patient_timestamps.append(time)
        in_out_rows.append(new_row)

def not_within_time_prior(row_time, timestamps, time_diff=30):
    timestamps = pd.to_datetime(timestamps)
    # return all timestamps that are not 30 minutes prior to the fluid event time
    return not ((timestamps - pd.Timedelta(minutes=time_diff)) < row_time).any()
    # for ts in pd.to_datetime(timestamps):
    #     if ts - pd.Timedelta(minutes=time_diff) <= row_time <= ts:
    #         return False
    # return True

In [230]:
# initalize lists that will be used to hold dictionarys in order to later create a dataframe
in_out_rows = []
normal_rows = []

time_diff = 15

# get a list of all patient ids
patient_ids = patient_stays["hadm_id"].unique()


for patient_id in patient_ids:
    patient_timestamps = []

    # get the fluid input events assocaited with the patient
    patient_input_events = fluid_input_df[
        fluid_input_df['hadm_id'].astype(str) == str(patient_id)
    ]

    # get the fluid output events (dialysis) assocaited with the patient
    patient_output_events = fluid_output_df[
        fluid_output_df['hadm_id'].astype(str) == str(patient_id)
    ]

    # get all chart events (vitals) associated with the patient (only chart events we will use as features in the model)
    patient_chart_events = chart_df[
        (chart_df['hadm_id'].astype(str) == str(patient_id)) &
        (chart_df['itemid'].astype(int).isin(vitals_item_ids.keys()))
    ]

    # === Get all in AND out events and add to in_out_rows ===
    
    # === Process Fluid Input Events (labeled low) ===
    for input_event in patient_input_events.itertuples():
        curr_row_timestamp = input_event.starttime
        process_event(
            curr_row_timestamp,
            patient_chart_events,
            patient_id,
            "low",
            patient_timestamps,
            in_out_rows,
            time_diff=time_diff
        )

    # === Process Fluid Output Events (Labeled high) ===
    for output_event in patient_output_events.itertuples():
        curr_row_timestamp = output_event.charttime
        process_event(
            curr_row_timestamp,
            patient_chart_events,
            patient_id,
            "high",
            patient_timestamps,
            in_out_rows,
            time_diff=time_diff
        )

    # === Process Normal fluid events ===
    if not patient_timestamps:
        normal_fluid_events = patient_chart_events
    else:
        normal_fluid_events = patient_chart_events[
            patient_chart_events['charttime'].apply(lambda row_time: not_within_time_prior(row_time, patient_timestamps,time_diff))
        ]

    if normal_fluid_events.empty:
        continue

    for time in normal_fluid_events['charttime'].unique():
        new_row = {}
        normal_fluid_chart_events_at_time = normal_fluid_events[
            normal_fluid_events['charttime'] == time
        ]
        
        for chart_row in normal_fluid_chart_events_at_time.itertuples():
            new_row[vitals_item_ids[chart_row.itemid]] = chart_row.valuenum

        new_row['timestamp'] = time
        new_row['hadm_id'] = patient_id
        new_row['label'] = "normal"

        normal_rows.append(new_row)


fluid_in_out_data = pd.DataFrame(in_out_rows)
normal_fluid_data = pd.DataFrame(normal_rows)

In [231]:
fluid_in_out_data.dropna(inplace=True)
fluid_in_out_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3719 entries, 0 to 3723
Data columns (total 9 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   respiratory_rate             3719 non-null   float64       
 1   heart_rate                   3719 non-null   float64       
 2   mean_arterial_pressure       3719 non-null   float64       
 3   diastolic_arterial_pressure  3719 non-null   float64       
 4   systolic_arterial_pressure   3719 non-null   float64       
 5   spo2                         3719 non-null   float64       
 6   timestamp                    3719 non-null   datetime64[ns]
 7   hadm_id                      3719 non-null   int64         
 8   label                        3719 non-null   object        
dtypes: datetime64[ns](1), float64(6), int64(1), object(1)
memory usage: 290.5+ KB


In [232]:
normal_fluid_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5920 entries, 0 to 5919
Data columns (total 9 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   mean_arterial_pressure       134 non-null    float64       
 1   diastolic_arterial_pressure  111 non-null    float64       
 2   systolic_arterial_pressure   111 non-null    float64       
 3   heart_rate                   5356 non-null   float64       
 4   respiratory_rate             5371 non-null   float64       
 5   spo2                         5194 non-null   float64       
 6   timestamp                    5920 non-null   datetime64[ns]
 7   hadm_id                      5920 non-null   int64         
 8   label                        5920 non-null   object        
dtypes: datetime64[ns](1), float64(6), int64(1), object(1)
memory usage: 416.4+ KB


In [233]:
# TODO: separate fluid_in_out_data into fluid_input and fluid_output tables based on the time and how close it is to the input/output event
fluid_input = []
fluid_output = []

for row in fluid_in_out_data.itertuples(index=False):
    row_dict = row._asdict()

    patient_id = row_dict.get('hadm_id')
    row_timestamp = row_dict.get('timestamp')

    row_dict['pulse_pressure'] = row_dict['systolic_arterial_pressure'] - row_dict['diastolic_arterial_pressure']

    closest_input_events = fluid_input_df[
        (fluid_input_df['hadm_id'].astype(str) == str(patient_id)) &
        (fluid_input_df['starttime'].between(row_timestamp, row_timestamp + timedelta(minutes=time_diff)))
    ]

    closest_output_events = fluid_output_df[
        (fluid_output_df['hadm_id'].astype(str) == str(patient_id)) & 
        (fluid_output_df['charttime'].between(row_timestamp, row_timestamp + timedelta(minutes=time_diff)))
    ]

    if not closest_input_events.empty and not closest_output_events.empty:
        # TODO: Determine which time is closer
        min_input_diff = (closest_input_events['starttime'] - row_timestamp).abs().min()
        min_output_diff = (closest_output_events['charttime'] - row_timestamp).abs().min()

        # Determine which one is closer
        if min_input_diff < min_output_diff:
            # fluid input event is closer, therefore label low
            row_dict['label'] = 'low'
            fluid_input.append(row_dict)
        else:
            # fluid output event is closer, therefore, label high
            row_dict['label'] = 'high'
            fluid_output.append(row_dict)

    elif not closest_input_events.empty:
        # there are no input events
        row_dict['label'] = 'low'
        fluid_input.append(row_dict)

    elif not closest_output_events.empty:
        # add the row to the output events
        row_dict['label'] = 'high'
        fluid_output.append(row_dict)

final_fluid_input_df = pd.DataFrame(fluid_input)
final_fluid_output_df = pd.DataFrame(fluid_output)

if (len(final_fluid_input_df) + len(final_fluid_output_df)) != len(fluid_in_out_data):
    print("-- Parsing failed, dataframe sizes do not match --")
    raise Exception("failed to accuractely parse data")

In [234]:
final_fluid_input_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 714 entries, 0 to 713
Data columns (total 10 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   respiratory_rate             714 non-null    float64       
 1   heart_rate                   714 non-null    float64       
 2   mean_arterial_pressure       714 non-null    float64       
 3   diastolic_arterial_pressure  714 non-null    float64       
 4   systolic_arterial_pressure   714 non-null    float64       
 5   spo2                         714 non-null    float64       
 6   timestamp                    714 non-null    datetime64[ns]
 7   hadm_id                      714 non-null    int64         
 8   label                        714 non-null    object        
 9   pulse_pressure               714 non-null    float64       
dtypes: datetime64[ns](1), float64(7), int64(1), object(1)
memory usage: 55.9+ KB


In [235]:
final_fluid_output_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3005 entries, 0 to 3004
Data columns (total 10 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   respiratory_rate             3005 non-null   float64       
 1   heart_rate                   3005 non-null   float64       
 2   mean_arterial_pressure       3005 non-null   float64       
 3   diastolic_arterial_pressure  3005 non-null   float64       
 4   systolic_arterial_pressure   3005 non-null   float64       
 5   spo2                         3005 non-null   float64       
 6   timestamp                    3005 non-null   datetime64[ns]
 7   hadm_id                      3005 non-null   int64         
 8   label                        3005 non-null   object        
 9   pulse_pressure               3005 non-null   float64       
dtypes: datetime64[ns](1), float64(7), int64(1), object(1)
memory usage: 234.9+ KB


In [None]:
# TODO: pad the normal table NA values with randmon values based on the averages
import numpy as np # using np.random becuase it is more accurate and better

# add pulse pressure to the normal dataframe:
normal_fluid_data['pulse_pressure'] = normal_fluid_data['systolic_arterial_pressure'] - normal_fluid_data['diastolic_arterial_pressure']

# create normal distributions centered around the mean of the current column in the normal_fluid_dataframe
distributions = {
    'respiratory_rate' : lambda: np.random.normal(loc=normal_fluid_data['respiratory_rate'].mean()),
    'heart_rate' : lambda: np.random.normal(loc=normal_fluid_data['heart_rate'].mean()),
    'mean_arterial_pressure' : lambda: np.random.normal(loc=normal_fluid_data['mean_arterial_pressure'].mean()),
    'diastolic_arterial_pressure' : lambda: np.random.normal(loc=normal_fluid_data['diastolic_arterial_pressure'].mean()),
    'systolic_arterial_pressure' : lambda: np.random.normal(loc=normal_fluid_data['systolic_arterial_pressure'].mean()),
    'spo2' : lambda: np.random.normal(loc=normal_fluid_data['spo2'].mean()),
    'pulse_pressure' : lambda: np.random.normal(loc=normal_fluid_data['pulse_pressure'].mean())
}

for key, value in distributions.items():
    normal_fluid_data[key] = normal_fluid_data[key].apply(lambda x: value() if pd.isna(x) else x)

normal_fluid_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 25 entries, 0 to 4446
Data columns (total 10 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   mean_arterial_pressure       25 non-null     float64       
 1   diastolic_arterial_pressure  25 non-null     float64       
 2   systolic_arterial_pressure   25 non-null     float64       
 3   heart_rate                   25 non-null     float64       
 4   respiratory_rate             25 non-null     float64       
 5   spo2                         25 non-null     float64       
 6   timestamp                    25 non-null     datetime64[ns]
 7   hadm_id                      25 non-null     int64         
 8   label                        25 non-null     object        
 9   pulse_pressure               25 non-null     float64       
dtypes: datetime64[ns](1), float64(7), int64(1), object(1)
memory usage: 2.1+ KB


In [237]:
training_data = pd.concat([final_fluid_input_df, final_fluid_output_df, normal_fluid_data], ignore_index=True)
training_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3744 entries, 0 to 3743
Data columns (total 10 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   respiratory_rate             3744 non-null   float64       
 1   heart_rate                   3744 non-null   float64       
 2   mean_arterial_pressure       3744 non-null   float64       
 3   diastolic_arterial_pressure  3744 non-null   float64       
 4   systolic_arterial_pressure   3744 non-null   float64       
 5   spo2                         3744 non-null   float64       
 6   timestamp                    3744 non-null   datetime64[ns]
 7   hadm_id                      3744 non-null   int64         
 8   label                        3744 non-null   object        
 9   pulse_pressure               3744 non-null   float64       
dtypes: datetime64[ns](1), float64(7), int64(1), object(1)
memory usage: 292.6+ KB


In [238]:
final_training_df = training_data.drop(columns=['hadm_id', 'timestamp']).round(1)
# final_training_df = training_data.round(1)
final_training_df.info()
# final_training_df.to_csv("~/Fluid-Solutions-ML/data/processed/LSTM_FINAL_TRAINING_DATA.csv")
final_training_df.to_csv("~/Fluid-Solutions-ML/data/processed/maybe.csv")
# final_training_df.to_csv("~/Fluid-Solutions-ML/data/processed/maybe_10_min_update.csv")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3744 entries, 0 to 3743
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   respiratory_rate             3744 non-null   float64
 1   heart_rate                   3744 non-null   float64
 2   mean_arterial_pressure       3744 non-null   float64
 3   diastolic_arterial_pressure  3744 non-null   float64
 4   systolic_arterial_pressure   3744 non-null   float64
 5   spo2                         3744 non-null   float64
 6   label                        3744 non-null   object 
 7   pulse_pressure               3744 non-null   float64
dtypes: float64(7), object(1)
memory usage: 234.1+ KB


In [239]:
final_training_df

Unnamed: 0,respiratory_rate,heart_rate,mean_arterial_pressure,diastolic_arterial_pressure,systolic_arterial_pressure,spo2,label,pulse_pressure
0,17.0,73.0,83.0,55.0,131.0,98.0,low,76.0
1,20.0,86.0,89.0,58.0,147.0,98.0,low,89.0
2,16.0,67.0,107.0,84.0,148.0,97.0,low,64.0
3,15.0,51.0,91.0,67.0,134.0,96.0,low,67.0
4,19.0,62.0,116.0,109.0,122.0,95.0,low,13.0
...,...,...,...,...,...,...,...,...
3739,14.0,82.0,101.0,76.0,139.0,96.0,normal,63.0
3740,22.0,90.0,80.0,63.0,107.0,100.0,normal,44.0
3741,16.0,67.0,105.0,66.0,159.0,97.0,normal,93.0
3742,19.0,70.0,84.0,52.0,130.0,95.0,normal,78.0


In [240]:
low_df = final_training_df[final_training_df['label'] == 'low']
average = low_df['diastolic_arterial_pressure'].mean()
median = low_df['diastolic_arterial_pressure'].median()
print(f"Average DAP: {average} \nMedian DAP: {median}")

Average DAP: 58.128851540616246 
Median DAP: 57.0


In [241]:
high_df = final_training_df[final_training_df['label'] == 'high']
average = high_df['diastolic_arterial_pressure'].mean()
median = high_df['diastolic_arterial_pressure'].median()
print(f"Average DAP: {average} \nMedian DAP: {median}")

Average DAP: 59.4279534109817 
Median DAP: 59.0


In [242]:
normal_df = final_training_df[final_training_df['label'] == 'normal']
average = normal_df['diastolic_arterial_pressure'].mean()
median = normal_df['diastolic_arterial_pressure'].median()
print(f"Average DAP: {average} \nMedian DAP: {median}")

Average DAP: 67.28 
Median DAP: 67.0


In [243]:
print(f"Low df count: {len(low_df)} \nHigh df count: {len(high_df)} \nNormal df count: {len(final_training_df)-len(low_df)-len(high_df)}")

Low df count: 714 
High df count: 3005 
Normal df count: 25
