In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
file_path = '/Users/mmurali1/Documents/Imperial/MIMICIV_analysis/inputevents.csv'

# Load the CSV into a DataFrame
input = pd.read_csv(file_path)

# Display the first few rows to confirm it loaded correctly
print(input.head())

   subject_id   stay_id            starttime              endtime  itemid  \
0    10002428  34807493  2156-05-01 08:00:00  2156-05-01 08:01:00  225799   
1    10002428  34807493  2156-05-01 08:00:00  2156-05-01 08:01:00  225975   
2    10002428  34807493  2156-05-01 08:00:00  2156-05-01 08:01:00  226453   
3    10002428  34807493  2156-05-01 10:00:00  2156-05-01 10:01:00  220949   
4    10002428  34807493  2156-05-01 10:00:00  2156-05-01 10:01:00  225884   

   amount amountuom  totalamount totalamountuom  rate rateuom  
0    30.0        mL         30.0             mL   NaN     NaN  
1     1.0      dose          NaN            NaN   NaN     NaN  
2    50.0        mL         50.0             mL   NaN     NaN  
3   100.0        mL        100.0             mL   NaN     NaN  
4     1.0      dose        100.0             mL   NaN     NaN  


In [3]:
input.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3174060 entries, 0 to 3174059
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   subject_id      int64  
 1   stay_id         int64  
 2   starttime       object 
 3   endtime         object 
 4   itemid          int64  
 5   amount          float64
 6   amountuom       object 
 7   totalamount     float64
 8   totalamountuom  object 
 9   rate            float64
 10  rateuom         object 
dtypes: float64(3), int64(3), object(5)
memory usage: 266.4+ MB


In [4]:
# Convert starttime and endttime to an datetime
input['starttime'] = input['starttime'].astype('datetime64[ns]')
input['endtime'] = input['endtime'].astype('datetime64[ns]')
input.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3174060 entries, 0 to 3174059
Data columns (total 11 columns):
 #   Column          Dtype         
---  ------          -----         
 0   subject_id      int64         
 1   stay_id         int64         
 2   starttime       datetime64[ns]
 3   endtime         datetime64[ns]
 4   itemid          int64         
 5   amount          float64       
 6   amountuom       object        
 7   totalamount     float64       
 8   totalamountuom  object        
 9   rate            float64       
 10  rateuom         object        
dtypes: datetime64[ns](2), float64(3), int64(3), object(3)
memory usage: 266.4+ MB


In [5]:
# Count the number of unique subject_ids / patients in this dataframe
print(f'Number of patients: {input["subject_id"].nunique()}')

Number of patients: 5832


In [6]:
# Load the d_items CSV
d_items_path = '/Users/mmurali1/Documents/Imperial/MIMICIV_analysis/mimic-iv-3.1/icu/d_items.csv'
d_items = pd.read_csv(d_items_path)

# Merge the label column into the ventilation dataframe using itemid
input = input.merge(
    d_items[['itemid', 'label']],  # Only keep necessary columns
    on='itemid',
    how='left'  # Keep all rows from ventilation
)

In [7]:
input.head(50)

Unnamed: 0,subject_id,stay_id,starttime,endtime,itemid,amount,amountuom,totalamount,totalamountuom,rate,rateuom,label
0,10002428,34807493,2156-05-01 08:00:00,2156-05-01 08:01:00,225799,30.0,mL,30.0,mL,,,Gastric Meds
1,10002428,34807493,2156-05-01 08:00:00,2156-05-01 08:01:00,225975,1.0,dose,,,,,Heparin Sodium (Prophylaxis)
2,10002428,34807493,2156-05-01 08:00:00,2156-05-01 08:01:00,226453,50.0,mL,50.0,mL,,,GT Flush
3,10002428,34807493,2156-05-01 10:00:00,2156-05-01 10:01:00,220949,100.0,mL,100.0,mL,,,Dextrose 5%
4,10002428,34807493,2156-05-01 10:00:00,2156-05-01 10:01:00,225884,1.0,dose,100.0,mL,,,Metronidazole
5,10002428,34807493,2156-05-01 11:15:00,2156-05-01 11:16:00,225158,500.0,mL,500.0,mL,,,NaCl 0.9%
6,10002428,34807493,2156-05-01 12:00:00,2156-05-01 12:01:00,225798,1.0,dose,,,,,Vancomycin
7,10002428,34807493,2156-05-01 12:00:00,2156-05-01 12:01:00,225799,10.0,mL,10.0,mL,,,Gastric Meds
8,10002428,34807493,2156-05-01 12:00:00,2156-05-01 12:01:00,226453,30.0,mL,30.0,mL,,,GT Flush
9,10002428,34807493,2156-05-01 13:12:00,2156-05-01 13:13:00,220949,50.0,mL,50.0,mL,,,Dextrose 5%


In [8]:
input.shape

(3174060, 12)

In [9]:
# Group by itemid and count nulls in both 'amount' and 'totalamount' columns
null_counts = input.groupby('itemid').agg(
    null_amount=('amount', lambda x: x.isnull().sum()),
    null_totalamount=('totalamount', lambda x: x.isnull().sum())
).reset_index()

null_counts

Unnamed: 0,itemid,null_amount,null_totalamount
0,220862,0,0
1,220864,0,0
2,220949,0,23
3,220950,0,0
4,220952,0,4699
...,...,...,...
306,229760,0,0
307,229764,0,0
308,229781,0,0
309,229861,0,0


In [10]:
# Load the d_items CSV
d_items_path = '/Users/mmurali1/Documents/Imperial/MIMICIV_analysis/mimic-iv-3.1/icu/d_items.csv'
d_items = pd.read_csv(d_items_path)

# Merge the label column with null_counts using itemid
null_counts = null_counts.merge(
    d_items[['itemid', 'label']],  
    on='itemid',
    how='left' 
)

null_counts

Unnamed: 0,itemid,null_amount,null_totalamount,label
0,220862,0,0,Albumin 25%
1,220864,0,0,Albumin 5%
2,220949,0,23,Dextrose 5%
3,220950,0,0,Dextrose 10%
4,220952,0,4699,Dextrose 50%
...,...,...,...,...
306,229760,0,0,Epoprostenol (Veletri)
307,229764,0,0,Angiotensin II (Giapreza)
308,229781,0,0,Bivalirudin (Angiomax) (Impella)
309,229861,0,0,Sodium Acetate.


In [11]:
filtered_null_counts = null_counts[
    (null_counts['null_amount'] > 0) | (null_counts['null_totalamount'] > 0)
]

filtered_null_counts

Unnamed: 0,itemid,null_amount,null_totalamount,label
2,220949,0,23,Dextrose 5%
4,220952,0,4699,Dextrose 50%
10,221282,0,135,Adenosine
14,221347,0,1380,Amiodarone
15,221385,0,12928,Lorazepam (Ativan)
...,...,...,...,...
296,229616,0,185,Ondansetron (Zofran)
297,229617,0,23,Epinephrine.
298,229618,0,27,Calcium Chloride
299,229619,0,3,Insulin - U500


In [12]:
pd.set_option('display.max_rows', None)     # Show all rows

filtered_null_counts

Unnamed: 0,itemid,null_amount,null_totalamount,label
2,220949,0,23,Dextrose 5%
4,220952,0,4699,Dextrose 50%
10,221282,0,135,Adenosine
14,221347,0,1380,Amiodarone
15,221385,0,12928,Lorazepam (Ativan)
16,221393,0,151,Atropine
19,221468,0,1634,Diltiazem
20,221555,0,504,Cisatracurium
21,221623,0,1344,Diazepam (Valium)
24,221668,0,31802,Midazolam (Versed)


In [13]:
null_counts

Unnamed: 0,itemid,null_amount,null_totalamount,label
0,220862,0,0,Albumin 25%
1,220864,0,0,Albumin 5%
2,220949,0,23,Dextrose 5%
3,220950,0,0,Dextrose 10%
4,220952,0,4699,Dextrose 50%
5,220970,0,0,Fresh Frozen Plasma
6,220995,0,0,Sodium Bicarbonate 8.4%
7,221036,0,0,Nutren Renal (Full)
8,221207,0,0,Impact (Full)
9,221261,0,0,Abciximab (Reopro)


In [14]:
input2 = input.copy()

pd.reset_option('display.max_rows')

In [15]:
input2 = input2.dropna(subset=['totalamount'], how='all')

input2.head(10)

Unnamed: 0,subject_id,stay_id,starttime,endtime,itemid,amount,amountuom,totalamount,totalamountuom,rate,rateuom,label
0,10002428,34807493,2156-05-01 08:00:00,2156-05-01 08:01:00,225799,30.0,mL,30.0,mL,,,Gastric Meds
2,10002428,34807493,2156-05-01 08:00:00,2156-05-01 08:01:00,226453,50.0,mL,50.0,mL,,,GT Flush
3,10002428,34807493,2156-05-01 10:00:00,2156-05-01 10:01:00,220949,100.0,mL,100.0,mL,,,Dextrose 5%
4,10002428,34807493,2156-05-01 10:00:00,2156-05-01 10:01:00,225884,1.0,dose,100.0,mL,,,Metronidazole
5,10002428,34807493,2156-05-01 11:15:00,2156-05-01 11:16:00,225158,500.0,mL,500.0,mL,,,NaCl 0.9%
7,10002428,34807493,2156-05-01 12:00:00,2156-05-01 12:01:00,225799,10.0,mL,10.0,mL,,,Gastric Meds
8,10002428,34807493,2156-05-01 12:00:00,2156-05-01 12:01:00,226453,30.0,mL,30.0,mL,,,GT Flush
9,10002428,34807493,2156-05-01 13:12:00,2156-05-01 13:13:00,220949,50.0,mL,50.0,mL,,,Dextrose 5%
10,10002428,34807493,2156-05-01 13:12:00,2156-05-01 13:13:00,225851,1.0,dose,50.0,mL,,,Cefepime
11,10002428,34807493,2156-05-01 13:45:00,2156-05-01 13:46:00,220949,200.0,mL,200.0,mL,,,Dextrose 5%


In [16]:
input2.shape

(2686843, 12)

In [17]:
# Average, minimum and maximum time of each individual in dataframe

# Group by subject_id and calculate duration per subject
duration_per_subject = input2.groupby('subject_id')['starttime'].agg(['min', 'max'])
duration_per_subject['duration_hours'] = (
    (duration_per_subject['max'] - duration_per_subject['min']).dt.total_seconds() / 3600
)

# Summary statistics
average_duration = duration_per_subject['duration_hours'].mean()
min_duration = duration_per_subject['duration_hours'].min()
max_duration = duration_per_subject['duration_hours'].max()

print(f"🟢 Average time span per subject: {average_duration:.2f} hours")
print(f"🔵 Shortest time span: {min_duration:.2f} hours")
print(f"🔴 Longest time span: {max_duration:.2f} hours")

🟢 Average time span per subject: 6389.53 hours
🔵 Shortest time span: 10.77 hours
🔴 Longest time span: 112126.53 hours


In [18]:
# Group by itemid
grouped = input2.groupby('itemid')

# Define the frequency function
def avg_obs_per_20_hours(group):
    n_obs = len(group)
    time_span_hours = (group['starttime'].max() - group['starttime'].min()).total_seconds() / 3600
    return (n_obs / time_span_hours * 20) if time_span_hours > 0 else float('nan')

# Apply the function to the cleaned dataframe
avg_freq_df = grouped.apply(avg_obs_per_20_hours).reset_index(name='avg_obs_per_20_hours')

avg_freq_df

  avg_freq_df = grouped.apply(avg_obs_per_20_hours).reset_index(name='avg_obs_per_20_hours')


Unnamed: 0,itemid,avg_obs_per_20_hours
0,220862,0.229405
1,220864,0.178425
2,220949,8.206738
3,220950,0.038730
4,220970,0.160001
...,...,...
263,229760,0.000502
264,229764,0.001519
265,229781,0.001145
266,229861,


In [19]:
# Load the d_items CSV
d_items_path = '/Users/mmurali1/Documents/Imperial/MIMICIV_analysis/mimic-iv-3.1/icu/d_items.csv'
d_items = pd.read_csv(d_items_path)

# Merge the label column with null_counts using itemid
avg_freq_df = avg_freq_df.merge(
    d_items[['itemid', 'label']],  
    on='itemid',
    how='left' 
)

avg_freq_df

Unnamed: 0,itemid,avg_obs_per_20_hours,label
0,220862,0.229405,Albumin 25%
1,220864,0.178425,Albumin 5%
2,220949,8.206738,Dextrose 5%
3,220950,0.038730,Dextrose 10%
4,220970,0.160001,Fresh Frozen Plasma
...,...,...,...
263,229760,0.000502,Epoprostenol (Veletri)
264,229764,0.001519,Angiotensin II (Giapreza)
265,229781,0.001145,Bivalirudin (Angiomax) (Impella)
266,229861,,Sodium Acetate.


In [20]:
# Dropping features that are measured < 0.1 times per 20-hour period

# Identify low-frequency itemids
low_freq_itemids = avg_freq_df[avg_freq_df['avg_obs_per_20_hours'] < 0.1]['itemid'].tolist()

# Drop them from input2
input3 = input2[~input2['itemid'].isin(low_freq_itemids)].copy()

In [21]:
input3.head()

Unnamed: 0,subject_id,stay_id,starttime,endtime,itemid,amount,amountuom,totalamount,totalamountuom,rate,rateuom,label
0,10002428,34807493,2156-05-01 08:00:00,2156-05-01 08:01:00,225799,30.0,mL,30.0,mL,,,Gastric Meds
2,10002428,34807493,2156-05-01 08:00:00,2156-05-01 08:01:00,226453,50.0,mL,50.0,mL,,,GT Flush
3,10002428,34807493,2156-05-01 10:00:00,2156-05-01 10:01:00,220949,100.0,mL,100.0,mL,,,Dextrose 5%
4,10002428,34807493,2156-05-01 10:00:00,2156-05-01 10:01:00,225884,1.0,dose,100.0,mL,,,Metronidazole
5,10002428,34807493,2156-05-01 11:15:00,2156-05-01 11:16:00,225158,500.0,mL,500.0,mL,,,NaCl 0.9%


In [22]:
input3.shape

(2505890, 12)

In [23]:
input3 = input3[['subject_id', 'stay_id', 'starttime', 'itemid', 'totalamount', 'totalamountuom', 'label']]

input3.head()

Unnamed: 0,subject_id,stay_id,starttime,itemid,totalamount,totalamountuom,label
0,10002428,34807493,2156-05-01 08:00:00,225799,30.0,mL,Gastric Meds
2,10002428,34807493,2156-05-01 08:00:00,226453,50.0,mL,GT Flush
3,10002428,34807493,2156-05-01 10:00:00,220949,100.0,mL,Dextrose 5%
4,10002428,34807493,2156-05-01 10:00:00,225884,100.0,mL,Metronidazole
5,10002428,34807493,2156-05-01 11:15:00,225158,500.0,mL,NaCl 0.9%


In [24]:
# Check for outliers in the dataframe

summary_stats = input3.groupby('itemid')['totalamount'].agg(
    min_value='min',
    max_value='max',
    median_value='median'
).reset_index()

# Load the d_items CSV
d_items_path = '/Users/mmurali1/Documents/Imperial/MIMICIV_analysis/mimic-iv-3.1/icu/d_items.csv'
d_items = pd.read_csv(d_items_path)

# Merge the label column with null_counts using itemid
summary_stats = summary_stats.merge(
    d_items[['itemid', 'label']],  
    on='itemid',
    how='left' 
)

print(summary_stats)

    itemid  min_value  max_value  median_value                       label
0   220862     10.000     1000.0          50.0                 Albumin 25%
1   220864     41.667     1000.0         250.0                  Albumin 5%
2   220949      0.500    10000.0         200.0                 Dextrose 5%
3   220970     30.000     3266.0         298.0         Fresh Frozen Plasma
4   221289     50.000      250.0         250.0                 Epinephrine
..     ...        ...        ...           ...                         ...
73  228364   1000.000     1000.0        1000.0            Two Cal HN (1/2)
74  229011     40.000     9000.0        1000.0           Jevity 1.5 (Full)
75  229420     30.000      250.0         100.0  Dexmedetomidine (Precedex)
76  229630    250.000      250.0         250.0      Phenylephrine (50/250)
77  229861     50.000       50.0          50.0             Sodium Acetate.

[78 rows x 5 columns]


In [25]:
# Subsetting data to identify reference ranges

input3_subset = input3[input3['itemid'] == 229011]
input3_subset

Unnamed: 0,subject_id,stay_id,starttime,itemid,totalamount,totalamountuom,label
6952,10020740,35889503,2150-03-26 20:00:00,229011,1000.0,mL,Jevity 1.5 (Full)
6966,10020740,35889503,2150-03-27 22:18:00,229011,1000.0,mL,Jevity 1.5 (Full)
6968,10020740,35889503,2150-03-28 02:16:00,229011,1000.0,mL,Jevity 1.5 (Full)
6972,10020740,35889503,2150-03-28 05:45:00,229011,1000.0,mL,Jevity 1.5 (Full)
6981,10020740,35889503,2150-03-26 16:23:00,229011,1000.0,mL,Jevity 1.5 (Full)
...,...,...,...,...,...,...,...
3173945,19998843,30988867,2187-02-07 04:38:00,229011,1000.0,mL,Jevity 1.5 (Full)
3173969,19998843,30988867,2187-02-07 10:00:00,229011,1000.0,mL,Jevity 1.5 (Full)
3173978,19998843,30988867,2187-02-07 18:05:00,229011,1000.0,mL,Jevity 1.5 (Full)
3174037,19998843,30988867,2187-02-06 12:28:00,229011,1000.0,mL,Jevity 1.5 (Full)


In [26]:
# Copy the original DataFrame
input4 = input3.copy()

# Define the reference ranges
valid_ranges = {
    220949: (0, 1000),
    220970: (0, 500),
    221906: (0, 300),
    222011: (0, 250),
    222042: (0, 300),
    222168: (0, 200),
    223258: (0, 2000),
    225152: (0, 500),
    225158: (0, 1000),
    225166: (0, 1000),
    225168: (0, 500),
    225797: (0, 500),
    225798: (0, 500),
    225799: (0, 500),
    225823: (0, 1000),
    225828: (0, 1000),
    225936: (0, 1000),
    225943: (0, 1000),
    225944: (0, 1000),
    225970: (0, 2000),
    226089: (0, 1000),
    226452: (0, 500),
    226453: (0, 500),
    226880: (0, 1000),
    227526: (0, 1000),
    227529: (0, 1000),
    228351: (0, 2000),
    229011: (0, 1000),
}

# Separate into rows with itemids in valid_ranges and the rest
to_filter = input4[input4['itemid'].isin(valid_ranges.keys())].copy()
to_keep = input4[~input4['itemid'].isin(valid_ranges.keys())].copy()

# Apply filtering to to_filter
def in_valid_range(row):
    low, high = valid_ranges[row['itemid']]
    return low <= row['totalamount'] <= high

filtered = to_filter[to_filter.apply(in_valid_range, axis=1)]

# Concatenate the filtered and unfiltered subsets
input4 = pd.concat([filtered, to_keep], ignore_index=True)

In [27]:
input4.head()

Unnamed: 0,subject_id,stay_id,starttime,itemid,totalamount,totalamountuom,label
0,10002428,34807493,2156-05-01 08:00:00,225799,30.0,mL,Gastric Meds
1,10002428,34807493,2156-05-01 08:00:00,226453,50.0,mL,GT Flush
2,10002428,34807493,2156-05-01 10:00:00,220949,100.0,mL,Dextrose 5%
3,10002428,34807493,2156-05-01 11:15:00,225158,500.0,mL,NaCl 0.9%
4,10002428,34807493,2156-05-01 12:00:00,225799,10.0,mL,Gastric Meds


In [28]:
input4.shape

(2500032, 7)

In [29]:
# Check for outliers in the dataframe

summary_stats = input4.groupby('itemid')['totalamount'].agg(
    min_value='min',
    max_value='max',
    median_value='median'
).reset_index()

# Load the d_items CSV
d_items_path = '/Users/mmurali1/Documents/Imperial/MIMICIV_analysis/mimic-iv-3.1/icu/d_items.csv'
d_items = pd.read_csv(d_items_path)

# Merge the label column with null_counts using itemid
summary_stats = summary_stats.merge(
    d_items[['itemid', 'label']],  
    on='itemid',
    how='left' 
)

print(summary_stats)

    itemid  min_value  max_value  median_value                       label
0   220862     10.000     1000.0          50.0                 Albumin 25%
1   220864     41.667     1000.0         250.0                  Albumin 5%
2   220949      0.500     1000.0         200.0                 Dextrose 5%
3   220970     30.000      500.0         298.0         Fresh Frozen Plasma
4   221289     50.000      250.0         250.0                 Epinephrine
..     ...        ...        ...           ...                         ...
73  228364   1000.000     1000.0        1000.0            Two Cal HN (1/2)
74  229011     40.000     1000.0        1000.0           Jevity 1.5 (Full)
75  229420     30.000      250.0         100.0  Dexmedetomidine (Precedex)
76  229630    250.000      250.0         250.0      Phenylephrine (50/250)
77  229861     50.000       50.0          50.0             Sodium Acetate.

[78 rows x 5 columns]


In [30]:
input5 = input4.copy()

In [31]:
# Create dataframe which sums 24h input

# Make a copy of the dataframe
input6 = input5.copy()

# Extract the date part for grouping
input6['start_date'] = input6['starttime'].dt.date

# Group by subject_id, stay_id, and start_date
input6 = (
    input6
    .groupby(['subject_id', 'stay_id', 'start_date'])
    .agg(
        starttime=('starttime', 'min'),         # earliest time in the day
        totalamount=('totalamount', 'sum')      # total input in 24h
    )
    .reset_index()
    .rename(columns={'totalamount': '24h_input'})
)

input6.head()

Unnamed: 0,subject_id,stay_id,start_date,starttime,24h_input
0,10002428,33987268,2156-04-12,2156-04-12 17:30:00,2900.0
1,10002428,33987268,2156-04-13,2156-04-13 00:00:00,6440.0
2,10002428,33987268,2156-04-14,2156-04-14 03:04:00,6150.0
3,10002428,33987268,2156-04-15,2156-04-15 00:10:00,2040.0
4,10002428,33987268,2156-04-16,2156-04-16 05:13:00,800.0


In [32]:
input6.shape

(82296, 5)

In [33]:
file_path = '/Users/mmurali1/Documents/Imperial/MIMICIV_analysis/outputevents.csv'

# Load the CSV into a DataFrame
output = pd.read_csv(file_path)

# Display the first few rows to confirm it loaded correctly
print(output.head())

   subject_id   stay_id  itemid            charttime  value valueuom
0    10002428  34807493  226559  2156-05-01 07:00:00   35.0       mL
1    10002428  34807493  226559  2156-05-01 08:00:00   35.0       mL
2    10002428  34807493  226559  2156-05-01 09:00:00   35.0       mL
3    10002428  34807493  226559  2156-05-01 10:00:00   35.0       mL
4    10002428  34807493  226559  2156-05-01 11:00:00   35.0       mL


In [34]:
output.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1379313 entries, 0 to 1379312
Data columns (total 6 columns):
 #   Column      Non-Null Count    Dtype  
---  ------      --------------    -----  
 0   subject_id  1379313 non-null  int64  
 1   stay_id     1379313 non-null  int64  
 2   itemid      1379313 non-null  int64  
 3   charttime   1379313 non-null  object 
 4   value       1379313 non-null  float64
 5   valueuom    1379313 non-null  object 
dtypes: float64(1), int64(3), object(2)
memory usage: 63.1+ MB


In [35]:
# Convert charttime to an datetime
output['charttime'] = output['charttime'].astype('datetime64[ns]')
output.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1379313 entries, 0 to 1379312
Data columns (total 6 columns):
 #   Column      Non-Null Count    Dtype         
---  ------      --------------    -----         
 0   subject_id  1379313 non-null  int64         
 1   stay_id     1379313 non-null  int64         
 2   itemid      1379313 non-null  int64         
 3   charttime   1379313 non-null  datetime64[ns]
 4   value       1379313 non-null  float64       
 5   valueuom    1379313 non-null  object        
dtypes: datetime64[ns](1), float64(1), int64(3), object(1)
memory usage: 63.1+ MB


In [36]:
# Count the number of unique subject_ids / patients in this dataframe
print(f'Number of patients: {output["subject_id"].nunique()}')

Number of patients: 5825


In [37]:
# Load the d_items CSV
d_items_path = '/Users/mmurali1/Documents/Imperial/MIMICIV_analysis/mimic-iv-3.1/icu/d_items.csv'
d_items = pd.read_csv(d_items_path)

# Merge the label column into the ventilation dataframe using itemid
output = output.merge(
    d_items[['itemid', 'label']],  # Only keep necessary columns
    on='itemid',
    how='left'  # Keep all rows from ventilation
)

In [38]:
output.head(10)

Unnamed: 0,subject_id,stay_id,itemid,charttime,value,valueuom,label
0,10002428,34807493,226559,2156-05-01 07:00:00,35.0,mL,Foley
1,10002428,34807493,226559,2156-05-01 08:00:00,35.0,mL,Foley
2,10002428,34807493,226559,2156-05-01 09:00:00,35.0,mL,Foley
3,10002428,34807493,226559,2156-05-01 10:00:00,35.0,mL,Foley
4,10002428,34807493,226559,2156-05-01 11:00:00,35.0,mL,Foley
5,10002428,34807493,226559,2156-05-01 12:00:00,15.0,mL,Foley
6,10002428,34807493,226559,2156-05-01 13:00:00,20.0,mL,Foley
7,10002428,34807493,226559,2156-05-01 14:00:00,20.0,mL,Foley
8,10002428,34807493,226559,2156-05-01 15:00:00,30.0,mL,Foley
9,10002428,34807493,226559,2156-05-01 16:00:00,30.0,mL,Foley


In [39]:
# Group by itemid and count nulls in 'value' column
null_counts = output.groupby('itemid').agg(
    null_value=('value', lambda x: x.isnull().sum()),
).reset_index()

In [40]:
null_counts

Unnamed: 0,itemid,null_value
0,226557,0
1,226558,0
2,226559,0
3,226560,0
4,226561,0
...,...,...
66,227510,0
67,227511,0
68,227701,0
69,229413,0


In [41]:
# Load the d_items CSV
d_items_path = '/Users/mmurali1/Documents/Imperial/MIMICIV_analysis/mimic-iv-3.1/icu/d_items.csv'
d_items = pd.read_csv(d_items_path)

# Merge the label column with null_counts using itemid
null_counts = null_counts.merge(
    d_items[['itemid', 'label']],  
    on='itemid',
    how='left' 
)

null_counts

Unnamed: 0,itemid,null_value,label
0,226557,0,R Ureteral Stent
1,226558,0,L Ureteral Stent
2,226559,0,Foley
3,226560,0,Void
4,226561,0,Condom Cath
...,...,...,...
66,227510,0,TF Residual
67,227511,0,TF Residual Output
68,227701,0,Drainage Bag
69,229413,0,Chest Tube #3


In [42]:
filtered_null_counts = null_counts[
    (null_counts['null_value'] > 0)]

filtered_null_counts

Unnamed: 0,itemid,null_value,label


In [43]:
pd.set_option('display.max_rows', None)     # Show all rows

null_counts

Unnamed: 0,itemid,null_value,label
0,226557,0,R Ureteral Stent
1,226558,0,L Ureteral Stent
2,226559,0,Foley
3,226560,0,Void
4,226561,0,Condom Cath
5,226563,0,Suprapubic
6,226564,0,R Nephrostomy
7,226565,0,L Nephrostomy
8,226567,0,Straight Cath
9,226569,0,Anderson (gastric)


In [44]:
output2 = output.copy()

pd.reset_option('display.max_rows')

In [45]:
# Group by itemid
grouped = output2.groupby('itemid')

# Define the frequency function
def avg_obs_per_20_hours(group):
    n_obs = len(group)
    time_span_hours = (group['charttime'].max() - group['charttime'].min()).total_seconds() / 3600
    return (n_obs / time_span_hours * 20) if time_span_hours > 0 else float('nan')

# Apply the function to the cleaned dataframe
avg_freq_df = grouped.apply(avg_obs_per_20_hours).reset_index(name='avg_obs_per_20_hours')

avg_freq_df

  avg_freq_df = grouped.apply(avg_obs_per_20_hours).reset_index(name='avg_obs_per_20_hours')


Unnamed: 0,itemid,avg_obs_per_20_hours
0,226557,0.008882
1,226558,0.002945
2,226559,23.324210
3,226560,0.702340
4,226561,0.192176
...,...,...
66,227510,0.853695
67,227511,0.088838
68,227701,0.140177
69,229413,0.010207


In [46]:
# Load the d_items CSV
d_items_path = '/Users/mmurali1/Documents/Imperial/MIMICIV_analysis/mimic-iv-3.1/icu/d_items.csv'
d_items = pd.read_csv(d_items_path)

# Merge the label column with null_counts using itemid
avg_freq_df = avg_freq_df.merge(
    d_items[['itemid', 'label']],  
    on='itemid',
    how='left' 
)

avg_freq_df

Unnamed: 0,itemid,avg_obs_per_20_hours,label
0,226557,0.008882,R Ureteral Stent
1,226558,0.002945,L Ureteral Stent
2,226559,23.324210,Foley
3,226560,0.702340,Void
4,226561,0.192176,Condom Cath
...,...,...,...
66,227510,0.853695,TF Residual
67,227511,0.088838,TF Residual Output
68,227701,0.140177,Drainage Bag
69,229413,0.010207,Chest Tube #3


In [47]:
# Dropping features that are measured < 0.1 times per 20-hour period

# Identify low-frequency itemids
low_freq_itemids = avg_freq_df[avg_freq_df['avg_obs_per_20_hours'] < 0.1]['itemid'].tolist()

# Drop them from output2
output3 = output2[~output2['itemid'].isin(low_freq_itemids)].copy()

output3.shape

(1315294, 7)

In [48]:
output3 = output3[['subject_id', 'stay_id', 'itemid', 'charttime', 'value', 'label']]

output3.head()

Unnamed: 0,subject_id,stay_id,itemid,charttime,value,label
0,10002428,34807493,226559,2156-05-01 07:00:00,35.0,Foley
1,10002428,34807493,226559,2156-05-01 08:00:00,35.0,Foley
2,10002428,34807493,226559,2156-05-01 09:00:00,35.0,Foley
3,10002428,34807493,226559,2156-05-01 10:00:00,35.0,Foley
4,10002428,34807493,226559,2156-05-01 11:00:00,35.0,Foley


In [49]:
# Check for outliers in the dataframe

summary_stats = output3.groupby('itemid')['value'].agg(
    min_value='min',
    max_value='max',
    median_value='median'
).reset_index()

# Load the d_items CSV
d_items_path = '/Users/mmurali1/Documents/Imperial/MIMICIV_analysis/mimic-iv-3.1/icu/d_items.csv'
d_items = pd.read_csv(d_items_path)

# Merge the label column with null_counts using itemid
summary_stats = summary_stats.merge(
    d_items[['itemid', 'label']],  
    on='itemid',
    how='left' 
)

print(summary_stats)

    itemid  min_value  max_value  median_value                    label
0   226559        0.0    25025.0          80.0                    Foley
1   226560        0.0     2600.0         250.0                     Void
2   226561        0.0     1900.0         175.0              Condom Cath
3   226572        0.0      700.0         250.0                    Ewald
4   226575        0.0    15000.0         100.0              Nasogastric
5   226576        0.0     3000.0         100.0             Oral Gastric
6   226579       -1.0     4000.0         200.0                    Stool
7   226580      -45.0     2000.0         300.0                Fecal Bag
8   226582        0.0     2000.0         150.0          Ostomy (output)
9   226588      -30.0     3000.0          25.0            Chest Tube #1
10  226589      -30.0     2500.0          20.0            Chest Tube #2
11  226590      -30.0     2160.0          20.0             L Pleural #1
12  226597        0.0     2300.0          50.0                JP

In [50]:
# Copy the original DataFrame
output4 = output3.copy()

# Define the reference ranges
valid_ranges = {
    226559: (0, 2000),
    226560: (0, 2000),
    226561: (0, 2000),
    226575: (0, 2000),
    226576: (0, 2000),
    226579: (0, 2000),
    226580: (0, 2000),
    226582: (0, 2000),
    226588: (0, 2000),
    226589: (0, 2000),
    226590: (0, 2000),
    226597: (0, 2000),
    226613: (0, 500),
    227510: (0, 500),
    227701: (0, 500),
}

# Separate into rows with itemids in valid_ranges and the rest
to_filter = output4[output4['itemid'].isin(valid_ranges.keys())].copy()
to_keep = output4[~output4['itemid'].isin(valid_ranges.keys())].copy()

# Apply filtering to to_filter
def in_valid_range(row):
    low, high = valid_ranges[row['itemid']]
    return low <= row['value'] <= high

filtered = to_filter[to_filter.apply(in_valid_range, axis=1)]

# Concatenate the filtered and unfiltered subsets
output4 = pd.concat([filtered, to_keep], ignore_index=True)

In [51]:
output4.head()

Unnamed: 0,subject_id,stay_id,itemid,charttime,value,label
0,10002428,34807493,226559,2156-05-01 07:00:00,35.0,Foley
1,10002428,34807493,226559,2156-05-01 08:00:00,35.0,Foley
2,10002428,34807493,226559,2156-05-01 09:00:00,35.0,Foley
3,10002428,34807493,226559,2156-05-01 10:00:00,35.0,Foley
4,10002428,34807493,226559,2156-05-01 11:00:00,35.0,Foley


In [52]:
output4.shape

(1314752, 6)

In [53]:
# Check for outliers in the dataframe

summary_stats = output4.groupby('itemid')['value'].agg(
    min_value='min',
    max_value='max',
    median_value='median'
).reset_index()

# Load the d_items CSV
d_items_path = '/Users/mmurali1/Documents/Imperial/MIMICIV_analysis/mimic-iv-3.1/icu/d_items.csv'
d_items = pd.read_csv(d_items_path)

# Merge the label column with null_counts using itemid
summary_stats = summary_stats.merge(
    d_items[['itemid', 'label']],  
    on='itemid',
    how='left' 
)

print(summary_stats)

    itemid  min_value  max_value  median_value                    label
0   226559        0.0     2000.0          80.0                    Foley
1   226560        0.0     2000.0         250.0                     Void
2   226561        0.0     1900.0         175.0              Condom Cath
3   226572        0.0      700.0         250.0                    Ewald
4   226575        0.0     2000.0         100.0              Nasogastric
5   226576        0.0     2000.0         100.0             Oral Gastric
6   226579        0.0     2000.0         200.0                    Stool
7   226580        0.0     2000.0         300.0                Fecal Bag
8   226582        0.0     2000.0         150.0          Ostomy (output)
9   226588        0.0     2000.0          25.0            Chest Tube #1
10  226589        0.0     2000.0          20.0            Chest Tube #2
11  226590        0.0     1900.0          20.0             L Pleural #1
12  226597        0.0     2000.0          50.0                JP

In [54]:
output5 = output4.copy()

In [55]:
output5.head()

Unnamed: 0,subject_id,stay_id,itemid,charttime,value,label
0,10002428,34807493,226559,2156-05-01 07:00:00,35.0,Foley
1,10002428,34807493,226559,2156-05-01 08:00:00,35.0,Foley
2,10002428,34807493,226559,2156-05-01 09:00:00,35.0,Foley
3,10002428,34807493,226559,2156-05-01 10:00:00,35.0,Foley
4,10002428,34807493,226559,2156-05-01 11:00:00,35.0,Foley


In [56]:
# Create dataframe which sums 24h output

# Make a copy of the dataframe
output6 = output5.copy()

# Extract the date part for grouping
output6['start_date'] = output6['charttime'].dt.date

# Group by subject_id, stay_id, and start_date
output6 = (
    output6
    .groupby(['subject_id', 'stay_id', 'start_date'])
    .agg(
        charttime=('charttime', 'min'),   # earliest time in the day
        value=('value', 'sum')            # total output in 24h
    )
    .reset_index()
    .rename(columns={'value': '24h_output'})
)

output6.head()

Unnamed: 0,subject_id,stay_id,start_date,charttime,24h_output
0,10002428,33987268,2156-04-12,2156-04-12 18:05:00,355.0
1,10002428,33987268,2156-04-13,2156-04-13 00:00:00,560.0
2,10002428,33987268,2156-04-14,2156-04-14 00:00:00,437.0
3,10002428,33987268,2156-04-15,2156-04-15 00:00:00,507.0
4,10002428,33987268,2156-04-16,2156-04-16 00:00:00,730.0


In [57]:
# Merge input6 and output6 on subject_id, stay_id, and start_date
fb_df = pd.merge(
    input6[['subject_id', 'stay_id', 'start_date', '24h_input']],
    output6[['subject_id', 'stay_id', 'start_date', '24h_output']],
    on=['subject_id', 'stay_id', 'start_date'],
    how='outer'
)

# Compute fluid balance: 24_input - 24_output
fb_df['24h_fb'] = fb_df['24h_input'].fillna(0) - fb_df['24h_output'].fillna(0)

fb_df.head()

Unnamed: 0,subject_id,stay_id,start_date,24h_input,24h_output,24h_fb
0,10002428,33987268,2156-04-12,2900.0,355.0,2545.0
1,10002428,33987268,2156-04-13,6440.0,560.0,5880.0
2,10002428,33987268,2156-04-14,6150.0,437.0,5713.0
3,10002428,33987268,2156-04-15,2040.0,507.0,1533.0
4,10002428,33987268,2156-04-16,800.0,730.0,70.0


In [59]:
# Add new columns with constant values
fb_df['itemid'] = 123456
fb_df['label'] = 'Fluid balance'

# Rename columns
fb_df = fb_df.rename(columns={
    '24h_fb': 'valuenum',
    'start_date': 'charttime'
})

# Drop unnecessary columns
fb_df = fb_df.drop(columns=['24h_input', '24h_output'])

In [60]:
file_path = '/Users/mmurali1/Documents/Imperial/MIMICIV_analysis/postwean.csv'

# Load the CSV into a DataFrame
outcomes = pd.read_csv(file_path)

outcomes = outcomes[['subject_id', 'stay_id', 'ventilation_time', 'ext_success']]
outcomes = outcomes.drop_duplicates()
fb_df = pd.merge(
    fb_df,
    outcomes[['subject_id', 'ext_success']],
    on=['subject_id'],
    how='inner'  # 
)

In [61]:
fb_df.head()

Unnamed: 0,subject_id,stay_id,charttime,valuenum,itemid,label,ext_success
0,10002428,33987268,2156-04-12,2545.0,123456,Fluid balance,1
1,10002428,33987268,2156-04-13,5880.0,123456,Fluid balance,1
2,10002428,33987268,2156-04-14,5713.0,123456,Fluid balance,1
3,10002428,33987268,2156-04-15,1533.0,123456,Fluid balance,1
4,10002428,33987268,2156-04-16,70.0,123456,Fluid balance,1


In [64]:
import os

fb_df.to_csv("/Users/mmurali1/Documents/Imperial/MIMICIV_analysis/fb_cleaned_combi.csv", index=False)

In [63]:
fb_df.shape

(99234, 7)

In [62]:
fb_df.isnull().sum()

subject_id     0
stay_id        0
charttime      0
valuenum       0
itemid         0
label          0
ext_success    0
dtype: int64

In [82]:
# Normalise each value in the 24h_fb column using MinMaxScalar

from sklearn.preprocessing import MinMaxScaler

# Apply MinMaxScaler to 24h_fb
scaler = MinMaxScaler()
fb_df["24hfb_normalized"] = scaler.fit_transform(fb_df[["24h_fb"]])

In [83]:
fb_df.head(50)

Unnamed: 0,subject_id,stay_id,start_date,24h_input,24h_output,24h_fb,24hfb_normalized
0,10002428,33987268,2156-04-12,2900.0,355.0,2545.0,0.205418
1,10002428,33987268,2156-04-13,6440.0,560.0,5880.0,0.252469
2,10002428,33987268,2156-04-14,6150.0,437.0,5713.0,0.250113
3,10002428,33987268,2156-04-15,2040.0,507.0,1533.0,0.19114
4,10002428,33987268,2156-04-16,800.0,730.0,70.0,0.170499
5,10002428,33987268,2156-04-17,400.0,990.0,-590.0,0.161188
6,10002428,34807493,2156-04-30,1000.0,,1000.0,0.18362
7,10002428,34807493,2156-05-01,2420.0,1138.0,1282.0,0.187599
8,10002428,34807493,2156-05-02,1350.0,1260.0,90.0,0.170782
9,10002428,35479615,2156-05-11,3960.0,625.0,3335.0,0.216563


In [84]:
import os

fb_df.to_csv("/Users/mmurali1/Documents/Imperial/MIMICIV_analysis/fb_cleaned.csv", index=False)