# Development of machine learning models to process Electronic Health Records – Explainable Models

### Preprocessing Notebook
Lok Hang Toby Lee (2431180L)

# Formating
----------------------------------------------------------

In [1]:
# Imports:
import numpy as np
import pandas as pd
import sys
import matplotlib.pyplot as plt
from matplotlib import cm
import matplotlib.colors as mc
import colorsys
import json

# SET YOUR PATH FOR RESOURCES FILE HERE
resources_path = "C:/Users/USER/OneDrive/GU/Year 4/FYP\MIMIC-III-ML/resources"
data_path = "C:/Users/USER/OneDrive/GU/Year 4/FYP\MIMIC-III-ML/data/raw"

In [2]:
events_data = pd.read_hdf(data_path + 'vitals_hourly_data.h5', 'X')
events_data = events_data.reset_index()
print('events_data (X): ', events_data.shape)

patients_data = pd.read_hdf(data_path + 'vitals_hourly_data.h5', 'patients_data')
print('patients_data: ', patients_data.shape)

outcomes = pd.read_hdf(data_path + 'vitals_hourly_data.h5', 'Y')
print('outcomes (Y): ', outcomes.shape)

# Load the config file that contains information about continuous/categorical variables:
config = json.load(open(resources_path + 'discretizer_config.json', 'r'))
is_categorical = config['is_categorical_channel']

# Get categorical variables:
categorical_var = []
continuous_var = []
for key, value in is_categorical.items():
    if value:
        categorical_var.append(key)
    else:
        continuous_var.append(key)
print('Categorical: ', categorical_var[1:])
print('Continuous: ', continuous_var)

categorical_var = categorical_var[1:]

events_data (X):  (20567026, 14)
patients_data:  (30063, 25)
outcomes (Y):  (30063, 2)
Categorical:  ['Glascow coma scale eye opening', 'Glascow coma scale motor response', 'Glascow coma scale total', 'Glascow coma scale verbal response']
Continuous:  ['Diastolic blood pressure', 'Fraction inspired oxygen', 'Glucose', 'Heart Rate', 'Height', 'Mean blood pressure', 'Oxygen saturation', 'Respiratory rate', 'Systolic blood pressure', 'Temperature', 'Weight', 'pH']


In [3]:
events_data

Unnamed: 0,icustay_id,itemid,label,LEVEL1,LEVEL2,subject_id,hadm_id,value,valueuom,hours_in,dbsource,linksto,category,unitname
0,200003,646,SpO2,Pulse oximetry,Oxygen saturation,27513,163557,95.000000,%,2,carevue,chartevents,,
1,200003,677,Temperature C (calc),Temperature (C),Temperature,27513,163557,38.277802,Deg. C,2,carevue,chartevents,,
2,200003,678,Temperature F,Temperature (F),Temperature,27513,163557,100.900002,Deg. F,2,carevue,chartevents,,
3,200003,646,SpO2,Pulse oximetry,Oxygen saturation,27513,163557,94.000000,%,2,carevue,chartevents,,
4,200003,646,SpO2,Pulse oximetry,Oxygen saturation,27513,163557,92.000000,%,3,carevue,chartevents,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20567021,299999,50809,,Glucose,Glucose,7630,129161,159.000000,mg/dL,16,,,,
20567022,299999,50817,,Oxygen saturation,Oxygen saturation,7630,129161,98.000000,%,16,,,,
20567023,299999,50820,,pH,pH,7630,129161,7.380000,units,16,,,,
20567024,299999,50809,,Glucose,Glucose,7630,129161,103.000000,mg/dL,22,,,,


### Pre-processing steps

In [4]:
# Map variables to the same metric:
UNIT_CONVERSIONS = [
    ('weight',                   'oz',  None,             lambda x: x/16.*0.45359237),
    ('weight',                   'lbs', None,             lambda x: x*0.45359237),
    ('fraction inspired oxygen', None,  lambda x: x > 1,  lambda x: x/100.),
    ('oxygen saturation',        None,  lambda x: x <= 1, lambda x: x*100.),
    ('temperature',              'f',   lambda x: x > 79, lambda x: (x - 32) * 5./9),
    ('height',                   'in',  None,             lambda x: x*2.54),
]

variable_names = events_data['LEVEL1'].str
variable_units = events_data['valueuom'].str
for name, unit, check, convert_function in UNIT_CONVERSIONS:
    print(name, unit, check, convert_function)
    indices_variable = variable_names.contains(name, case=False, na=False)
    needs_conversion_filter_indices = indices_variable & False
    if unit is not None:
        print('found')
        needs_conversion_filter_indices |= variable_names.contains(unit, case=False, na=False) | variable_units.contains(unit, case=False, na=False)
    if check is not None:
        print('changed')
        needs_conversion_filter_indices |= check(events_data['value'])
    idx = indices_variable & needs_conversion_filter_indices
    events_data.loc[idx, 'value'] = convert_function(events_data['value'][idx])

weight oz None <function <lambda> at 0x00000220D39CE5E8>
found
weight lbs None <function <lambda> at 0x00000220D39CE678>
found
fraction inspired oxygen None <function <lambda> at 0x00000220D39CE708> <function <lambda> at 0x00000220D39CE798>
changed
oxygen saturation None <function <lambda> at 0x00000220D39CE828> <function <lambda> at 0x00000220D39CE8B8>
changed
temperature f <function <lambda> at 0x00000220D39CE948> <function <lambda> at 0x00000220D39CE9D8>
found
changed
height in None <function <lambda> at 0x00000220D39CEA68>
found


In [5]:
events_data

Unnamed: 0,icustay_id,itemid,label,LEVEL1,LEVEL2,subject_id,hadm_id,value,valueuom,hours_in,dbsource,linksto,category,unitname
0,200003,646,SpO2,Pulse oximetry,Oxygen saturation,27513,163557,95.000000,%,2,carevue,chartevents,,
1,200003,677,Temperature C (calc),Temperature (C),Temperature,27513,163557,38.277802,Deg. C,2,carevue,chartevents,,
2,200003,678,Temperature F,Temperature (F),Temperature,27513,163557,38.277779,Deg. F,2,carevue,chartevents,,
3,200003,646,SpO2,Pulse oximetry,Oxygen saturation,27513,163557,94.000000,%,2,carevue,chartevents,,
4,200003,646,SpO2,Pulse oximetry,Oxygen saturation,27513,163557,92.000000,%,3,carevue,chartevents,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20567021,299999,50809,,Glucose,Glucose,7630,129161,159.000000,mg/dL,16,,,,
20567022,299999,50817,,Oxygen saturation,Oxygen saturation,7630,129161,98.000000,%,16,,,,
20567023,299999,50820,,pH,pH,7630,129161,7.380000,units,16,,,,
20567024,299999,50809,,Glucose,Glucose,7630,129161,103.000000,mg/dL,22,,,,


### Removing outliers

In [6]:
# Detect and remove outliers. For this, they use two different outlier ranges: 
# 1) for each variable, they have an upper and lower threshold for detecting unusable outliers. 
#    If the outlier falls outside of these threshold, it is treated as missing. 
# 2) they also have a physiologically valid range of measurements. If the non-outlier falls outside this range, 
     # it is replaced with the nearest valid value.

variable_ranges = pd.read_csv(resources_path + 'variable_ranges.csv', index_col = None)
variable_ranges['LEVEL2'] = variable_ranges['LEVEL2'].str.lower()
variable_ranges = variable_ranges.set_index('LEVEL2')

variables_all = events_data['LEVEL2']
non_null_variables = ~events_data.value.isnull()
variables = set(variables_all)
range_names = set(variable_ranges.index.values)
range_names = [i.lower() for i in range_names]

for var_name in variables:
    var_name_lower = var_name.lower()
    
    if var_name_lower in range_names:
        out_low, out_high, val_low, val_high = [
            variable_ranges.loc[var_name_lower, x] for x in ('OUTLIER LOW', 'OUTLIER HIGH', 'VALID LOW', 'VALID HIGH')
        ]
        
        # First find the indices of the variables that we need to check for outliers:
        indices_variable = non_null_variables & (variables_all == var_name)
        
        # Check for low outliers and if they are not extreme, replace them with the imputation value:
        outlier_low_indices = (events_data.value < out_low)
        low_not_outliers = ~outlier_low_indices & (events_data.value < val_low)
        valid_low_indices = indices_variable & low_not_outliers
        events_data.loc[valid_low_indices, 'value'] = val_low
        
        # Check for high outliers and if they are not extreme, replace them with the imputation value:
        outlier_high_indices = (events_data.value > out_high)
        high_not_outliers = ~outlier_high_indices & (events_data.value > val_high)
        valid_high_indices = indices_variable & high_not_outliers
        events_data.loc[valid_high_indices, 'value'] = val_high
        
        # Treat values that are outside the outlier boundaries as missing:
        outlier_indices = indices_variable & (outlier_low_indices | outlier_high_indices)
        events_data.loc[outlier_indices, 'value'] = np.nan

In [7]:
variable_ranges

Unnamed: 0_level_0,LEVEL1,OUTLIER LOW,VALID LOW,IMPUTE,VALID HIGH,OUTLIER HIGH
LEVEL2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
alanine aminotransferase,,0.0,2.00,34.0,10000.00,11000.0
albumin,,0.0,0.60,3.1,6.00,60.0
alkaline phosphate,,0.0,20.00,106.0,3625.00,4000.0
anion gap,,0.0,5.00,13.0,50.00,55.0
asparate aminotransferase,,0.0,6.00,40.0,20000.00,22000.0
...,...,...,...,...,...,...
troponin-i,,0.0,0.01,2.3,49.60,575.0
troponin-t,,0.0,0.01,0.1,20.85,24.0
urine output,,0.0,0.00,80.0,1200.00,2445.0
weight,,0.0,0.00,81.8,250.00,250.0


In [8]:
events_data

Unnamed: 0,icustay_id,itemid,label,LEVEL1,LEVEL2,subject_id,hadm_id,value,valueuom,hours_in,dbsource,linksto,category,unitname
0,200003,646,SpO2,Pulse oximetry,Oxygen saturation,27513,163557,95.000000,%,2,carevue,chartevents,,
1,200003,677,Temperature C (calc),Temperature (C),Temperature,27513,163557,38.277802,Deg. C,2,carevue,chartevents,,
2,200003,678,Temperature F,Temperature (F),Temperature,27513,163557,38.277779,Deg. F,2,carevue,chartevents,,
3,200003,646,SpO2,Pulse oximetry,Oxygen saturation,27513,163557,94.000000,%,2,carevue,chartevents,,
4,200003,646,SpO2,Pulse oximetry,Oxygen saturation,27513,163557,92.000000,%,3,carevue,chartevents,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20567021,299999,50809,,Glucose,Glucose,7630,129161,159.000000,mg/dL,16,,,,
20567022,299999,50817,,Oxygen saturation,Oxygen saturation,7630,129161,98.000000,%,16,,,,
20567023,299999,50820,,pH,pH,7630,129161,7.380000,units,16,,,,
20567024,299999,50809,,Glucose,Glucose,7630,129161,103.000000,mg/dL,22,,,,


### Reshape data
We want to have a column for every variable:

In [9]:
events_data = events_data.set_index(['icustay_id', 'itemid', 'label', 'LEVEL1', 'LEVEL2'])
events_data = events_data.groupby(['icustay_id', 'subject_id', 'hadm_id', 'LEVEL2', 'hours_in'])
events_data = events_data.agg(['mean', 'std', 'count'])
events_data.columns = events_data.columns.droplevel(0)
events_data.columns.names = ['Aggregation Function']
events_data = events_data.unstack(level = 'LEVEL2')
events_data.columns = events_data.columns.reorder_levels(order=['LEVEL2', 'Aggregation Function'])

In [10]:
events_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,LEVEL2,Diastolic blood pressure,Fraction inspired oxygen,Glascow coma scale eye opening,Glascow coma scale motor response,Glascow coma scale total,Glascow coma scale verbal response,Glucose,Heart Rate,Height,Mean blood pressure,...,Glucose,Heart Rate,Height,Mean blood pressure,Oxygen saturation,Respiratory rate,Systolic blood pressure,Temperature,Weight,pH
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Aggregation Function,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,...,count,count,count,count,count,count,count,count,count,count
icustay_id,subject_id,hadm_id,hours_in,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2
200003,27513,163557,0,49.000000,,4.0,6.0,15.0,5.0,110.0,119.0,,58.000000,...,1.0,1.0,,1.0,1.0,1.0,1.0,,1.0,
200003,27513,163557,1,52.000000,,,,,,,118.0,,59.000000,...,,4.0,,3.0,4.0,4.0,3.0,2.0,5.0,
200003,27513,163557,2,52.333333,,,,,,,116.0,,59.666667,...,,3.0,,3.0,3.0,3.0,3.0,2.0,3.0,
200003,27513,163557,3,60.500000,,,,,,,112.0,,65.500000,...,,2.0,,2.0,2.0,2.0,2.0,2.0,2.0,
200003,27513,163557,4,61.000000,,,,,,,108.0,,67.000000,...,,2.0,,2.0,2.0,2.0,2.0,2.0,2.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299999,7630,129161,25,46.000000,,4.0,6.0,15.0,5.0,,68.0,,64.000000,...,,1.0,,1.0,1.0,1.0,1.0,2.0,1.0,
299999,7630,129161,26,42.000000,,,,,,,71.0,,62.666698,...,,1.0,,1.0,1.0,1.0,1.0,,1.0,
299999,7630,129161,27,50.000000,,,,,,130.0,72.0,,68.333298,...,1.0,1.0,,1.0,1.0,1.0,1.0,,1.0,
299999,7630,129161,28,48.000000,,,,,,,70.0,,66.666702,...,,1.0,,1.0,1.0,1.0,1.0,,1.0,


In [11]:
events_data.columns

MultiIndex([(          'Diastolic blood pressure',  'mean'),
            (          'Fraction inspired oxygen',  'mean'),
            (    'Glascow coma scale eye opening',  'mean'),
            ( 'Glascow coma scale motor response',  'mean'),
            (          'Glascow coma scale total',  'mean'),
            ('Glascow coma scale verbal response',  'mean'),
            (                           'Glucose',  'mean'),
            (                        'Heart Rate',  'mean'),
            (                            'Height',  'mean'),
            (               'Mean blood pressure',  'mean'),
            (                 'Oxygen saturation',  'mean'),
            (                  'Respiratory rate',  'mean'),
            (           'Systolic blood pressure',  'mean'),
            (                       'Temperature',  'mean'),
            (                            'Weight',  'mean'),
            (                                'pH',  'mean'),
            (          '

In [12]:
# Make sure we have a row for every hour:
missing_hours_fill = pd.DataFrame([[i, x] for i, y in patients_data['max_hours'].iteritems() for x in range(y+1)],
                                 columns=[patients_data.index.names[0], 'hours_in'])
missing_hours_fill['tmp'] = np.NaN

fill_df = patients_data.reset_index()[['subject_id', 'hadm_id', 'icustay_id']].join(
     missing_hours_fill.set_index('icustay_id'), on='icustay_id')
fill_df.set_index(['icustay_id', 'subject_id', 'hadm_id', 'hours_in'], inplace=True)

events_data = events_data.reindex(fill_df.index)
events_data = events_data.sort_index(axis = 0).sort_index(axis = 1)

idx = pd.IndexSlice
events_data.loc[:, idx[:, 'count']] = events_data.loc[:, idx[:, 'count']].fillna(0)

In [13]:
events_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,LEVEL2,Diastolic blood pressure,Diastolic blood pressure,Diastolic blood pressure,Fraction inspired oxygen,Fraction inspired oxygen,Fraction inspired oxygen,Glascow coma scale eye opening,Glascow coma scale eye opening,Glascow coma scale eye opening,Glascow coma scale motor response,...,Systolic blood pressure,Temperature,Temperature,Temperature,Weight,Weight,Weight,pH,pH,pH
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Aggregation Function,count,mean,std,count,mean,std,count,mean,std,count,...,std,count,mean,std,count,mean,std,count,mean,std
icustay_id,subject_id,hadm_id,hours_in,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2
200003,27513,163557,0,1.0,49.000000,,0.0,,,1.0,4.0,,1.0,...,,0.0,,,1.0,77.5,,0.0,,
200003,27513,163557,1,3.0,52.000000,11.357817,0.0,,,0.0,,,0.0,...,8.736895,2.0,38.999999,0.000001,5.0,77.5,0.0,0.0,,
200003,27513,163557,2,3.0,52.333333,9.237604,0.0,,,0.0,,,0.0,...,12.342339,2.0,38.277790,0.000016,3.0,77.5,0.0,0.0,,
200003,27513,163557,3,2.0,60.500000,13.435029,0.0,,,0.0,,,0.0,...,12.020815,2.0,37.777790,0.000017,2.0,77.5,0.0,0.0,,
200003,27513,163557,4,2.0,61.000000,0.000000,0.0,,,0.0,,,0.0,...,2.121320,2.0,36.833318,0.000024,2.0,77.5,0.0,0.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299999,7630,129161,27,1.0,50.000000,,0.0,,,0.0,,,0.0,...,,0.0,,,1.0,88.5,,0.0,,
299999,7630,129161,28,1.0,48.000000,,0.0,,,0.0,,,0.0,...,,0.0,,,1.0,88.5,,0.0,,
299999,7630,129161,29,1.0,49.000000,,0.0,,,1.0,4.0,,1.0,...,,0.0,,,1.0,88.5,,0.0,,
299999,7630,129161,30,0.0,,,0.0,,,0.0,,,0.0,...,,0.0,,,0.0,,,0.0,,


In [14]:
# Save this version of the data as a .csv file, so we can apply different imputation methods in another notebook:
idx = pd.IndexSlice
timeseries_data = events_data.loc[:, idx[:, 'mean']]
timeseries_data = timeseries_data.droplevel('Aggregation Function', axis = 1) 
timeseries_data = timeseries_data.reset_index() 
timeseries_data.to_csv(data_path + 'mimic_timeseries_data_not_imputed.csv')

In [15]:
timeseries_data

LEVEL2,icustay_id,subject_id,hadm_id,hours_in,Diastolic blood pressure,Fraction inspired oxygen,Glascow coma scale eye opening,Glascow coma scale motor response,Glascow coma scale total,Glascow coma scale verbal response,Glucose,Heart Rate,Height,Mean blood pressure,Oxygen saturation,Respiratory rate,Systolic blood pressure,Temperature,Weight,pH
0,200003,27513,163557,0,49.000000,,4.0,6.0,15.0,5.0,110.0,119.0,,58.000000,97.0,35.000000,91.000000,,77.5,
1,200003,27513,163557,1,52.000000,,,,,,,118.0,,59.000000,96.0,32.000000,88.333333,38.999999,77.5,
2,200003,27513,163557,2,52.333333,,,,,,,116.0,,59.666667,95.0,30.333333,85.333333,38.277790,77.5,
3,200003,27513,163557,3,60.500000,,,,,,,112.0,,65.500000,93.5,32.500000,86.500000,37.777790,77.5,
4,200003,27513,163557,4,61.000000,,,,,,,108.0,,67.000000,91.5,37.000000,89.500000,36.833318,77.5,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2163074,299999,7630,129161,27,50.000000,,,,,,130.0,72.0,,68.333298,97.0,25.000000,105.000000,,88.5,
2163075,299999,7630,129161,28,48.000000,,,,,,,70.0,,66.666702,95.0,22.000000,104.000000,,88.5,
2163076,299999,7630,129161,29,49.000000,,4.0,6.0,15.0,5.0,,70.0,,67.666702,94.0,24.000000,105.000000,,88.5,
2163077,299999,7630,129161,30,,,,,,,,,,,,,,,,


### Imputation of time series data

In [16]:
idx = pd.IndexSlice
timeseries_data = events_data.loc[:, idx[:, ['mean', 'count']]]

# Get the mean across hours for each variable and each patient:
icustay_means = timeseries_data.loc[:, idx[:, 'mean']].groupby(['subject_id', 'hadm_id', 'icustay_id']).mean()

# Get the global mean for each variable:
global_means = timeseries_data.loc[:, idx[:, 'mean']].mean(axis = 0)

# Forward fill the nan time series, or otherwise fill in the patient's mean or global mean:
timeseries_data.loc[:, idx[:, 'mean']] = timeseries_data.loc[:, idx[:, 'mean']].groupby(
    ['subject_id', 'hadm_id', 'icustay_id']).fillna(method='ffill').groupby(
    ['subject_id', 'hadm_id', 'icustay_id']).fillna(icustay_means).fillna(global_means)

# Create a mask that indicates if the variable is present:
timeseries_data.loc[:, idx[:, 'count']] = (events_data.loc[:, idx[:, 'count']] > 0).astype(float)
timeseries_data.rename(columns={'count': 'mask'}, level='Aggregation Function', inplace=True)

# Add a variable that indicates the time since the last measurement to the dataframe:
is_absent = (1 - timeseries_data.loc[:, idx[:, 'mask']])
hours_of_absence = is_absent.cumsum()
time_since_measured = hours_of_absence - hours_of_absence[is_absent==0].fillna(method='ffill')
time_since_measured.rename(columns={'mask': 'time_since_measured'}, level='Aggregation Function', inplace=True)
timeseries_data = pd.concat((timeseries_data, time_since_measured), axis = 1)
timeseries_data.loc[:, idx[:, 'time_since_measured']] = timeseries_data.loc[:, idx[:, 'time_since_measured']].fillna(100)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, val, pi)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [17]:
timeseries_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,LEVEL2,Diastolic blood pressure,Fraction inspired oxygen,Glascow coma scale eye opening,Glascow coma scale motor response,Glascow coma scale total,Glascow coma scale verbal response,Glucose,Heart Rate,Height,Mean blood pressure,...,Glucose,Heart Rate,Height,Mean blood pressure,Oxygen saturation,Respiratory rate,Systolic blood pressure,Temperature,Weight,pH
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Aggregation Function,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,...,time_since_measured,time_since_measured,time_since_measured,time_since_measured,time_since_measured,time_since_measured,time_since_measured,time_since_measured,time_since_measured,time_since_measured
icustay_id,subject_id,hadm_id,hours_in,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2
200003,27513,163557,0,49.000000,0.532158,4.0,6.0,15.0,5.0,110.0,119.0,168.964694,58.000000,...,0.0,0.0,100.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0
200003,27513,163557,1,52.000000,0.532158,4.0,6.0,15.0,5.0,110.0,118.0,168.964694,59.000000,...,1.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0
200003,27513,163557,2,52.333333,0.532158,4.0,6.0,15.0,5.0,110.0,116.0,168.964694,59.666667,...,2.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0
200003,27513,163557,3,60.500000,0.532158,4.0,6.0,15.0,5.0,110.0,112.0,168.964694,65.500000,...,3.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0
200003,27513,163557,4,61.000000,0.532158,4.0,6.0,15.0,5.0,110.0,108.0,168.964694,67.000000,...,4.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299999,7630,129161,27,50.000000,0.532158,4.0,6.0,15.0,5.0,130.0,72.0,168.964694,68.333298,...,0.0,0.0,764.0,0.0,0.0,0.0,0.0,2.0,0.0,11.0
299999,7630,129161,28,48.000000,0.532158,4.0,6.0,15.0,5.0,130.0,70.0,168.964694,66.666702,...,1.0,0.0,765.0,0.0,0.0,0.0,0.0,3.0,0.0,12.0
299999,7630,129161,29,49.000000,0.532158,4.0,6.0,15.0,5.0,130.0,70.0,168.964694,67.666702,...,2.0,0.0,766.0,0.0,0.0,0.0,0.0,4.0,0.0,13.0
299999,7630,129161,30,49.000000,0.532158,4.0,6.0,15.0,5.0,130.0,70.0,168.964694,67.666702,...,3.0,1.0,767.0,1.0,1.0,1.0,1.0,5.0,1.0,14.0


### Standardization of continuous data

In [18]:
# Minmax standardization:
def minmax(x):
    mins = x.min()
    maxes = x.max()
    x_std = (x - mins) / (maxes - mins)
    return x_std

def std_time_since_measurement(x):
    idx = pd.IndexSlice
    x = np.where(x==100, 0, x)
    means = x.mean()
    stds = x.std() + 0.0001
    x_std = (x - means)/stds
    return x_std

timeseries_data.loc[:, idx[continuous_var, 'mean']] = timeseries_data.loc[:, idx[continuous_var, 'mean']].apply(lambda x: minmax(x))
timeseries_data.loc[:, idx[:, 'time_since_measured']] = timeseries_data.loc[:, idx[:, 'time_since_measured']].apply(lambda x: std_time_since_measurement(x))

In [19]:
timeseries_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,LEVEL2,Diastolic blood pressure,Fraction inspired oxygen,Glascow coma scale eye opening,Glascow coma scale motor response,Glascow coma scale total,Glascow coma scale verbal response,Glucose,Heart Rate,Height,Mean blood pressure,...,Glucose,Heart Rate,Height,Mean blood pressure,Oxygen saturation,Respiratory rate,Systolic blood pressure,Temperature,Weight,pH
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Aggregation Function,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,...,time_since_measured,time_since_measured,time_since_measured,time_since_measured,time_since_measured,time_since_measured,time_since_measured,time_since_measured,time_since_measured,time_since_measured
icustay_id,subject_id,hadm_id,hours_in,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2
200003,27513,163557,0,0.159609,0.407795,4.0,6.0,15.0,5.0,0.049422,0.396667,0.70402,0.139241,...,-0.581027,-0.126197,-0.999068,-0.137486,-0.147118,-0.137439,-0.133019,-0.313701,-0.681936,-0.730732
200003,27513,163557,1,0.169381,0.407795,4.0,6.0,15.0,5.0,0.049422,0.393333,0.70402,0.142405,...,-0.456971,-0.126197,-0.999068,-0.137486,-0.147118,-0.137439,-0.133019,-0.313701,-0.681936,-0.730732
200003,27513,163557,2,0.170467,0.407795,4.0,6.0,15.0,5.0,0.049422,0.386667,0.70402,0.144515,...,-0.332915,-0.126197,-0.999068,-0.137486,-0.147118,-0.137439,-0.133019,-0.313701,-0.681936,-0.730732
200003,27513,163557,3,0.197068,0.407795,4.0,6.0,15.0,5.0,0.049422,0.373333,0.70402,0.162975,...,-0.208860,-0.126197,-0.999068,-0.137486,-0.147118,-0.137439,-0.133019,-0.313701,-0.681936,-0.730732
200003,27513,163557,4,0.198697,0.407795,4.0,6.0,15.0,5.0,0.049422,0.360000,0.70402,0.167722,...,-0.084804,-0.126197,-0.999068,-0.137486,-0.147118,-0.137439,-0.133019,-0.313701,-0.681936,-0.730732
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299999,7630,129161,27,0.162866,0.407795,4.0,6.0,15.0,5.0,0.062259,0.240000,0.70402,0.171941,...,-0.581027,-0.126197,1.713125,-0.137486,-0.147118,-0.137439,-0.133019,-0.118660,-0.681936,-0.499197
299999,7630,129161,28,0.156352,0.407795,4.0,6.0,15.0,5.0,0.062259,0.233333,0.70402,0.166667,...,-0.456971,-0.126197,1.716675,-0.137486,-0.147118,-0.137439,-0.133019,-0.021139,-0.681936,-0.478149
299999,7630,129161,29,0.159609,0.407795,4.0,6.0,15.0,5.0,0.062259,0.233333,0.70402,0.169831,...,-0.332915,-0.126197,1.720225,-0.137486,-0.147118,-0.137439,-0.133019,0.076381,-0.681936,-0.457100
299999,7630,129161,30,0.159609,0.407795,4.0,6.0,15.0,5.0,0.062259,0.233333,0.70402,0.169831,...,-0.208860,-0.011447,1.723775,-0.024036,-0.012814,-0.024831,-0.018720,0.173902,-0.650062,-0.436052


### One-hot encoding categorical variables

In [20]:
# First we need to round the categorical variables to the nearest category:
categorical_data = timeseries_data.loc[:, idx[categorical_var, 'mean']].copy(deep=True)
categorical_data = categorical_data.round()
one_hot = pd.get_dummies(categorical_data, columns=categorical_var)

# Clean up the columns that we do not need and add the dummy encodings:
for c in categorical_var:
    if c in timeseries_data.columns:
        timeseries_data.drop(c, axis = 1, inplace=True)
timeseries_data.columns = timeseries_data.columns.droplevel(-1)
timeseries_data = pd.merge(timeseries_data.reset_index(), one_hot.reset_index(), how='inner', left_on=['subject_id', 'icustay_id', 'hadm_id', 'hours_in'],
                           right_on=['subject_id', 'icustay_id', 'hadm_id', 'hours_in'])
timeseries_data = timeseries_data.set_index(['subject_id', 'icustay_id', 'hadm_id', 'hours_in'])

  obj = obj._drop_axis(labels, axis, level=level, errors=errors)


In [21]:
timeseries_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Diastolic blood pressure,Fraction inspired oxygen,Glucose,Heart Rate,Height,Mean blood pressure,Oxygen saturation,Respiratory rate,Systolic blood pressure,Temperature,...,"('Glascow coma scale total', 'mean')_11.0","('Glascow coma scale total', 'mean')_12.0","('Glascow coma scale total', 'mean')_13.0","('Glascow coma scale total', 'mean')_14.0","('Glascow coma scale total', 'mean')_15.0","('Glascow coma scale verbal response', 'mean')_1.0","('Glascow coma scale verbal response', 'mean')_2.0","('Glascow coma scale verbal response', 'mean')_3.0","('Glascow coma scale verbal response', 'mean')_4.0","('Glascow coma scale verbal response', 'mean')_5.0"
subject_id,icustay_id,hadm_id,hours_in,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
27513,200003,163557,0,0.159609,0.407795,0.049422,0.396667,0.70402,0.139241,0.970,0.116667,0.292605,0.710426,...,0,0,0,0,1,0,0,0,0,1
27513,200003,163557,1,0.169381,0.407795,0.049422,0.393333,0.70402,0.142405,0.960,0.106667,0.284030,0.801370,...,0,0,0,0,1,0,0,0,0,1
27513,200003,163557,2,0.170467,0.407795,0.049422,0.386667,0.70402,0.144515,0.950,0.101111,0.274384,0.756851,...,0,0,0,0,1,0,0,0,0,1
27513,200003,163557,3,0.197068,0.407795,0.049422,0.373333,0.70402,0.162975,0.935,0.108333,0.278135,0.726029,...,0,0,0,0,1,0,0,0,0,1
27513,200003,163557,4,0.198697,0.407795,0.049422,0.360000,0.70402,0.167722,0.915,0.123333,0.287781,0.667808,...,0,0,0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7630,299999,129161,27,0.162866,0.407795,0.062259,0.240000,0.70402,0.171941,0.970,0.083333,0.337621,0.705479,...,0,0,0,0,1,0,0,0,0,1
7630,299999,129161,28,0.156352,0.407795,0.062259,0.233333,0.70402,0.166667,0.950,0.073333,0.334405,0.705479,...,0,0,0,0,1,0,0,0,0,1
7630,299999,129161,29,0.159609,0.407795,0.062259,0.233333,0.70402,0.169831,0.940,0.080000,0.337621,0.705479,...,0,0,0,0,1,0,0,0,0,1
7630,299999,129161,30,0.159609,0.407795,0.062259,0.233333,0.70402,0.169831,0.940,0.080000,0.337621,0.705479,...,0,0,0,0,1,0,0,0,0,1


### Preprocessing of Y / outcomes

In [22]:
# First get the number of nan values per variable:
print(outcomes.isna().sum())

# We will replace them with zero:
outcomes = outcomes.fillna(0)

in_hospital_mortality    0
los                      0
dtype: int64


In [23]:
outcomes

Unnamed: 0_level_0,in_hospital_mortality,los
icustay_id,Unnamed: 1_level_1,Unnamed: 2_level_1
211552,0,145.5504
294638,0,40.2840
228232,0,88.1496
220597,1,127.7544
229441,0,38.0256
...,...,...
286606,0,24.9576
226241,0,75.4224
242052,0,47.3880
229633,0,51.8760


### Save all pre-processed data

In [24]:
# Rename the columns and save the results:
s = timeseries_data.columns.to_series()
timeseries_data.columns = s + s.groupby(s).cumcount().astype(str).replace({'0':''})

timeseries_data.to_hdf(data_path + 'vitals_hourly_data_preprocessed.h5', 'X')
outcomes.to_hdf(data_path + 'vitals_hourly_data_preprocessed.h5', 'Y')

In [25]:
timeseries_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Diastolic blood pressure,Fraction inspired oxygen,Glucose,Heart Rate,Height,Mean blood pressure,Oxygen saturation,Respiratory rate,Systolic blood pressure,Temperature,...,"('Glascow coma scale total', 'mean')_11.0","('Glascow coma scale total', 'mean')_12.0","('Glascow coma scale total', 'mean')_13.0","('Glascow coma scale total', 'mean')_14.0","('Glascow coma scale total', 'mean')_15.0","('Glascow coma scale verbal response', 'mean')_1.0","('Glascow coma scale verbal response', 'mean')_2.0","('Glascow coma scale verbal response', 'mean')_3.0","('Glascow coma scale verbal response', 'mean')_4.0","('Glascow coma scale verbal response', 'mean')_5.0"
subject_id,icustay_id,hadm_id,hours_in,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
27513,200003,163557,0,0.159609,0.407795,0.049422,0.396667,0.70402,0.139241,0.970,0.116667,0.292605,0.710426,...,0,0,0,0,1,0,0,0,0,1
27513,200003,163557,1,0.169381,0.407795,0.049422,0.393333,0.70402,0.142405,0.960,0.106667,0.284030,0.801370,...,0,0,0,0,1,0,0,0,0,1
27513,200003,163557,2,0.170467,0.407795,0.049422,0.386667,0.70402,0.144515,0.950,0.101111,0.274384,0.756851,...,0,0,0,0,1,0,0,0,0,1
27513,200003,163557,3,0.197068,0.407795,0.049422,0.373333,0.70402,0.162975,0.935,0.108333,0.278135,0.726029,...,0,0,0,0,1,0,0,0,0,1
27513,200003,163557,4,0.198697,0.407795,0.049422,0.360000,0.70402,0.167722,0.915,0.123333,0.287781,0.667808,...,0,0,0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7630,299999,129161,27,0.162866,0.407795,0.062259,0.240000,0.70402,0.171941,0.970,0.083333,0.337621,0.705479,...,0,0,0,0,1,0,0,0,0,1
7630,299999,129161,28,0.156352,0.407795,0.062259,0.233333,0.70402,0.166667,0.950,0.073333,0.334405,0.705479,...,0,0,0,0,1,0,0,0,0,1
7630,299999,129161,29,0.159609,0.407795,0.062259,0.233333,0.70402,0.169831,0.940,0.080000,0.337621,0.705479,...,0,0,0,0,1,0,0,0,0,1
7630,299999,129161,30,0.159609,0.407795,0.062259,0.233333,0.70402,0.169831,0.940,0.080000,0.337621,0.705479,...,0,0,0,0,1,0,0,0,0,1
