# Development of machine learning models to process Electronic Health Records – Explainable Models

### Extraction Notebook
Lok Hang Toby Lee (2431180L)

# Data Extraction
---------------------------------------------------

### Configuration Step

In [11]:
# Imports:
import numpy as np
import pandas as pd
import sys
import matplotlib.pyplot as plt
from matplotlib import cm
import matplotlib.colors as mc
import colorsys
import psycopg2
import os
import yaml
%matplotlib inline


#pg_ctl.exe restart -D "E:\PostgreSQL\data"

# Configuration:
# Connection settings and data locations can be overridden through environment
# variables, so credentials and machine-specific paths do not have to be edited
# into the notebook. When a variable is unset, the original value is used.
sqluser = os.environ.get('MIMIC_DB_USER', 'postgres')
dbname = os.environ.get('MIMIC_DB_NAME', 'mimic')
password = os.environ.get('MIMIC_DB_PASSWORD', 'postgres')
# The trailing ';' is intentional: queries in this notebook are built as
# 'SET search_path to ' + schema_name + <query>, and the ';' ends the SET statement.
schema_name = 'public, mimic, mimiciii;'

# Connect to MIMIC-III:
con = psycopg2.connect(dbname=dbname, user=sqluser, password=password)
cur = con.cursor()
cur.execute('SET search_path to ' + schema_name)


# SET YOUR PATH FOR RESOURCES FILE HERE
# (or export MIMIC_RESOURCES_PATH / MIMIC_DATA_PATH before starting the kernel).
# Both values are used through plain string concatenation, so keep the trailing '/'.
resources_path = os.environ.get(
    'MIMIC_RESOURCES_PATH',
    "C:/Users/USER/OneDrive/GU/Year 4/FYP/MIMIC-III-ML/data/resources/")
data_path = os.environ.get(
    'MIMIC_DATA_PATH',
    "C:/Users/USER/OneDrive/GU/Year 4/FYP/MIMIC-III-ML/data/raw/")

### 2.1 Study cohort selection
- Only the first ICU stay of each patient's first hospital admission, lasting at least one day and at most 10 days
- Adult patients only (age >= 15)

In [12]:
# Cohort query parameters.
min_age = 15
# Set to a positive number to fetch only that many rows (useful while debugging);
# 0 means no LIMIT clause is added.
limit_population = 0
limit = 'LIMIT ' + str(limit_population) if limit_population > 0 else ''

# The CTE joins patients, icustays and admissions and ranks each subject's
# hospital admissions and each admission's ICU stays by start time; the outer
# SELECT keeps rank 1 of both. los_icu is DATE_PART('day', ...) of the stay
# interval, i.e. the whole-day component.
query = """
with patient_and_icustay_details as (
    SELECT distinct
        p.gender, p.dob, p.dod, s.*, a.admittime, a.dischtime, a.deathtime, a.ethnicity, a.diagnosis,
        DENSE_RANK() OVER (PARTITION BY a.subject_id ORDER BY a.admittime) AS hospstay_seq,
        DENSE_RANK() OVER (PARTITION BY s.hadm_id ORDER BY s.intime) AS icustay_seq,
        DATE_PART('year', s.intime) - DATE_PART('year', p.dob) as admission_age,
        DATE_PART('day', s.outtime - s.intime) as los_icu
    FROM patients p 
        INNER JOIN icustays s ON p.subject_id = s.subject_id
        INNER JOIN admissions a ON s.hadm_id = a.hadm_id 
    WHERE s.first_careunit NOT like 'NICU'
        and s.hadm_id is not null and s.icustay_id is not null
        and (s.outtime >= (s.intime + interval '12 hours'))
        and (s.outtime <= (s.intime + interval '240 hours'))
    ORDER BY s.subject_id 
)
SELECT * 
FROM patient_and_icustay_details 
WHERE hospstay_seq = 1
    and icustay_seq = 1
    and admission_age >=  """ + str(min_age) + """
    and los_icu >= 0.5
""" + limit

# Run the query with the search path set in the same round trip.
cohort_statement = 'SET search_path to ' + schema_name + query
patients_data = pd.read_sql_query(cohort_statement, con)

# Optional: persist the raw cohort table.
#patients_data.to_csv('static_data.csv')

In [13]:
# Show the cohort DataFrame returned by the query (pandas renders a truncated preview).
patients_data

Unnamed: 0,gender,dob,dod,row_id,subject_id,hadm_id,icustay_id,dbsource,first_careunit,last_careunit,...,los,admittime,dischtime,deathtime,ethnicity,diagnosis,hospstay_seq,icustay_seq,admission_age,los_icu
0,M,2025-04-11,2102-06-14,2,3,145834,211552,carevue,MICU,MICU,...,6.0646,2101-10-20 19:08:00,2101-10-31 13:58:00,NaT,WHITE,HYPOTENSION,1,1,76.0,6.0
1,F,2143-05-12,NaT,3,4,185777,294638,carevue,MICU,MICU,...,1.6785,2191-03-16 00:28:00,2191-03-23 18:41:00,NaT,WHITE,"FEVER,DEHYDRATION,FAILURE TO THRIVE",1,1,48.0,1.0
2,F,2109-06-21,NaT,5,6,107064,228232,carevue,SICU,SICU,...,3.6729,2175-05-30 07:15:00,2175-06-15 16:00:00,NaT,WHITE,CHRONIC RENAL FAILURE/SDA,1,1,66.0,3.0
3,M,2108-01-26,2149-11-14,9,9,150750,220597,carevue,MICU,MICU,...,5.3231,2149-11-09 13:06:00,2149-11-14 10:15:00,2149-11-14 10:15:00,UNKNOWN/NOT SPECIFIED,HEMORRHAGIC CVA,1,1,41.0,5.0
4,F,2128-02-22,2178-11-14,11,11,194540,229441,carevue,SICU,SICU,...,1.5844,2178-04-16 06:18:00,2178-05-11 19:00:00,NaT,WHITE,BRAIN MASS,1,1,50.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30058,M,2114-09-29,NaT,61527,99983,117390,286606,metavision,CCU,CCU,...,1.0399,2193-04-26 11:35:00,2193-04-29 13:30:00,NaT,UNKNOWN/NOT SPECIFIED,ST ELEVATION MYOCARDIAL INFARCTION;CORONARY AR...,1,1,79.0,1.0
30059,M,2137-04-07,NaT,61529,99991,151118,226241,metavision,TSICU,TSICU,...,3.1426,2184-12-24 08:30:00,2185-01-05 12:15:00,NaT,WHITE,DIVERTICULITIS/SDA,1,1,47.0,3.0
30060,F,2078-10-17,NaT,61530,99992,197084,242052,metavision,MICU,MICU,...,1.9745,2144-07-25 18:03:00,2144-07-28 17:56:00,NaT,WHITE,RETROPERITONEAL HEMORRHAGE,1,1,66.0,1.0
30061,F,2058-05-29,2147-09-29,61531,99995,137810,229633,metavision,CSRU,CSRU,...,2.1615,2147-02-08 08:00:00,2147-02-11 13:15:00,NaT,WHITE,ABDOMINAL AORTIC ANEURYSM/SDA,1,1,89.0,2.0


### 2.2 Extraction of vital data and mapping to variables

In [14]:
# Variables to extract. Each name is compared against the LEVEL2 column of the
# item-id mapping table, so the spelling here (including 'Glascow') must match
# that table exactly.
variables_to_keep = (
    'Capillary refill rate',
    'Diastolic blood pressure',
    'Fraction inspired oxygen',
    'Glascow coma scale eye opening',
    'Glascow coma scale motor response',
    'Glascow coma scale total',
    'Glascow coma scale verbal response',
    'Glucose',
    'Heart Rate',
    'Height',
    'Mean blood pressure',
    'Oxygen saturation',
    'Respiratory rate',
    'Systolic blood pressure',
    'Temperature',
    'Weight',
    'pH',
)


# variables_to_keep = ('Alanine aminotransferase', 'Albumin', 'Albumin ascites',
#        'Albumin pleural', 'Albumin urine', 'Alkaline phosphate',
#        'Anion gap', 'Asparate aminotransferase', 'Basophils',
#        'Bicarbonate', 'Bicarbonate urine', 'Bilirubin', 'Blood culture',
#        'Blood urea nitrogen', 'Calcium', 'Calcium ionized',
#        'Calcium Ionized', 'Calcium urine', 'Capillary refill rate',
#        'Chloride', 'Chloride urine', 'Cholesterol', 'Cholesterol HDL',
#        'Cholesterol LDL', 'Cholesterol Pleural',
#        'CO2 (ETCO2, PCO2, etc.)', 'CO2', 'Creatinine',
#        'Creatinine ascites', 'Creatinine body fluid',
#        'Creatinine pleural', 'Creatinine urine',
#        'Diastolic blood pressure', 'Eosinophils',
#        'Fraction inspired oxygen', 'Fraction inspired oxygen Set',
#        'Glascow coma scale eye opening',
#        'Glascow coma scale motor response', 'Glascow coma scale total',
#        'Glascow coma scale verbal response', 'Glucose', 'Glucose urine',
#        'Heart Rate', 'Height', 'Hematocrit', 'Hemoglobin',
#        'Hemoglobin percent', 'Lactate', 'Lactate dehydrogenase',
#        'Lactate dehydrogenase pleural', 'Lactic acid', 'Lymphocytes',
#        'Lymphocytes ascites', 'Lymphocytes pleural',
#        'Lymphocytes body fluid', 'Lymphocytes atypical',
#        'Lymphocytes atypical CSL', 'Lymphocytes atypical ascites',
#        'Lymphocytes atypical pleural', 'Lymphocytes percent', 'Magnesium',
#        'Mean blood pressure', 'Mean corpuscular hemoglobin',
#        'Mean corpuscular hemoglobin concentration',
#        'Mean corpuscular volume', 'Monocytes', 'Monocytes CSL',
#        'Neutrophils', 'Oxygen saturation',
#        'Partial pressure of carbon dioxide', 'Partial pressure of oxygen',
#        'Partial thromboplastin time', 'Peak inspiratory pressure', 'pH',
#        'pH urine', 'Phosphate', 'Platelets',
#        'Positive end-expiratory pressure',
#        'Positive end-expiratory pressure Set', 'Potassium',
#        'Potassium serum', 'Prothrombin time INR', 'Prothrombin time PT',
#        'Pupillary response left', 'Pupillary response right',
#        'Pupillary size left', 'Pupillary size right',
#        'Red blood cell count', 'Red blood cell count urine',
#        'Red blood cell count ascites', 'Red blood cell count CSF',
#        'Red blood cell count pleural', 'Respiratory rate',
#        'Respiratory rate Set', 'Sodium', 'Systolic blood pressure',
#        'Temperature', 'Troponin-I', 'Troponin-T', 'Urine Appearance',
#        'Urine Color', 'Urine output', 'Weight', 'White blood cell count',
#        'White blood cell count urine', 'Central Venous Pressure',
#        'Pulmonary Artery Pressure mean',
#        'Pulmonary Artery Pressure systolic', 'Cardiac Index',
#        'Systemic Vascular Resistance', 'Cardiac Output Thermodilution',
#        'Cardiac Output fick', 'Pulmonary Capillary Wedge Pressure',
#        'Post Void Residual', 'Cardiac Murmur', 'Vitamin K', 'Phosphorous',
#        'Fibrinogen', 'Total Protein', 'Total Protein Body Fluid',
#        'Total Protein Joint Fluid', 'Total Protein Urine',
#        'Arterial Base Excess', 'Venous PvO2', 'Plateau Pressure',
#        'Tidal Volume Observed', 'Tidal Volume Set',
#        'Tidal Volume Spontaneous', 'Lung Sounds', 'Heart Rhythm',
#        'Ectopy Type', 'Ectopy Frequency', 'Code Status', 'Fall Risk',
#        'Orientation', 'Consciousness Level', 'Riker-SAS Scale',
#        'Ventilator Type', 'Ventilator Mode', 'Pacemaker', 'Trach Size',
#        'Skin Color', 'Skin Integrity', 'Service Type')

# Load the item-id -> variable mapping table; the cells below read its
# LEVEL2, LEVEL1, LINKSTO and ITEMID columns.
var_map = pd.read_csv(resources_path + '/itemid_to_variable_map.csv')

In [15]:
# Preview the item-id mapping table.
var_map

Unnamed: 0,LEVEL2,LEVEL1,ALTERNATIVE,STATUS,STATUS NOTE,ITEMID,MIMIC LABEL,UNITNAME,LINKSTO,COUNT,CATEGORY,CONCEPTID,FLUID,LOINC_CODE,DBSOURCE,Unnamed: 15,PARAM_TYPE,NOTE
0,Alanine aminotransferase,Alanine aminotransferase,ALT,ready,,50861,ALANINE AMINOTRANSFERASE (ALT),,labevents,219475.0,CHEMISTRY,,BLOOD,,,,,
1,Alanine aminotransferase,Alanine aminotransferase,ALT,ready,,769,ALT,,chartevents,41594.0,Enzymes,,,,carevue,,,
2,Alanine aminotransferase,Alanine aminotransferase,ALT,ready,,220644,ALT,,chartevents,37625.0,Labs,,,,metavision,,Numeric,
3,Albumin,Albumin,,ready,,50862,ALBUMIN,,labevents,146697.0,CHEMISTRY,,BLOOD,1751-7,,,,
4,Albumin,Albumin,,ready,,772,Albumin (>3.2),,chartevents,31022.0,Chemistry,,,,carevue,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13246,Urine output,Urine output,,ready,,226565,L Nephrostomy,,outputevents,,,,,,metavision,,,
13247,Urine output,Urine output,,ready,,226567,Straight Cath,,outputevents,,,,,,metavision,,,
13248,Urine output,Urine output,,ready,,226584,Ileoconduit,,outputevents,,,,,,metavision,,,
13249,Urine output,Urine output,OR Urine,ready,,226627,OR Urine,mL,outputevents,5928.0,Output,,,,metavision,,,


In [16]:
# Unique ICU-stay, subject and hospital-admission ids of the cohort, as strings
# (the ICU-stay ids are interpolated into the SQL of the events query).
icu_ids_to_keep = tuple({str(i) for i in patients_data['icustay_id']})
subjects_to_keep = tuple({str(i) for i in patients_data['subject_id']})
hadms_to_keep = tuple({str(i) for i in patients_data['hadm_id']})

# Pick the item ids of the wanted variables with boolean masks instead of
# looping over range(n) and reading var_map['LEVEL2'][i]. That lookup is by
# index *label*, so it only works while var_map has its default 0..n-1 index;
# once var_map has been filtered (below) a re-run of this cell raised KeyError
# or never visited rows whose label was >= the new row count.
wanted = var_map['LEVEL2'].isin(variables_to_keep)
chartitems_to_keep = var_map.loc[wanted & (var_map['LINKSTO'] == 'chartevents'), 'ITEMID'].tolist()
labitems_to_keep = var_map.loc[wanted & (var_map['LINKSTO'] == 'labevents'), 'ITEMID'].tolist()

# Keep every mapping row whose ITEMID was selected, then de-duplicate the ids
# and convert them to strings for the SQL IN-lists.
all_to_keep = chartitems_to_keep + labitems_to_keep
var_map = var_map[var_map.ITEMID.isin(all_to_keep)]
chartitems_to_keep = tuple({str(i) for i in chartitems_to_keep})
labitems_to_keep = tuple({str(i) for i in labitems_to_keep})

In [17]:
def _sql_in_list(values):
    """Render an iterable of id strings as a SQL IN-list such as ('1', '2').

    Interpolating str(tuple) directly is invalid SQL for a one-element tuple
    (Python prints a trailing comma) and for an empty tuple ('()'). This helper
    returns ('x') for a single value and (NULL), which matches no rows, when
    there are no values. For two or more values the text is the same as
    str(tuple) produced.
    """
    quoted = ["'" + str(v) + "'" for v in values]
    return '(' + ', '.join(quoted) + ')' if quoted else '(NULL)'


icu_id_list = _sql_in_list(icu_ids_to_keep)

# First SELECT: chart events recorded during the ICU stay, excluding rows flagged
# as errors and rows without a numeric value.
# Second SELECT: lab events of the same hospital admission, from 6 hours before
# ICU admission until ICU discharge, with a strictly positive numeric value.
query = """
SELECT c.subject_id, i.hadm_id, c.icustay_id, c.charttime, c.itemid, c.value, c.valueuom
FROM icustays i
INNER JOIN chartevents c ON i.icustay_id = c.icustay_id
where c.icustay_id in """ + icu_id_list + """
  and c.itemid in """ + _sql_in_list(chartitems_to_keep) + """
  and c.charttime between intime and outtime
  and c.error is distinct from 1
  and c.valuenum is not null
UNION ALL
SELECT distinct i.subject_id, i.hadm_id, i.icustay_id, l.charttime, l.itemid, l.value, l.valueuom
FROM icustays i
INNER JOIN labevents l ON i.hadm_id = l.hadm_id
where i.icustay_id in """ + icu_id_list + """
  and l.itemid in """ + _sql_in_list(labitems_to_keep) + """
  and l.charttime between (intime - interval '6' hour) and outtime
  and l.valuenum > 0 -- lab values cannot be 0 and cannot be negative
"""
events_data = pd.read_sql_query('SET search_path to ' + schema_name + query, con)
# Keep a raw copy of the query result on disk.
events_data.to_csv(data_path + 'events_data.csv')

In [18]:
# Distinct item ids that actually occur in the extracted events, as strings.
itemid_strings = events_data['itemid'].astype(str)
itemids = tuple(set(itemid_strings))
events_data

Unnamed: 0,subject_id,hadm_id,icustay_id,charttime,itemid,value,valueuom
0,345,169339,260258,2169-05-22 01:45:00,220228,12.4,g/dl
1,345,169339,260258,2169-05-22 08:03:00,220235,40,mmHg
2,266,186251,293876,2168-07-11 14:00:00,220179,132,mmHg
3,266,186251,293876,2168-07-11 14:00:00,220180,78,mmHg
4,266,186251,293876,2168-07-11 14:00:00,220181,89,mmHg
...,...,...,...,...,...,...,...
30921685,99999,113369,246512,2118-01-01 05:35:00,51249,33.6,%
30921686,99999,113369,246512,2118-01-01 05:35:00,51250,92,fL
30921687,99999,113369,246512,2118-01-01 05:35:00,51265,182,K/uL
30921688,99999,113369,246512,2118-01-01 05:35:00,51279,2.81,m/uL


In [19]:
# Fetch the d_items metadata (label, source system, category, unit) for the
# item ids found in the events.
# The IN-list is built explicitly: interpolating str(itemids) is invalid SQL
# when the tuple has exactly one element (trailing comma) or is empty ('()').
# With no ids the list becomes (NULL), which matches no rows.
itemid_list = ', '.join("'" + str(i) + "'" for i in itemids) or 'NULL'
query_d_items = \
        """
        SELECT itemid, label, dbsource, linksto, category, unitname
        FROM d_items
        WHERE itemid in (""" + itemid_list + ")"
d_output = pd.read_sql_query('SET search_path to ' + schema_name + query_d_items, con)

In [20]:
# Preview the d_items rows returned for the extracted item ids.
d_output

Unnamed: 0,itemid,label,dbsource,linksto,category,unitname
0,51,Arterial BP [Systolic],carevue,chartevents,,
1,52,Arterial BP Mean,carevue,chartevents,,
2,89,C.O. (fick),carevue,chartevents,,
3,90,C.O.(thermodilution),carevue,chartevents,,
4,113,CVP,carevue,chartevents,,
...,...,...,...,...,...,...
200,227465,Prothrombin time,metavision,chartevents,Labs,
201,227466,PTT,metavision,chartevents,Labs,
202,227467,INR,metavision,chartevents,Labs,
203,227468,Fibrinogen,metavision,chartevents,Labs,


In [21]:
# Map the textual Glasgow coma scale entries to their numeric score (kept as
# strings here; the column is converted to numbers in a later step).
replacement_dictionary = {
    '4 Spontaneously': '4',
    '3 To speech': '3',
    '2 To pain': '2',
    '1 No Response': '1',
    '5 Oriented': '5',
    '1.0 ET/Trach': '1',
    '4 Confused': '4',
    '2 Incomp sounds': '2',
    '3 Inapprop words': '3',
    'Spontaneously': '4',
    'To Speech': '3',
    'None': '1',
    'To Pain': '2',
    '6 Obeys Commands': '6',
    '5 Localizes Pain': '5',
    '4 Flex-withdraws': '4',
    '2 Abnorm extensn': '2',
    '3 Abnorm flexion': '3',
    'No Response-ETT': '1',
    'Oriented': '5',
    'Confused': '4',
    'No Response': '1',
    'Incomprehensible sounds': '2',
    'Inappropriate Words': '3',
    'Obeys Commands': '6',
    'No response': '1',
    'Localizes Pain': '5',
    'Flex-withdraws': '4',
    'Abnormal extension': '2',
    'Abnormal flexion': '3',
    'Abnormal Flexion': '3',
    'Abnormal Extension': '2',
}
# Whole-value replacement in a single call. No replacement value is itself a
# key, so this gives the same result as replacing one key at a time.
events_data['value'] = events_data['value'].replace(replacement_dictionary)

In [22]:
def to_hours(delta):
    """Return the whole hours contained in a timedelta, never less than 0."""
    return max(0, delta.days*24 + delta.seconds // 3600)


# --- Types and indices -------------------------------------------------------
# to_numeric is called without 'coerce', so a non-numeric value raises here.
events_data['value'] = pd.to_numeric(events_data['value'])
events_data = events_data.astype({'subject_id': int, 'hadm_id': int, 'icustay_id': int})
# reset_index() keeps the previous index as a column before icustay_id becomes the index.
patients_data = patients_data.reset_index().set_index('icustay_id')
var_map = (
    var_map[['LEVEL2', 'ITEMID', 'LEVEL1']]
    .rename(columns={'ITEMID': 'itemid'})
    .set_index('itemid')
)

# --- Hours since ICU admission ----------------------------------------------
# Attach each stay's intime, turn charttime into an hour offset, then drop the
# raw timestamps. Events charted before intime get hour 0 (see to_hours).
events_data = events_data.set_index('icustay_id').join(patients_data[['intime']])
time_since_intime = events_data['charttime'] - events_data['intime']
events_data['hours_in'] = time_since_intime.apply(to_hours)
events_data = events_data.drop(columns=['charttime', 'intime'])

# --- Attach variable names and d_items metadata ------------------------------
# Both lookups are joined on the itemid index level; label and the two variable
# levels are then appended to the index.
d_output = d_output.set_index('itemid')
events_data = (
    events_data
    .set_index('itemid', append=True)
    .join(var_map)
    .join(d_output)
    .set_index(['label', 'LEVEL1', 'LEVEL2'], append=True)
)

# Total ICU stay length in whole hours for every patient.
patients_data['max_hours'] = (patients_data['outtime'] - patients_data['intime']).apply(to_hours)

In [23]:
# Preview the events table after conversion to hourly offsets and joining of variable names.
events_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,subject_id,hadm_id,value,valueuom,hours_in,dbsource,linksto,category,unitname
icustay_id,itemid,label,LEVEL1,LEVEL2,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
200003,184,Eye Opening,Glascow coma scale eye opening,Glascow coma scale eye opening,27513,163557,4.00,,127,carevue,chartevents,,
200003,198,GCS Total,Glascow coma scale total,Glascow coma scale total,27513,163557,15.00,points,127,carevue,chartevents,,
200003,184,Eye Opening,Glascow coma scale eye opening,Glascow coma scale eye opening,27513,163557,2.00,,31,carevue,chartevents,,
200003,190,FiO2 Set,Fraction inspired oxygen Set,Fraction inspired oxygen Set,27513,163557,0.40,torr,31,carevue,chartevents,,
200003,198,GCS Total,Glascow coma scale total,Glascow coma scale total,27513,163557,7.00,points,31,carevue,chartevents,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
299999,51279,,Red blood cell count,Red blood cell count,7630,129161,3.87,m/uL,21,,,,
299999,51301,,White blood cell count,White blood cell count,7630,129161,15.90,K/uL,21,,,,
299999,50809,,Glucose,Glucose,7630,129161,103.00,mg/dL,22,,,,
299999,50817,,Oxygen saturation,Oxygen saturation,7630,129161,95.00,%,22,,,,


In [24]:
# Preview the cohort table, now indexed by icustay_id and with the max_hours column added.
patients_data

Unnamed: 0_level_0,index,gender,dob,dod,row_id,subject_id,hadm_id,dbsource,first_careunit,last_careunit,...,admittime,dischtime,deathtime,ethnicity,diagnosis,hospstay_seq,icustay_seq,admission_age,los_icu,max_hours
icustay_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
211552,0,M,2025-04-11,2102-06-14,2,3,145834,carevue,MICU,MICU,...,2101-10-20 19:08:00,2101-10-31 13:58:00,NaT,WHITE,HYPOTENSION,1,1,76.0,6.0,145
294638,1,F,2143-05-12,NaT,3,4,185777,carevue,MICU,MICU,...,2191-03-16 00:28:00,2191-03-23 18:41:00,NaT,WHITE,"FEVER,DEHYDRATION,FAILURE TO THRIVE",1,1,48.0,1.0,40
228232,2,F,2109-06-21,NaT,5,6,107064,carevue,SICU,SICU,...,2175-05-30 07:15:00,2175-06-15 16:00:00,NaT,WHITE,CHRONIC RENAL FAILURE/SDA,1,1,66.0,3.0,88
220597,3,M,2108-01-26,2149-11-14,9,9,150750,carevue,MICU,MICU,...,2149-11-09 13:06:00,2149-11-14 10:15:00,2149-11-14 10:15:00,UNKNOWN/NOT SPECIFIED,HEMORRHAGIC CVA,1,1,41.0,5.0,127
229441,4,F,2128-02-22,2178-11-14,11,11,194540,carevue,SICU,SICU,...,2178-04-16 06:18:00,2178-05-11 19:00:00,NaT,WHITE,BRAIN MASS,1,1,50.0,1.0,38
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
286606,30058,M,2114-09-29,NaT,61527,99983,117390,metavision,CCU,CCU,...,2193-04-26 11:35:00,2193-04-29 13:30:00,NaT,UNKNOWN/NOT SPECIFIED,ST ELEVATION MYOCARDIAL INFARCTION;CORONARY AR...,1,1,79.0,1.0,24
226241,30059,M,2137-04-07,NaT,61529,99991,151118,metavision,TSICU,TSICU,...,2184-12-24 08:30:00,2185-01-05 12:15:00,NaT,WHITE,DIVERTICULITIS/SDA,1,1,47.0,3.0,75
242052,30060,F,2078-10-17,NaT,61530,99992,197084,metavision,MICU,MICU,...,2144-07-25 18:03:00,2144-07-28 17:56:00,NaT,WHITE,RETROPERITONEAL HEMORRHAGE,1,1,66.0,1.0,47
229633,30061,F,2058-05-29,2147-09-29,61531,99995,137810,metavision,CSRU,CSRU,...,2147-02-08 08:00:00,2147-02-11 13:15:00,NaT,WHITE,ABDOMINAL AORTIC ANEURYSM/SDA,1,1,89.0,2.0,51


In [25]:
# Save results:
# Both frames go into the same HDF5 file under different keys. The key is passed
# by keyword because positional arguments other than the path are deprecated in
# recent pandas releases.
hdf_path = data_path + 'vitals_hourly_data_v2.h5'
patients_data.to_hdf(hdf_path, key='patients_data')
events_data.to_hdf(hdf_path, key='X')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block3_values] [items->Index(['gender', 'dbsource', 'first_careunit', 'last_careunit', 'ethnicity',
       'diagnosis'],
      dtype='object')]

  encoding=encoding,
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block3_values] [items->Index(['valueuom', 'dbsource', 'linksto', 'category', 'unitname'], dtype='object')]

  encoding=encoding,


### Extract length of stay and in-hospital mortality

In [26]:
# Outcome table, one row per ICU stay (same index as patients_data).
outcomes = pd.DataFrame(index=patients_data.index)

# In-hospital mortality is flagged when either
#   * dod lies between hospital admittime and ICU outtime, or
#   * deathtime lies between hospital admittime and hospital dischtime.
# NOTE(review): the dod test is bounded by ICU outtime while the deathtime test
# is bounded by dischtime -- confirm that this asymmetry is intended.
died_by_dod = (
    patients_data.dod.notnull()
    & (patients_data.admittime <= patients_data.dod)
    & (patients_data.outtime >= patients_data.dod)
)
died_by_deathtime = (
    patients_data.deathtime.notnull()
    & (patients_data.admittime <= patients_data.deathtime)
    & (patients_data.dischtime >= patients_data.deathtime)
)
mortality = died_by_dod | died_by_deathtime
outcomes['in_hospital_mortality'] = mortality.astype(int)

# Length of stay (in hours): the 'los' column multiplied by 24.
outcomes['los'] = patients_data['los'] * 24.0
# key is passed by keyword: positional arguments other than the path are
# deprecated in recent pandas releases.
outcomes.to_hdf(data_path + 'vitals_hourly_data_v2.h5', key='Y')

In [19]:
# Preview the outcome table (in_hospital_mortality flag and los in hours per ICU stay).
outcomes

Unnamed: 0_level_0,in_hospital_mortality,los
icustay_id,Unnamed: 1_level_1,Unnamed: 2_level_1
211552,0,145.5504
294638,0,40.2840
228232,0,88.1496
220597,1,127.7544
229441,0,38.0256
...,...,...
286606,0,24.9576
226241,0,75.4224
242052,0,47.3880
229633,0,51.8760


In [27]:
# Also write the three result tables as CSV files into data_path.
for frame, filename in (
    (events_data, 'events_data_v2.csv'),
    (patients_data, 'patients_data_v2.csv'),
    (outcomes, 'outcomes_v2.csv'),
):
    frame.to_csv(data_path + filename)

--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------