In [1]:
import pandas as pd
from itertools import islice
import numpy as np
import json
import os
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedShuffleSplit

import seaborn as sn
sn.set_theme(style="white", palette="rocket_r")

In [2]:
# Process the time-series
def timeseries_processing(timeseries, labels, hour):
    timeseries.reset_index(level=1, inplace=True)
    timeseries.time = pd.to_timedelta(timeseries.time, errors='coerce')
    
    # Convert the time-stamps into minutes
    timeseries.time = timeseries.time.astype(int)/(1000000000*60)
        
    # Add time of event
    timeseries = timeseries.merge(labels, left_index=True, right_index=True)
    
    # Only keep those timestamps with more than time before the event for prediction
    timeseries = timeseries[timeseries['time'] <= (timeseries['actualiculos'] - hour*60)]
    
    # Drop the labels column, we will add them at the end for consistency and to avoid redundancy
    timeseries = timeseries.drop(columns=['actualiculos', 'actualhospitalmortality'])
    
    timeseries.reset_index(inplace=True)
    timeseries.set_index(['patient', 'time'], inplace=True)
    
    # Get the means and standard deviations of the timeseries
    df_mean = pd.concat({'mean': timeseries.groupby(level=0, axis=0).mean()}, axis=1).swaplevel(axis=1)
    df_mean.columns = df_mean.columns.droplevel(level = 1)
    df_mean = df_mean.add_suffix('_mean')

    df_std = pd.concat({'std': timeseries.groupby(level=0, axis=0).std()}, axis=1).swaplevel(axis=1)
    df_std.columns = df_std.columns.droplevel(level = 1)
    df_std = df_std.add_suffix('_std')

    timeseries = df_mean.merge(df_std, left_index=True, right_index=True)
    
    return timeseries

In [3]:
labels = pd.read_csv('preprocessed_labels.csv', index_col='patient')
labels['actualiculos'] = labels['actualiculos']*24*60

In [4]:
labels

Unnamed: 0_level_0,uniquepid,patienthealthsystemstayid,actualhospitalmortality,actualiculos
patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
30000153,12466550,23998182,0,2357.166667
30000213,13180007,27543152,0,2354.800000
30000484,18421337,22413411,0,3569.600000
30000646,12207593,22795209,1,6764.433333
30001148,12980335,23552849,0,1634.583333
...,...,...,...,...
39999301,16180713,20178444,1,1651.550000
39999384,15498623,27161460,0,1837.100000
39999552,11256534,26910235,0,1681.366667
39999562,15403458,25335698,0,7194.666667


# Diagnosis

In [6]:
diagnosis = pd.read_csv('diagnosis.csv')

In [7]:
diagnosis

Unnamed: 0,subject_id,stay_id,seq_num,icd_code,icd_version,icd_title
0,10000032,32952584,1,4589,9,HYPOTENSION NOS
1,10000032,32952584,2,07070,9,UNSPECIFIED VIRAL HEPATITIS C WITHOUT HEPATIC ...
2,10000032,32952584,3,V08,9,ASYMPTOMATIC HIV INFECTION
3,10000032,33258284,1,5728,9,"OTH SEQUELA, CHR LIV DIS"
4,10000032,33258284,2,78959,9,OTHER ASCITES
...,...,...,...,...,...,...
946687,19999828,30712109,1,K632,10,Fistula of intestine
946688,19999828,32917002,1,E1110,10,Type 2 diabetes mellitus with ketoacidosis wit...
946689,19999828,32917002,2,Z7984,10,Long term (current) use of oral hypoglycemic d...
946690,19999914,32002659,1,R4182,10,"Altered mental status, unspecified"


In [8]:
edstays = pd.read_csv('edstays.csv')

In [9]:
edstays

Unnamed: 0,subject_id,hadm_id,stay_id,intime,outtime,gender,race,arrival_transport,disposition
0,10000032,22595853.0,33258284,2180-05-06 19:17:00,2180-05-06 23:30:00,F,WHITE,AMBULANCE,ADMITTED
1,10000032,22841357.0,38112554,2180-06-26 15:54:00,2180-06-26 21:31:00,F,WHITE,AMBULANCE,ADMITTED
2,10000032,25742920.0,35968195,2180-08-05 20:58:00,2180-08-06 01:44:00,F,WHITE,AMBULANCE,ADMITTED
3,10000032,29079034.0,32952584,2180-07-22 16:24:00,2180-07-23 05:54:00,F,WHITE,AMBULANCE,HOME
4,10000032,29079034.0,39399961,2180-07-23 05:54:00,2180-07-23 14:00:00,F,WHITE,AMBULANCE,ADMITTED
...,...,...,...,...,...,...,...,...,...
447707,19999784,26194817.0,35692999,2119-06-18 14:21:00,2119-06-18 21:09:29,M,BLACK/AFRICAN AMERICAN,WALK IN,ADMITTED
447708,19999828,25744818.0,32917002,2149-01-08 09:11:00,2149-01-08 18:12:00,F,WHITE,AMBULANCE,ADMITTED
447709,19999828,29734428.0,30712109,2147-07-17 17:18:00,2147-07-18 17:34:00,F,WHITE,WALK IN,ADMITTED
447710,19999914,,32002659,2158-12-24 11:41:00,2158-12-24 11:56:00,F,UNKNOWN,UNKNOWN,ELOPED


In [10]:
ED = pd.merge(diagnosis, edstays, left_on=['subject_id', 'stay_id'], right_on=['subject_id', 'stay_id'], how='right')

In [11]:
ED

Unnamed: 0,subject_id,stay_id,seq_num,icd_code,icd_version,icd_title,hadm_id,intime,outtime,gender,race,arrival_transport,disposition
0,10000032,33258284,1.0,5728,9.0,"OTH SEQUELA, CHR LIV DIS",22595853.0,2180-05-06 19:17:00,2180-05-06 23:30:00,F,WHITE,AMBULANCE,ADMITTED
1,10000032,33258284,2.0,78959,9.0,OTHER ASCITES,22595853.0,2180-05-06 19:17:00,2180-05-06 23:30:00,F,WHITE,AMBULANCE,ADMITTED
2,10000032,33258284,3.0,07070,9.0,UNSPECIFIED VIRAL HEPATITIS C WITHOUT HEPATIC ...,22595853.0,2180-05-06 19:17:00,2180-05-06 23:30:00,F,WHITE,AMBULANCE,ADMITTED
3,10000032,33258284,4.0,V08,9.0,ASYMPTOMATIC HIV INFECTION,22595853.0,2180-05-06 19:17:00,2180-05-06 23:30:00,F,WHITE,AMBULANCE,ADMITTED
4,10000032,38112554,1.0,78959,9.0,OTHER ASCITES,22841357.0,2180-06-26 15:54:00,2180-06-26 21:31:00,F,WHITE,AMBULANCE,ADMITTED
...,...,...,...,...,...,...,...,...,...,...,...,...,...
947869,19999828,32917002,1.0,E1110,10.0,Type 2 diabetes mellitus with ketoacidosis wit...,25744818.0,2149-01-08 09:11:00,2149-01-08 18:12:00,F,WHITE,AMBULANCE,ADMITTED
947870,19999828,32917002,2.0,Z7984,10.0,Long term (current) use of oral hypoglycemic d...,25744818.0,2149-01-08 09:11:00,2149-01-08 18:12:00,F,WHITE,AMBULANCE,ADMITTED
947871,19999828,30712109,1.0,K632,10.0,Fistula of intestine,29734428.0,2147-07-17 17:18:00,2147-07-18 17:34:00,F,WHITE,WALK IN,ADMITTED
947872,19999914,32002659,1.0,R4182,10.0,"Altered mental status, unspecified",,2158-12-24 11:41:00,2158-12-24 11:56:00,F,UNKNOWN,UNKNOWN,ELOPED


In [12]:
searchfor = ['myocardial', 'coronary artery', 'acute coronary', 'PTCA', 'coronary syndrome', 'infarction', 'non-Q']
ED_MI = ED[ED.icd_title.str.contains('|'.join(searchfor), na=False)]

In [13]:
labels['patient'] = labels.index
ED_linked = pd.merge(ED_MI, labels, left_on=['subject_id', 'hadm_id'], right_on=['uniquepid', 'patienthealthsystemstayid'], how='left')

In [14]:
ED_linked

Unnamed: 0,subject_id,stay_id,seq_num,icd_code,icd_version,icd_title,hadm_id,intime,outtime,gender,race,arrival_transport,disposition,uniquepid,patienthealthsystemstayid,actualhospitalmortality,actualiculos,patient
0,10001667,33673933,1.0,I639,10.0,"Cerebral infarction, unspecified",22672901.0,2173-08-22 01:40:00,2173-08-22 17:17:16,F,WHITE,AMBULANCE,ADMITTED,,,,,
1,10001884,34226385,3.0,I2510,10.0,Athscl heart disease of native coronary artery...,28669374.0,2130-11-19 13:41:00,2130-11-20 10:46:00,F,BLACK/AFRICAN AMERICAN,WALK IN,HOME,,,,,
2,10003299,32908139,2.0,I639,10.0,"Cerebral infarction, unspecified",29323205.0,2181-10-22 11:46:00,2181-10-22 19:09:15,F,BLACK/AFRICAN AMERICAN,AMBULANCE,ADMITTED,,,,,
3,10004606,34549994,3.0,I2510,10.0,Athscl heart disease of native coronary artery...,,2159-05-17 19:15:00,2159-05-17 23:41:00,F,WHITE,AMBULANCE,HOME,,,,,
4,10005464,33455046,2.0,I2510,10.0,Athscl heart disease of native coronary artery...,,2165-06-28 02:17:00,2165-06-28 09:31:00,M,WHITE - RUSSIAN,AMBULANCE,HOME,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4798,19988997,39070828,1.0,I214,10.0,Non-ST elevation (NSTEMI) myocardial infarction,29807937.0,2174-06-27 14:13:00,2174-06-27 15:23:00,M,UNKNOWN,AMBULANCE,ADMITTED,,,,,
4799,19989783,32849404,5.0,I214,10.0,Non-ST elevation (NSTEMI) myocardial infarction,24282820.0,2130-08-02 12:52:00,2130-08-02 17:28:05,M,BLACK/AFRICAN AMERICAN,AMBULANCE,ADMITTED,19989783.0,24282820.0,0.0,6987.1,32711376.0
4800,19990106,39207830,3.0,I2510,10.0,Athscl heart disease of native coronary artery...,25163479.0,2166-04-06 16:58:00,2166-04-07 15:33:00,M,BLACK/AFRICAN AMERICAN,AMBULANCE,HOME,,,,,
4801,19993089,39865345,1.0,I214,10.0,Non-ST elevation (NSTEMI) myocardial infarction,20556903.0,2156-09-10 18:29:00,2156-09-10 21:43:58,F,WHITE,AMBULANCE,ADMITTED,,,,,


In [15]:
ED_linked['actualhospitalmortality'].isna().sum()/4803, ED_linked['actualiculos'].isna().sum()/4803

(0.6953987091401208, 0.6953987091401208)

In [16]:
# Remove those with missing hospital discharge status or length of stay
ED_linked = ED_linked[ED_linked['actualhospitalmortality'].notna()]

In [17]:
columns_we_care_about = ['patient', 'actualhospitalmortality', 'actualiculos']
ED_linked = ED_linked[columns_we_care_about]

In [18]:
ED_linked = ED_linked.set_index('patient')

In [19]:
ED_linked

Unnamed: 0_level_0,actualhospitalmortality,actualiculos
patient,Unnamed: 1_level_1,Unnamed: 2_level_1
32506122.0,0.0,2913.066667
34992648.0,0.0,8349.433333
33768181.0,0.0,1074.250000
32769810.0,0.0,9067.533333
31203589.0,0.0,1579.316667
...,...,...
31123584.0,0.0,28613.483333
38791957.0,0.0,3897.983333
34859288.0,0.0,2816.683333
32711376.0,0.0,6987.100000


# Timeseries

In [None]:
timeseries = pd.read_csv('preprocessed_timeseries.csv', index_col=['patient', 'time'])

In [9]:
timeseries

Unnamed: 0_level_0,Unnamed: 1_level_0,Activity / Mobility (JH-HLM),Apnea Interval,Arterial Blood Pressure Alarm - High,Arterial Blood Pressure Alarm - Low,Arterial Blood Pressure diastolic,Arterial Blood Pressure mean,Arterial Blood Pressure systolic,Chloride (serum),Creatinine (serum),Current Dyspnea Assessment,...,Strength R Arm,Strength R Leg,Temperature Fahrenheit,Tidal Volume (observed),Tidal Volume (set),Tidal Volume (spontaneous),Total PEEP Level,Ventilator Mode,Vti High,Braden Score
patient,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
30000153,-1 days +23:51:00,,,,,,,,,,,...,,,,,,,5.0,,,
30000153,-1 days +23:52:00,,20.0,,,,,,,,,...,,,,492.0,500.0,,,49.0,2.04,
30000153,-1 days +23:56:00,,,,,,,,,,,...,,,,,,,,,,
30000153,-1 days +23:57:00,,,,,,,,,,,...,,,,,,,,,,
30000153,-1 days +23:59:00,,,,,,,,,,,...,,,96.8,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39999810,4 days 13:24:00,,,,,,,,,,,...,,,,,,,,,,
39999810,4 days 15:23:00,2.0,,,,,,,,,,...,5.0,5.0,,,,,,,,
39999810,4 days 15:24:00,,,,,,,,,,,...,,,,,,,,,,
39999810,4 days 15:26:00,,,,,,,,,,,...,,,,,,,,,,


In [10]:
timeseries_summary = timeseries_processing(timeseries, labels, 0)

In [11]:
timeseries_summary

Unnamed: 0_level_0,Activity / Mobility (JH-HLM)_mean,Apnea Interval_mean,Arterial Blood Pressure Alarm - High_mean,Arterial Blood Pressure Alarm - Low_mean,Arterial Blood Pressure diastolic_mean,Arterial Blood Pressure mean_mean,Arterial Blood Pressure systolic_mean,Chloride (serum)_mean,Creatinine (serum)_mean,Current Dyspnea Assessment_mean,...,Temperature Fahrenheit_std,Tidal Volume (observed)_std,Tidal Volume (set)_std,Tidal Volume (spontaneous)_std,Total PEEP Level_std,Ventilator Mode_std,Vti High_std,Braden Score_std,uniquepid_std,patienthealthsystemstayid_std
patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
30000153,,20.0,156.666667,96.666667,66.935484,90.300000,137.387097,115.000000,1.000000,,...,0.950438,78.542982,0.000000,,0.000000,21.939310,0.0,1.356203,0.0,0.0
30000213,3.500000,40.0,,,,,,100.666667,3.666667,0.0,...,0.636896,107.637951,162.388423,25.979158,2.903618,,0.0,1.032796,0.0,0.0
30000484,,,,,,,,105.000000,1.233333,,...,0.907538,,,,,,,1.363442,0.0,0.0
30000646,,,,,,,,110.571429,0.700000,,...,0.949464,,,,,,,2.286737,0.0,0.0
30001148,,20.0,106.666667,70.000000,58.833333,73.541667,108.291667,109.500000,0.600000,,...,1.864314,289.206674,,,,26.870058,0.0,2.857738,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39999301,,,150.000000,90.000000,,,,108.500000,1.650000,,...,0.479236,,,,,,,1.500000,0.0,0.0
39999384,2.500000,,100.000000,80.000000,49.857143,87.642857,167.642857,103.000000,2.200000,0.0,...,0.335676,,,,,,,1.732051,0.0,0.0
39999552,,20.0,143.333333,90.000000,62.888889,83.185185,120.407407,105.666667,0.700000,,...,,267.664591,28.867513,,,27.135463,0.0,0.000000,0.0,0.0
39999562,2.333333,,,,,,,102.500000,0.850000,0.0,...,0.451422,,,,,,,2.368778,0.0,0.0


In [12]:
# Drop row that has all NaN values
timeseries_summary=timeseries_summary.dropna(how='all')

In [13]:
timeseries_summary

Unnamed: 0_level_0,Activity / Mobility (JH-HLM)_mean,Apnea Interval_mean,Arterial Blood Pressure Alarm - High_mean,Arterial Blood Pressure Alarm - Low_mean,Arterial Blood Pressure diastolic_mean,Arterial Blood Pressure mean_mean,Arterial Blood Pressure systolic_mean,Chloride (serum)_mean,Creatinine (serum)_mean,Current Dyspnea Assessment_mean,...,Temperature Fahrenheit_std,Tidal Volume (observed)_std,Tidal Volume (set)_std,Tidal Volume (spontaneous)_std,Total PEEP Level_std,Ventilator Mode_std,Vti High_std,Braden Score_std,uniquepid_std,patienthealthsystemstayid_std
patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
30000153,,20.0,156.666667,96.666667,66.935484,90.300000,137.387097,115.000000,1.000000,,...,0.950438,78.542982,0.000000,,0.000000,21.939310,0.0,1.356203,0.0,0.0
30000213,3.500000,40.0,,,,,,100.666667,3.666667,0.0,...,0.636896,107.637951,162.388423,25.979158,2.903618,,0.0,1.032796,0.0,0.0
30000484,,,,,,,,105.000000,1.233333,,...,0.907538,,,,,,,1.363442,0.0,0.0
30000646,,,,,,,,110.571429,0.700000,,...,0.949464,,,,,,,2.286737,0.0,0.0
30001148,,20.0,106.666667,70.000000,58.833333,73.541667,108.291667,109.500000,0.600000,,...,1.864314,289.206674,,,,26.870058,0.0,2.857738,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39999301,,,150.000000,90.000000,,,,108.500000,1.650000,,...,0.479236,,,,,,,1.500000,0.0,0.0
39999384,2.500000,,100.000000,80.000000,49.857143,87.642857,167.642857,103.000000,2.200000,0.0,...,0.335676,,,,,,,1.732051,0.0,0.0
39999552,,20.0,143.333333,90.000000,62.888889,83.185185,120.407407,105.666667,0.700000,,...,,267.664591,28.867513,,,27.135463,0.0,0.000000,0.0,0.0
39999562,2.333333,,,,,,,102.500000,0.850000,0.0,...,0.451422,,,,,,,2.368778,0.0,0.0


# Lab Measurements

In [14]:
timeseries_lab = pd.read_csv('preprocessed_timeseries_lab.csv', index_col=['patient', 'time'])

In [15]:
timeseries_lab_summary = timeseries_processing(timeseries_lab, labels, 0)

In [16]:
timeseries_lab_summary

Unnamed: 0_level_0,Alanine Aminotransferase (ALT)_mean,Alkaline Phosphatase_mean,Anion Gap_mean,Asparate Aminotransferase (AST)_mean,Base Excess_mean,Bicarbonate_mean,"Bilirubin, Total_mean","Calcium, Total_mean",Calculated Total CO2_mean,Chloride_mean,...,Sodium_std,"Sodium, Whole Blood_std",Temperature_std,Urea Nitrogen_std,White Blood Cells_std,pCO2_std,pH_std,pO2_std,uniquepid_std,patienthealthsystemstayid_std
patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
30000153,15.000000,,12.000000,62.000000,-3.333333,21.000000,,7.700000,22.666667,115.000000,...,2.121320,0.707107,,0.000000,1.272792,1.732051,0.005774,26.153394,0.0,0.0
30000213,12.000000,97.000000,15.666667,19.000000,0.500000,23.333333,0.300000,8.333333,27.000000,100.666667,...,0.577350,,,0.577350,,4.949747,1.100409,4.949747,0.0,0.0
30000484,12.333333,75.333333,10.000000,34.333333,1.000000,27.666667,0.333333,8.133333,33.000000,105.000000,...,2.000000,,,5.507571,3.137409,,0.510523,,0.0,0.0
30000646,22.000000,97.000000,12.000000,19.000000,-1.000000,22.125000,0.700000,7.625000,21.000000,109.500000,...,1.846812,,,1.927248,0.988144,,0.028284,,0.0,0.0
30001148,9.000000,94.000000,10.500000,15.000000,2.444444,27.666667,0.500000,,28.111111,106.666667,...,0.707107,2.081666,,2.081666,2.458997,5.622376,0.147124,92.030037,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39999301,13.000000,59.000000,13.500000,18.000000,-2.666667,23.500000,0.400000,8.800000,22.666667,108.500000,...,0.000000,,,3.535534,0.850490,5.507571,0.045092,143.238728,0.0,0.0
39999384,,,16.000000,,,24.000000,,8.600000,,103.000000,...,,,,,0.424264,,,,0.0,0.0
39999552,,,11.500000,,3.500000,26.000000,,8.800000,27.166667,105.666667,...,2.081666,1.527525,,0.000000,0.750555,2.786874,0.293762,118.390315,0.0,0.0
39999562,,,11.333333,,,24.333333,,8.500000,,103.666667,...,3.055050,,,0.577350,0.351188,,,,0.0,0.0


# Static Features

In [17]:
flats = pd.read_csv('preprocessed_flat.csv', index_col='patient')

In [18]:
flats = flats.drop(['nullheight'], axis=1)

In [34]:
flats

Unnamed: 0_level_0,gender,age,height,weight,hour,eyes,motor,verbal,ethnicity_BLACK/AFRICAN AMERICAN,ethnicity_OTHER,...,admission_location_EMERGENCY ROOM,admission_location_PHYSICIAN REFERRAL,admission_location_PROCEDURE SITE,admission_location_TRANSFER FROM HOSPITAL,admission_location_TRANSFER FROM SKILLED NURSING FACILITY,admission_location_WALK-IN/SELF REFERRAL,admission_location_misc,insurance_Medicaid,insurance_Medicare,insurance_Other
patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
30000153,1,61,0.0,70.000000,12,3.666667,5.933333,3.800000,0,0,...,1,0,0,0,0,0,0,0,0,1
30000213,1,66,160.0,84.700000,5,3.666667,5.666667,3.000000,0,0,...,0,1,0,0,0,0,0,0,1,0
30000484,1,92,163.0,68.500000,17,3.866667,4.933333,3.066667,0,0,...,1,0,0,0,0,0,0,0,1,0
30000646,1,44,0.0,4.000000,1,6.000000,5.000000,170.000000,0,0,...,1,0,0,0,0,0,0,1,0,0
30001148,1,68,183.0,65.700000,11,3.200000,5.000000,4.200000,0,0,...,0,0,0,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39999301,1,77,170.0,107.700000,15,4.000000,6.000000,5.000000,1,0,...,1,0,0,0,0,0,0,0,0,1
39999384,0,81,0.0,67.000000,19,4.000000,6.000000,5.000000,0,0,...,0,1,0,0,0,0,0,0,1,0
39999552,0,72,173.0,64.500000,14,3.000000,4.750000,4.000000,0,0,...,0,1,0,0,0,0,0,0,0,1
39999562,1,71,0.0,62.000000,17,4.000000,6.000000,5.000000,0,0,...,0,1,0,0,0,0,0,0,1,0


In [35]:
labels.merge(timeseries_lab_summary, left_index=True, right_index=True)

Unnamed: 0_level_0,uniquepid,patienthealthsystemstayid,actualhospitalmortality,actualiculos,patient,Alanine Aminotransferase (ALT)_mean,Alkaline Phosphatase_mean,Anion Gap_mean,Asparate Aminotransferase (AST)_mean,Base Excess_mean,...,Sodium_std,"Sodium, Whole Blood_std",Temperature_std,Urea Nitrogen_std,White Blood Cells_std,pCO2_std,pH_std,pO2_std,uniquepid_std,patienthealthsystemstayid_std
patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
30000153,12466550,23998182,0,2357.166667,30000153,15.000000,,12.000000,62.000000,-3.333333,...,2.121320,0.707107,,0.000000,1.272792,1.732051,0.005774,26.153394,0.0,0.0
30000213,13180007,27543152,0,2354.800000,30000213,12.000000,97.000000,15.666667,19.000000,0.500000,...,0.577350,,,0.577350,,4.949747,1.100409,4.949747,0.0,0.0
30000484,18421337,22413411,0,3569.600000,30000484,12.333333,75.333333,10.000000,34.333333,1.000000,...,2.000000,,,5.507571,3.137409,,0.510523,,0.0,0.0
30000646,12207593,22795209,1,6764.433333,30000646,22.000000,97.000000,12.000000,19.000000,-1.000000,...,1.846812,,,1.927248,0.988144,,0.028284,,0.0,0.0
30001148,12980335,23552849,0,1634.583333,30001148,9.000000,94.000000,10.500000,15.000000,2.444444,...,0.707107,2.081666,,2.081666,2.458997,5.622376,0.147124,92.030037,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39999301,16180713,20178444,1,1651.550000,39999301,13.000000,59.000000,13.500000,18.000000,-2.666667,...,0.000000,,,3.535534,0.850490,5.507571,0.045092,143.238728,0.0,0.0
39999384,15498623,27161460,0,1837.100000,39999384,,,16.000000,,,...,,,,,0.424264,,,,0.0,0.0
39999552,11256534,26910235,0,1681.366667,39999552,,,11.500000,,3.500000,...,2.081666,1.527525,,0.000000,0.750555,2.786874,0.293762,118.390315,0.0,0.0
39999562,15403458,25335698,0,7194.666667,39999562,,,11.333333,,,...,3.055050,,,0.577350,0.351188,,,,0.0,0.0


In [36]:
labels.merge(timeseries_summary, left_index=True, right_index=True)

Unnamed: 0_level_0,uniquepid,patienthealthsystemstayid,actualhospitalmortality,actualiculos,patient,Activity / Mobility (JH-HLM)_mean,Apnea Interval_mean,Arterial Blood Pressure Alarm - High_mean,Arterial Blood Pressure Alarm - Low_mean,Arterial Blood Pressure diastolic_mean,...,Temperature Fahrenheit_std,Tidal Volume (observed)_std,Tidal Volume (set)_std,Tidal Volume (spontaneous)_std,Total PEEP Level_std,Ventilator Mode_std,Vti High_std,Braden Score_std,uniquepid_std,patienthealthsystemstayid_std
patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
30000153,12466550,23998182,0,2357.166667,30000153,,20.0,156.666667,96.666667,66.935484,...,0.950438,78.542982,0.000000,,0.000000,21.939310,0.0,1.356203,0.0,0.0
30000213,13180007,27543152,0,2354.800000,30000213,3.500000,40.0,,,,...,0.636896,107.637951,162.388423,25.979158,2.903618,,0.0,1.032796,0.0,0.0
30000484,18421337,22413411,0,3569.600000,30000484,,,,,,...,0.907538,,,,,,,1.363442,0.0,0.0
30000646,12207593,22795209,1,6764.433333,30000646,,,,,,...,0.949464,,,,,,,2.286737,0.0,0.0
30001148,12980335,23552849,0,1634.583333,30001148,,20.0,106.666667,70.000000,58.833333,...,1.864314,289.206674,,,,26.870058,0.0,2.857738,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39999301,16180713,20178444,1,1651.550000,39999301,,,150.000000,90.000000,,...,0.479236,,,,,,,1.500000,0.0,0.0
39999384,15498623,27161460,0,1837.100000,39999384,2.500000,,100.000000,80.000000,49.857143,...,0.335676,,,,,,,1.732051,0.0,0.0
39999552,11256534,26910235,0,1681.366667,39999552,,20.0,143.333333,90.000000,62.888889,...,,267.664591,28.867513,,,27.135463,0.0,0.000000,0.0,0.0
39999562,15403458,25335698,0,7194.666667,39999562,2.333333,,,,,...,0.451422,,,,,,,2.368778,0.0,0.0


# For Death Prediction

In [33]:
final = ED_linked.merge(flats, left_index=True, right_index=True)
print(final.shape[0])
final = final.merge(timeseries_lab_summary, left_index=True, right_index=True)
print(final.shape[0])
final = final.merge(timeseries_summary, left_index=True, right_index=True)
print(final.shape[0])

1463
1433
1433


In [33]:
final
final = final.drop(columns=['uniquepid_mean_x', 'patienthealthsystemstayid_mean_x', 'uniquepid_std_x', 'patienthealthsystemstayid_std_x',
                           'uniquepid_mean_y', 'patienthealthsystemstayid_mean_y', 'uniquepid_std_y', 'patienthealthsystemstayid_std_y'])
final = final.dropna(thresh=final.shape[0]*0.4,how='all',axis=1)

In [34]:
final

Unnamed: 0_level_0,actualhospitalmortality,actualiculos,gender,age,height,weight,hour,eyes,motor,verbal,...,Respiratory Rate_std,Richmond-RAS Scale_std,Secondary diagnosis_std,Sodium (serum)_std,Strength L Arm_std,Strength L Leg_std,Strength R Arm_std,Strength R Leg_std,Temperature Fahrenheit_std,Braden Score_std
patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
30009596.0,0.0,2140.566667,0,77,0.0,72.0,7,4.000000,6.000000,5.000000,...,2.752621,0.000000,,,0.000000,0.000000,0.000000,0.000000,0.070711,
30016794.0,0.0,2996.666667,1,70,0.0,62.9,18,3.800000,5.933333,5.000000,...,5.188127,0.000000,0.0,0.577350,0.510418,0.000000,0.000000,0.000000,0.288675,
30018045.0,0.0,6942.716667,1,57,188.0,102.5,21,4.000000,6.000000,5.000000,...,4.541570,0.510754,0.0,3.361547,0.000000,0.000000,0.000000,0.000000,0.420178,1.290994
30030569.0,0.0,2345.000000,0,91,0.0,55.9,22,3.280000,6.000000,4.280000,...,5.631755,0.447214,,,0.403113,0.000000,0.341565,0.000000,1.005982,0.707107
30032853.0,0.0,3162.150000,1,55,188.0,125.0,12,3.625000,5.375000,4.500000,...,4.182711,1.802776,0.0,1.861899,0.516398,0.516398,0.516398,0.516398,0.309839,2.886751
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39964005.0,1.0,3371.050000,0,68,165.0,76.5,1,1.428571,4.357143,1.000000,...,4.858897,1.164965,0.0,11.430952,0.000000,0.000000,0.000000,0.000000,0.684769,0.516398
39965733.0,0.0,17183.550000,1,75,167.0,64.1,20,3.660714,5.875000,2.928571,...,5.175042,0.760279,0.0,5.910484,0.429415,0.753244,0.429415,0.753244,0.424300,1.627882
39971339.0,0.0,7596.250000,0,83,157.0,63.0,10,3.769231,5.615385,4.384615,...,3.103539,1.165287,0.0,2.439750,0.408248,0.688737,0.408248,0.688737,0.618166,1.804036
39977970.0,1.0,2716.716667,1,91,0.0,75.8,21,3.666667,4.916667,1.083333,...,6.443721,0.000000,0.0,,1.133893,1.133893,0.899735,0.786796,0.230217,1.414214


In [37]:
final['actualhospitalmortality'].value_counts()

0.0    1006
1.0     137
Name: actualhospitalmortality, dtype: int64

In [33]:
list(final.columns)

['actualhospitalmortality',
 'actualiculos',
 'gender',
 'age',
 'height',
 'weight',
 'hour',
 'eyes',
 'motor',
 'verbal',
 'ethnicity_BLACK/AFRICAN AMERICAN',
 'ethnicity_OTHER',
 'ethnicity_UNKNOWN',
 'ethnicity_WHITE',
 'ethnicity_WHITE - OTHER EUROPEAN',
 'ethnicity_misc',
 'first_careunit_Cardiac Vascular Intensive Care Unit (CVICU)',
 'first_careunit_Coronary Care Unit (CCU)',
 'first_careunit_Medical Intensive Care Unit (MICU)',
 'first_careunit_Medical/Surgical Intensive Care Unit (MICU/SICU)',
 'first_careunit_Neuro Intermediate',
 'first_careunit_Neuro Stepdown',
 'first_careunit_Neuro Surgical Intensive Care Unit (Neuro SICU)',
 'first_careunit_Surgical Intensive Care Unit (SICU)',
 'first_careunit_Trauma SICU (TSICU)',
 'admission_location_EMERGENCY ROOM',
 'admission_location_PHYSICIAN REFERRAL',
 'admission_location_PROCEDURE SITE',
 'admission_location_TRANSFER FROM HOSPITAL',
 'admission_location_TRANSFER FROM SKILLED NURSING FACILITY',
 'admission_location_WALK-IN/SE

In [34]:
from collections import Counter
[k for k,v in Counter(list(final.columns)).items() if v>1]

[]

In [35]:
pd.set_option('display.max_columns', 1000)  # or 1000
pd.set_option('display.max_rows', 1000)  # or 1000
pd.set_option('display.max_colwidth', 1000)  # or 199

# Missing values per feature
final.isnull().sum(axis = 0)/1410*100

actualhospitalmortality                                             0.000000
actualiculos                                                        0.000000
gender                                                              0.000000
age                                                                 0.000000
height                                                              0.000000
weight                                                              0.000000
hour                                                                0.000000
eyes                                                                0.000000
motor                                                               0.000000
verbal                                                              0.000000
ethnicity_BLACK/AFRICAN AMERICAN                                    0.000000
ethnicity_OTHER                                                     0.000000
ethnicity_UNKNOWN                                                   0.000000

In [36]:
# Split the data into train and test
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)

X = final.drop('actualhospitalmortality', axis=1, inplace=False).to_numpy()
y = final['actualhospitalmortality'].to_numpy()

sss.get_n_splits(X, y)

for train_index, test_index in sss.split(X, y):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

TRAIN: [1127   79  942  594  325 1043  347  613  717  392  538  759  209  271
  602  232  453  433  588  367  766  365  936  241  848  801  404  448
  737   18  852  270  931  545 1126  567  819  506  650   66 1117  185
  430  398  144 1029  954  422  997  896  150 1018  623  932  410  383
  692  510  289   32  206  878  492  564  843  184  517  790  520  330
  706 1039  820  204   38  375  840  220  792  389  738  529 1057  898
   99  568  118  208 1084  577  548  504  550  440   40  298  388  746
 1089  980  501  156  718  626  691  360  608  845  945  155   26 1092
  162 1080  134   42  227  164  480  481  167  502   93 1013  257  317
   88  891  989  832  559 1137 1066   90  434  161  833  247  590  273
 1094  583 1100    3  327  350  121  265  686  157    0  497  800  809
  673  539  301  970    4  535  414   76 1106  874  395  498  734  870
  418 1067  533  552   11   36 1025 1076   62  743  165  451  470  782
 1001    9  816  181 1033   44  901  785   55   89   48  307  731 1112

In [38]:
np.save('X_train_static_24', X_train)
np.save('X_test_static_24', X_test)
np.save('y_train_static_24', y_train)
np.save('y_test_static_24', y_test)

In [20]:
X_train_static_6 = np.load('X_train_static_6.npy')
X_test_static_6 = np.load('X_test_static_6.npy')
y_train_static_6 = np.load('y_train_static_6.npy')
y_test_static_6 = np.load('y_test_static_6.npy')

In [25]:
# Train and test split
np.shape(X_train_static_6), np.shape(X_test_static_6)

((1128, 174), (282, 174))

In [19]:
# LOS distribution
print((np.mean(X_train_static_6[:,0])/(64*24), np.std(X_train_static_6[:,0])/(64*24), np.median(X_train_static_6[:,0])/(64*24)))
print(np.mean(X_test_static_6[:,0])/(64*24), np.std(X_test_static_6[:,0])/(64*24), np.median(X_test_static_6[:,0])/(64*24))

(3.687854542593208, 4.886174456135874, 2.1766927083333334)
3.2416780175704294 3.2018417858890795 2.0694118923611113


In [13]:
# Women vs Men distribution
unique, counts = np.unique(X_train_static_6[:,1], return_counts=True)
result = dict(zip(unique, counts))
print(result)

unique, counts = np.unique(X_test_static_6[:,1], return_counts=True)
result = dict(zip(unique, counts))
print(result)

{0.0: 543, 1.0: 585}
{0.0: 126, 1.0: 156}


In [15]:
# Age distribution
print(np.mean(X_train_static_6[:,2]), np.std(X_train_static_6[:,2]), np.median(X_train_static_6[:,2]))
print(np.mean(X_test_static_6[:,2]), np.std(X_test_static_6[:,2]), np.median(X_test_static_6[:,2]))

70.8936170212766 14.470538486189023 72.0
71.01418439716312 15.062039935872813 72.0


In [21]:
# Death distribution
unique, counts = np.unique(y_train_static_6, return_counts=True)
result = dict(zip(unique, counts))
print(result)

unique, counts = np.unique(y_test_static_6, return_counts=True)
result = dict(zip(unique, counts))
print(result)

{0.0: 998, 1.0: 130}
{0.0: 250, 1.0: 32}
