In [1]:
from __future__ import print_function

# Import libraries
import numpy as np
import pandas as pd
import sklearn
import psycopg2
import sys
import datetime as dt
import mp_utils as mp

# to display dataframes in notebooks
# from IPython.display import display, HTML

from collections import OrderedDict

# used to print out pretty pandas dataframes

from sklearn.pipeline import Pipeline

# used to impute mean for data and standardize for computational stability
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# logistic regression is our favourite model ever
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV # l2 regularized regression
from sklearn.linear_model import LassoCV
from sklearn.ensemble import RandomForestClassifier

# used to calculate AUROC/accuracy
from sklearn import metrics

# gradient boosting - must download package https://github.com/dmlc/xgboost
import xgboost as xgb

#import matplotlib
#import matplotlib.pyplot as plt
#from matplotlib.font_manager import FontProperties # for unicode fonts
#%matplotlib inline

# below config used on pc70
#sqluser = 'alistairewj'
#dbname = 'mimic'
#schema_name = 'mimiciii'
#query_schema = 'SET search_path to public,' + schema_name + ';'


# two options for loading data
# option 1) use SQL - requires database and to have run queries/make_all.sql
# option 2) use CSVs downloaded
USE_SQL=0
USE_CSV=1

In [2]:
if USE_SQL:
    # Connect to local postgres version of mimic
    con = psycopg2.connect(dbname=dbname, user=sqluser)

    # exclusion criteria:
    #   - less than 15 years old
    #   - stayed in the ICU less than 4 hours
    #   - never have any chartevents data (i.e. likely administrative error)
    #   - organ donor accounts (administrative "readmissions" for patients who died in hospital)
    query = query_schema + \
    """
    select 
        *
    from dm_cohort
    """
    co = pd.read_sql_query(query,con)
    
    # convert the inclusion flags to boolean
    for c in co.columns:
        if c[0:10]=='inclusion_':
            co[c] = co[c].astype(bool)

    # extract static vars into a separate dataframe
    df_static = pd.read_sql_query(query_schema + 'select * from mp_static_data', con)
    #for dtvar in ['intime','outtime','deathtime']:
    #    df_static[dtvar] = pd.to_datetime(df_static[dtvar])

    vars_static = [u'is_male', u'emergency_admission', u'age',
                   # services
                   u'service_any_noncard_surg',
                   u'service_any_card_surg',
                   u'service_cmed',
                   u'service_traum',
                   u'service_nmed',
                   # ethnicities
                   u'race_black',u'race_hispanic',u'race_asian',u'race_other',
                   # phatness
                   u'height', u'weight', u'bmi']


    # get ~5 million rows containing data from errbody
    # this takes a little bit of time to load into memory (~2 minutes)

    # %%time results
    # CPU times: user 42.8 s, sys: 1min 3s, total: 1min 46s
    # Wall time: 2min 7s

    df = pd.read_sql_query(query_schema + 'select * from mp_data', con)
    df.drop('subject_id',axis=1,inplace=True)
    df.drop('hadm_id',axis=1,inplace=True)
    df.sort_values(['icustay_id','hr'],axis=0,ascending=True,inplace=True)

    # get death information
    df_death = pd.read_sql_query(query_schema + """
    select 
    co.subject_id, co.hadm_id, co.icustay_id
    , ceil(extract(epoch from (co.outtime - co.intime))/60.0/60.0) as dischtime_hours
    , ceil(extract(epoch from (adm.deathtime - co.intime))/60.0/60.0) as deathtime_hours
    , case when adm.deathtime is null then 0 else 1 end as death
    from dm_cohort co
    inner join admissions adm
    on co.hadm_id = adm.hadm_id
    where co.excluded = 0
    """, con)
    
    # get censoring information
    df_censor = pd.read_sql_query(query_schema + """
    select co.icustay_id, min(cs.charttime) as censortime
    , ceil(extract(epoch from min(cs.charttime-co.intime) )/60.0/60.0) as censortime_hours
    from dm_cohort co 
    inner join mp_code_status cs
    on co.icustay_id = cs.icustay_id
    where cmo+dnr+dni+dncpr+cmo_notes>0
    and co.excluded = 0
    group by co.icustay_id
    """, con)
    
    # extract static vars into a separate dataframe
    df_static = pd.read_sql_query(query_schema + 'select * from mp_static_data', con)
    
elif USE_CSV:
    co = pd.read_csv('.\data\data_compressed\df_cohort.csv')
    
    # convert the inclusion flags to boolean
    for c in co.columns:
        if c[0:10]=='inclusion_':
            co[c] = co[c].astype(bool)
    df = pd.read_csv('.\data\data_compressed\df_data.csv')
    df_static = pd.read_csv('.\data\data_compressed\df_static_data.csv')
    df_censor = pd.read_csv('.\data\data_compressed\df_censor.csv')
    df_death = pd.read_csv('.\data\data_compressed\df_death.csv')
    
else:
    print('Must use SQL or CSV to load data!')
    
    
print(df.shape)

(6386894, 55)


In [3]:
df = df.drop(df.columns[0], axis=1)
co = co.drop(co.columns[0], axis=1)
df_static = df_static.drop(df_static.columns[0], axis=1)
df_censor = df_censor.drop(df_censor.columns[0], axis=1)
df_death = df_death.drop(df_death.columns[0], axis=1)

# Base exclusion criteria

In [4]:
# print out the exclusions *SEQUENTIALLY* - i.e. if already excluded, don't re-print
print('Cohort - initial size: {} ICU stays'.format(co.shape[0]))

idxRem = np.zeros(co.shape[0],dtype=bool)
for c in co.columns:
    if c[0:len('exclusion_')]=='exclusion_':
        N_REM = np.sum( (co[c].values==1) )
        print('  {:5g} ({:2.2f}%) - {}'.format(N_REM,N_REM*100.0/co.shape[0], c))
        idxRem[co[c].values==1] = True

# summarize all exclusions
N_REM = np.sum(idxRem)
print('  {:5g} ({:2.2f}%) - {}'.format(N_REM,N_REM*100.0/co.shape[0], 'all exclusions'))
print('')
print('Final cohort size: {} ICU stays ({:2.2f}%).'.format(co.shape[0] - np.sum(idxRem), (1-np.mean(idxRem))*100.0))
co = co.loc[~idxRem,:]

Cohort - initial size: 52085 ICU stays
      0 (0.00%) - exclusion_over_15
      0 (0.00%) - exclusion_valid_data
      0 (0.00%) - exclusion_stay_lt_4hr
      0 (0.00%) - exclusion_organ_donor
      0 (0.00%) - all exclusions

Final cohort size: 52085 ICU stays (100.00%).


## Mortality stats

### Mortality in base cohort

In [5]:
# mortality stats for base cohort
for c in co.columns:
    if c[0:len('death_')]=='death_':
        N_ALL = co.shape[0]
        N = co.set_index('icustay_id').loc[:,c].sum()
        print('{:40s}{:5g} of {:5g} died ({:2.2f}%).'.format(c, N, N_ALL, N*100.0/N_ALL))

death_48hr_post_icu_admit                1614 of 52085 died (3.10%).
death_icu                                4185 of 52085 died (8.03%).
death_in_hospital                        6192 of 52085 died (11.89%).
death_30dy_post_icu_admit                7567 of 52085 died (14.53%).
death_30dy_post_icu_disch                8081 of 52085 died (15.52%).
death_30dy_post_hos_disch                8633 of 52085 died (16.57%).
death_6mo_post_hos_disch                12788 of 52085 died (24.55%).
death_1yr_post_hos_disch                15052 of 52085 died (28.90%).
death_2yr_post_hos_disch                17758 of 52085 died (34.09%).
death_30dy_post_hos_admit                7124 of 52085 died (13.68%).


### Mortality in MIMIC-II patients staying >= 24 hours

This is mainly an example of how the `inclFcn` works. It derives from the cohort a boolean index of patients to retain in the dataset.

In [6]:
inclFcn = lambda x: x.loc[x['inclusion_only_mimicii']&x['inclusion_stay_ge_24hr'],'icustay_id']

# mortality stats for base cohort
for c in co.columns:
    if c[0:len('death_')]=='death_':
        N_ALL = inclFcn(co).shape[0]
        N = co.set_index('icustay_id').loc[inclFcn(co),c].sum()
        print('{:40s}{:5g} of {:5g} died ({:2.2f}%).'.format(c, N, N_ALL, N*100.0/N_ALL))

death_48hr_post_icu_admit                 405 of 23497 died (1.72%).
death_icu                                2020 of 23497 died (8.60%).
death_in_hospital                        3034 of 23497 died (12.91%).
death_30dy_post_icu_admit                3613 of 23497 died (15.38%).
death_30dy_post_icu_disch                3933 of 23497 died (16.74%).
death_30dy_post_hos_disch                4212 of 23497 died (17.93%).
death_6mo_post_hos_disch                 6205 of 23497 died (26.41%).
death_1yr_post_hos_disch                 7362 of 23497 died (31.33%).
death_2yr_post_hos_disch                 8879 of 23497 died (37.79%).
death_30dy_post_hos_admit                3390 of 23497 died (14.43%).


Here we have the same function in a slightly more obscure way - with the benefit of being able to list all inclusions in a list. This just helps readability in the below code.

In [7]:
inclusions = ['inclusion_only_mimicii', 'inclusion_stay_ge_24hr']
inclFcn = lambda x: x.loc[x[inclusions].all(axis=1),'icustay_id']

# mortality stats for base cohort
for c in co.columns:
    if c[0:len('death_')]=='death_':
        N_ALL = inclFcn(co).shape[0]
        N = co.set_index('icustay_id').loc[inclFcn(co),c].sum()
        print('{:40s}{:5g} of {:5g} died ({:2.2f}%).'.format(c, N, N_ALL, N*100.0/N_ALL))

death_48hr_post_icu_admit                 405 of 23497 died (1.72%).
death_icu                                2020 of 23497 died (8.60%).
death_in_hospital                        3034 of 23497 died (12.91%).
death_30dy_post_icu_admit                3613 of 23497 died (15.38%).
death_30dy_post_icu_disch                3933 of 23497 died (16.74%).
death_30dy_post_hos_disch                4212 of 23497 died (17.93%).
death_6mo_post_hos_disch                 6205 of 23497 died (26.41%).
death_1yr_post_hos_disch                 7362 of 23497 died (31.33%).
death_2yr_post_hos_disch                 8879 of 23497 died (37.79%).
death_30dy_post_hos_admit                3390 of 23497 died (14.43%).


# Exclusion criteria

Each study has its own exclusion criteria (sometimes studies have multiple experiments). We define a dictionary of all exclusions with the dictionary key as the study name. Some studies have multiple experiments, so we append *a*, *b*, or *c*.

The dictionary stores a length 2 list. The first element defines the window for data extraction: it contains a dictionary of the windows and the corresponding window sizes. The second element is the exclusion criteria. Both are functions which use `co` or `df` as their input.

In [8]:
# first we can define the different windows: there aren't that many!
df_tmp=co.copy().set_index('icustay_id')

# admission+12 hours
time_12hr = df_tmp.copy()
time_12hr['windowtime'] = 12
time_12hr = time_12hr['windowtime'].to_dict()

# admission+24 hours
time_24hr = df_tmp.copy()
time_24hr['windowtime'] = 24
time_24hr = time_24hr['windowtime'].to_dict()

# admission+48 hours
time_48hr = df_tmp.copy()
time_48hr['windowtime'] = 48
time_48hr = time_48hr['windowtime'].to_dict()

# admission+72 hours
time_72hr = df_tmp.copy()
time_72hr['windowtime'] = 72
time_72hr = time_72hr['windowtime'].to_dict()

# admission+96 hours
time_96hr = df_tmp.copy()
time_96hr['windowtime'] = 96
time_96hr = time_96hr['windowtime'].to_dict()

# entire stay
time_all = df_tmp.copy()
time_all = time_all['dischtime_hours'].apply(np.ceil).astype(int).to_dict()

# 12 hours before the patient died/discharged
time_predeath = df_tmp.copy()
time_predeath['windowtime'] = time_predeath['dischtime_hours']
idx = time_predeath['deathtime_hours']<time_predeath['dischtime_hours']
time_predeath.loc[idx,'windowtime'] = time_predeath.loc[idx,'deathtime_hours']
# move from discharge/death time to 12 hours beforehand
time_predeath['windowtime'] = time_predeath['windowtime']-12
time_predeath = time_predeath['windowtime'].apply(np.ceil).astype(int).to_dict()

In [9]:
# example params used to extract patient data
# element 1: dictionary specifying end time of window for each patient
# element 2: size of window
# element 3: extra hours added to make it easier to get data on labs (and allows us to get labs pre-ICU)
# e.g. [time_24hr, 8, 24] is
#   (1) window ends at admission+24hr
#   (2) window is 8 hours long
#   (3) lab window is 8+24=32 hours long

def inclFcn(x, inclusions):
    return x.loc[x[inclusions].all(axis=1),'icustay_id']


# this one is used more than once, so we define it here
hugExclFcnMIMIC3 = lambda x: x.loc[x['inclusion_over_18']&x['inclusion_hug2009_obs']&x['inclusion_hug2009_not_nsicu_csicu']&x['inclusion_first_admission']&x['inclusion_full_code']&x['inclusion_not_brain_death']&x['inclusion_not_crf'],'icustay_id'].values
hugExclFcn = lambda x: np.intersect1d(hugExclFcnMIMIC3(x),x.loc[x['inclusion_only_mimicii'],'icustay_id'].values)


# physionet2012 subset - not exact but close
def physChallExclFcn(x):
    out = x.loc[x['inclusion_only_mimicii']&x['inclusion_over_18']&x['inclusion_stay_ge_48hr']&x['inclusion_has_saps'],'icustay_id'].values
    out = np.sort(out)
    out = out[0:4000]
    return out
 
# caballero2015 is a random subsample - then limits to 18yrs, resulting in 11648
def caballeroExclFcn(x):
    out = x.loc[x['inclusion_only_mimicii']&x['inclusion_over_18'],'icustay_id'].values
    out = np.sort(out)
    out = out[0:11648]
    return out

np.random.seed(546345)
W_extra = 24

exclusions = OrderedDict([
['caballero2015dynamically_a',  [[time_24hr, 24, W_extra], caballeroExclFcn, 'hospital_expire_flag']],
['caballero2015dynamically_b',  [[time_48hr, 48, W_extra], caballeroExclFcn, 'hospital_expire_flag']],
['caballero2015dynamically_c',  [[time_72hr, 72, W_extra], caballeroExclFcn, 'hospital_expire_flag']],
['calvert2016computational',    [[time_predeath, 5, W_extra], lambda x: x.loc[x['inclusion_over_18']&x['inclusion_only_micu']&x['inclusion_calvert2016_obs']&x['inclusion_stay_ge_17hr']&x['inclusion_stay_le_500hr']&x['inclusion_non_alc_icd9'],'icustay_id'].values, 'hospital_expire_flag']],
['calvert2016using',            [[time_predeath, 5, W_extra], lambda x: x.loc[x['inclusion_over_18']&x['inclusion_only_micu']&x['inclusion_calvert2016_obs']&x['inclusion_stay_ge_17hr']&x['inclusion_stay_le_500hr'],'icustay_id'].values, 'hospital_expire_flag']],
['celi2012database_a',          [[time_72hr, 72, W_extra], lambda x: x.loc[x['inclusion_only_mimicii']&x['inclusion_over_18']&x['inclusion_aki_icd9'],'icustay_id'].values , 'hospital_expire_flag']],
['celi2012database_b',          [[time_24hr, 24, W_extra], lambda x: x.loc[x['inclusion_only_mimicii']&x['inclusion_over_18']&x['inclusion_sah_icd9'],'icustay_id'].values , 'hospital_expire_flag']],
['che2016recurrent_a',          [[time_48hr, 48, W_extra], lambda x: x.loc[x['inclusion_over_18'],'icustay_id'].values , 'death_48hr_post_icu_admit']],
['che2016recurrent_b',          [[time_48hr, 48, W_extra], physChallExclFcn , 'hospital_expire_flag']],
['ding2016mortality',           [[time_48hr, 48, W_extra], physChallExclFcn , 'hospital_expire_flag']],
['ghassemi2014unfolding_a',     [[time_24hr, 24, W_extra], lambda x: x.loc[x['inclusion_only_mimicii']&x['inclusion_over_18']&x['inclusion_ge_100_non_stop_words']&x['inclusion_stay_ge_24hr'],'icustay_id'].values, 'hospital_expire_flag']],
['ghassemi2014unfolding_b',     [[time_12hr, 12, W_extra], lambda x: x.loc[x['inclusion_only_mimicii']&x['inclusion_over_18']&x['inclusion_ge_100_non_stop_words']&x['inclusion_stay_ge_12hr'],'icustay_id'].values, 'hospital_expire_flag']],
['ghassemi2014unfolding_c',     [[time_12hr, 12, W_extra], lambda x: x.loc[x['inclusion_only_mimicii']&x['inclusion_over_18']&x['inclusion_ge_100_non_stop_words']&x['inclusion_stay_ge_12hr'],'icustay_id'].values, 'death_30dy_post_hos_disch']],
['ghassemi2014unfolding_d',     [[time_12hr, 12, W_extra], lambda x: x.loc[x['inclusion_only_mimicii']&x['inclusion_over_18']&x['inclusion_ge_100_non_stop_words']&x['inclusion_stay_ge_12hr'],'icustay_id'].values, 'death_1yr_post_hos_disch']],
['ghassemi2015multivariate_a',    [[time_24hr, 24, W_extra], lambda x: x.loc[x['inclusion_only_mimicii']&x['inclusion_over_18']&x['inclusion_ge_100_non_stop_words']&x['inclusion_gt_6_notes']&x['inclusion_stay_ge_24hr']&x['inclusion_has_saps'],'icustay_id'].values, 'hospital_expire_flag']],
['ghassemi2015multivariate_b',    [[time_24hr, 24, W_extra], lambda x: x.loc[x['inclusion_only_mimicii']&x['inclusion_over_18']&x['inclusion_ge_100_non_stop_words']&x['inclusion_gt_6_notes']&x['inclusion_stay_ge_24hr']&x['inclusion_has_saps'],'icustay_id'].values, 'death_1yr_post_hos_disch']],
['grnarova2016neural_a',          [[time_all,  24, W_extra], lambda x: x.loc[x['inclusion_over_18']&x['inclusion_multiple_hadm'],'icustay_id'].values, 'hospital_expire_flag']],
['grnarova2016neural_b',          [[time_all,  24, W_extra], lambda x: x.loc[x['inclusion_over_18']&x['inclusion_multiple_hadm'],'icustay_id'].values, 'death_30dy_post_hos_disch']],
['grnarova2016neural_c',          [[time_all,  24, W_extra], lambda x: x.loc[x['inclusion_over_18']&x['inclusion_multiple_hadm'],'icustay_id'].values, 'death_1yr_post_hos_disch']],
['harutyunyan2017multitask',    [[time_48hr, 48, W_extra], lambda x: x.loc[x['inclusion_over_18']&x['inclusion_multiple_icustay'],'icustay_id'].values, 'hospital_expire_flag']],
['hoogendoorn2016prediction',   [[time_24hr, 24, W_extra], lambda x: x.loc[x['inclusion_only_mimicii']&x['inclusion_over_18']&x['inclusion_hug2009_obs']&x['inclusion_stay_ge_24hr'],'icustay_id'].values, 'hospital_expire_flag']],
['hug2009icu',                  [[time_24hr, 24, W_extra], hugExclFcn, 'death_30dy_post_icu_disch']],
['johnson2012patient',          [[time_48hr, 48, W_extra], physChallExclFcn, 'hospital_expire_flag']],
['johnson2014data',             [[time_48hr, 48, W_extra], physChallExclFcn, 'hospital_expire_flag']],
['joshi2012prognostic',         [[time_24hr, 24, W_extra], hugExclFcn, 'hospital_expire_flag']],
['joshi2016identifiable',       [[time_48hr, 48, W_extra], lambda x: x.loc[x['inclusion_over_18']&x['inclusion_stay_ge_48hr'],'icustay_id'].values, 'hospital_expire_flag']],
['lee2015customization_a',        [[time_24hr, 24, W_extra], lambda x: x.loc[x['inclusion_only_mimicii']&x['inclusion_over_18']&x['inclusion_lee2015_service']&x['inclusion_has_saps']&x['inclusion_stay_ge_24hr'],'icustay_id'].values, 'hospital_expire_flag']],
['lee2015customization_b',        [[time_24hr, 24, W_extra], lambda x: x.loc[x['inclusion_only_mimicii']&x['inclusion_over_18']&x['inclusion_lee2015_service']&x['inclusion_has_saps']&x['inclusion_stay_ge_24hr'],'icustay_id'].values, 'death_30dy_post_hos_disch']],
['lee2015customization_c',        [[time_24hr, 24, W_extra], lambda x: x.loc[x['inclusion_only_mimicii']&x['inclusion_over_18']&x['inclusion_lee2015_service']&x['inclusion_has_saps']&x['inclusion_stay_ge_24hr'],'icustay_id'].values, 'death_2yr_post_hos_disch']],
['lee2015personalized',         [[time_24hr, 24, W_extra], lambda x: x.loc[x['inclusion_only_mimicii']&x['inclusion_over_18']&x['inclusion_has_saps']&x['inclusion_stay_ge_24hr'],'icustay_id'].values, 'death_30dy_post_hos_disch']],
['lee2017patient',              [[time_24hr, 24, W_extra], lambda x: x.loc[x['inclusion_only_mimicii']&x['inclusion_over_18']&x['inclusion_has_saps']&x['inclusion_stay_ge_24hr'],'icustay_id'].values, 'death_30dy_post_hos_disch']],
['lehman2012risk',              [[time_24hr, 24, W_extra], lambda x: x.loc[x['inclusion_only_mimicii']&x['inclusion_over_18']&x['inclusion_has_saps']&x['inclusion_stay_ge_24hr']&x['inclusion_first_admission'],'icustay_id'].values, 'hospital_expire_flag']],
['luo2016interpretable_a',        [[time_all,  24, W_extra], lambda x: x.loc[x['inclusion_only_mimicii']&x['inclusion_over_18']&x['inclusion_has_sapsii']&x['inclusion_no_disch_summary'],'icustay_id'].values, 'death_30dy_post_hos_disch']],
['luo2016interpretable_b',        [[time_all,  24, W_extra], lambda x: x.loc[x['inclusion_only_mimicii']&x['inclusion_over_18']&x['inclusion_has_sapsii']&x['inclusion_no_disch_summary'],'icustay_id'].values, 'death_6mo_post_hos_disch']],
['luo2016predicting',           [[time_24hr, 12, W_extra], lambda x: np.intersect1d(hugExclFcn(x),x.loc[x['inclusion_stay_ge_24hr'],'icustay_id'].values) , 'death_30dy_post_icu_disch']],
['pirracchio2015mortality',     [[time_24hr, 24, W_extra], lambda x: x.loc[x['inclusion_only_mimicii'],'icustay_id'].values , 'hospital_expire_flag']],
['ripoll2014sepsis',            [[time_24hr, 24, W_extra], lambda x: x.loc[x['inclusion_only_mimicii']&x['inclusion_over_18']&x['inclusion_has_saps']&x['inclusion_not_explicit_sepsis'],'icustay_id'].values, 'hospital_expire_flag']],
['wojtusiak2017c',              [[time_all,  24, W_extra], lambda x: x.loc[x['inclusion_over_65']&x['inclusion_alive_hos_disch'],'icustay_id'].values, 'death_30dy_post_hos_disch']]
])

# Compare sample sizes and mortality rates

In [10]:
repro_stats = pd.DataFrame(None, columns=['N_Repro','Y_Repro'])

N = co.shape[0]
    
for current_study in exclusions:
    params, iid_keep, y_outcome_label = exclusions[current_study]
    
    # iid_keep is currently a function - apply it to co to get ICUSTAY_IDs to keep for this study
    iid_keep = iid_keep(co)
    
    N_STUDY = iid_keep.shape[0]
    Y_STUDY = co.set_index('icustay_id').loc[iid_keep,y_outcome_label].mean()*100.0
    
    # print size of cohort in study
    print('{:5g} ({:5.2f}%) - Mortality = {:5.2f}% - {}'.format(
            N_STUDY, N_STUDY*100.0/N, Y_STUDY,
            current_study)
         )
    
    repro_stats.loc[current_study] = [N_STUDY, Y_STUDY]
    

11648 (22.36%) - Mortality = 13.01% - caballero2015dynamically_a
11648 (22.36%) - Mortality = 13.01% - caballero2015dynamically_b
11648 (22.36%) - Mortality = 13.01% - caballero2015dynamically_c
 1985 ( 3.81%) - Mortality = 13.80% - calvert2016computational
18396 (35.32%) - Mortality = 14.71% - calvert2016using
 4741 ( 9.10%) - Mortality = 23.92% - celi2012database_a
 1070 ( 2.05%) - Mortality = 19.16% - celi2012database_b
51986 (99.81%) - Mortality =  3.10% - che2016recurrent_a
 4000 ( 7.68%) - Mortality = 14.35% - che2016recurrent_b
 4000 ( 7.68%) - Mortality = 14.35% - ding2016mortality
23442 (45.01%) - Mortality = 12.92% - ghassemi2014unfolding_a
28169 (54.08%) - Mortality = 12.20% - ghassemi2014unfolding_b
28169 (54.08%) - Mortality = 16.92% - ghassemi2014unfolding_c
28169 (54.08%) - Mortality = 29.76% - ghassemi2014unfolding_d
21969 (42.18%) - Mortality = 13.51% - ghassemi2015multivariate_a
21969 (42.18%) - Mortality = 32.35% - ghassemi2015multivariate_b
29572 (56.78%) - Mortalit

With the above dataframe, `repro_stats`, we can compare our results to those extracted manually from the studies. We load in the manual extraction from the `data` subfolder, merge it with this dataframe, and output to CSV.

In [11]:
study_data = pd.read_csv('./data/study_data.csv')
study_data.set_index('Cohort',inplace=True)
# add in reproduction sample size // outcome
study_data_merged = study_data.merge(repro_stats, how='left',
                left_index=True, right_index=True)


# print out the table as it was in the paper (maybe a bit more precision)
study_data_merged[ ['N_Study','N_Repro','Y_Study','Y_Repro'] ]

Unnamed: 0_level_0,N_Study,N_Repro,Y_Study,Y_Repro
Cohort,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
caballero2015dynamically_a,11648,11648.0,-,13.006525
caballero2015dynamically_b,11648,11648.0,-,13.006525
caballero2015dynamically_c,11648,11648.0,-,13.006525
calvert2016computational,3054,1985.0,12.84,13.803526
calvert2016using,9683,18396.0,10.68,14.70972
celi2012database_a,1400,4741.0,30.7,23.919004
celi2012database_b,223,1070.0,25.6,19.158879
che2016recurrent_a,4000,51986.0,13.85,3.095064
ding2016mortality,4000,4000.0,13.85,14.35
ghassemi2014unfolding_a,19308,23442.0,10.84,12.916987


# Define K-folds for AUROC comparison

In [12]:
# define var_static which is used later
#TODO: should refactor so this isn't needed
var_min, var_max, var_first, var_last, var_sum, var_first_early, var_last_early, var_static = mp.vars_of_interest()

K=5
np.random.seed(871)
# get unique subject_id (this is needed later)
sid = np.sort(np.unique(df_death['subject_id'].values))

# assign k-fold
idxK_sid = np.random.permutation(sid.shape[0])
idxK_sid = np.mod(idxK_sid,K)

# get indices which map subject_ids in sid to the X dataframe
idxMap = np.searchsorted(sid, df_death['subject_id'].values)

# use these indices to map the k-fold integers
idxK = idxK_sid[idxMap]

# Run results for a single study

The below code cell:

* extracts the cohort for a specific study
* extracts the outcome of that study
* builds predictive models in 5-fold cross-validation for that outcome

The two models at the moment are Gradient Boosting (xgboost) and logistic regression (scikit-learn).
This code cell only runs for one study.

In [22]:
import warnings
warnings.filterwarnings('ignore')

In [24]:
# pick the study to run the example on
current_study = 'celi2012database_b'
    
# Rough timing info:
#     rf - 3 seconds per fold
#    xgb - 30 seconds per fold
# logreg - 4 seconds per fold
#  lasso - 8 seconds per fold
models = OrderedDict([
          ['xgb', xgb.XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.05)],
          #['lasso', LassoCV(cv=5,fit_intercept=True,normalize=True,max_iter=10000)],
          #['rf', RandomForestClassifier()],
          ['logreg', LogisticRegression(fit_intercept=True)]
         ])

print('')
print('====================={}==========='.format('='*len(current_study)))
print('========== BEGINNING {}==========='.format(current_study))
print('====================={}==========='.format('='*len(current_study)))

params = exclusions[current_study][0]
df_data = mp.get_design_matrix(df, params[0], W=params[1], W_extra=params[2])

# get a list of icustay_id who stayed at least 12 hours
iid_keep = exclusions[current_study][1](co)
print('Reducing sample size from {} to {} ({:2.2f}%).'.format(
        df_data.shape[0], iid_keep.shape[0], iid_keep.shape[0]*100.0 / df_data.shape[0]))
df_data = df_data.reindex(index = iid_keep)
print('')

y_outcome_label = exclusions[current_study][2]

# load the data into a numpy array

# first, the data from static vars from df_static
X = df_data.merge(df_static.set_index('icustay_id')[var_static], how='left', left_index=True, right_index=True)
# next, add in the outcome: death in hospital
X = X.merge(co.set_index('icustay_id')[[y_outcome_label]], left_index=True, right_index=True)

# map above K-fold indices to this dataset
X = X.merge(co.set_index('icustay_id')[['subject_id']], left_index=True, right_index=True)
# get indices which map subject_ids in sid to the X dataframe
idxMap = np.searchsorted(sid, X['subject_id'].values)
# use these indices to map the k-fold integers
idxK = idxK_sid[idxMap]
# drop the subject_id column
X.drop('subject_id',axis=1,inplace=True)

# convert to numpy data (assumes target, death, is the last column)
X = X.values
y = X[:,-1]
X = X[:,0:-1]
X_header = [x for x in df_data.columns.values] + var_static

mdl_val = dict()
results_val = dict()
pred_val = dict()
tar_val = dict()

for mdl in models:
    print('=============== {} ==============='.format(mdl))
    mdl_val[mdl] = list()
    results_val[mdl] = list() # initialize list for scores
    pred_val[mdl] = list()
    tar_val[mdl] = list()

    if mdl == 'xgb':
        # no pre-processing of data necessary for xgb
        estimator = Pipeline([(mdl, models[mdl])])

    else:
        estimator = Pipeline([("imputer", SimpleImputer(missing_values=np.nan,
                                          strategy="mean")),
                      ("scaler", StandardScaler()),
                      (mdl, models[mdl])]) 

    for k in range(K):
        # train the model using all but the kth fold
        curr_mdl = estimator.fit(X[idxK != k, :],y[idxK != k])

        # get prediction on this dataset
        if mdl == 'lasso':
            curr_prob = curr_mdl.predict(X[idxK == k, :])
        else:
            curr_prob = curr_mdl.predict_proba(X[idxK == k, :])
            curr_prob = curr_prob[:,1]

        pred_val[mdl].append(curr_prob)
        tar_val[mdl].append(y[idxK == k])

        # calculate score (AUROC)
        curr_score = metrics.roc_auc_score(y[idxK == k], curr_prob)

        # add score to list of scores
        results_val[mdl].append(curr_score)

        # save the current model
        mdl_val[mdl].append(curr_mdl)

        print('{} - Finished fold {} of {}. AUROC {:0.3f}.'.format(dt.datetime.now(), k+1, K, curr_score))


Reducing sample size from 52050 to 1070 (2.06%).

2021-04-13 14:27:35.111426 - Finished fold 1 of 5. AUROC 0.905.
2021-04-13 14:27:36.631151 - Finished fold 2 of 5. AUROC 0.964.
2021-04-13 14:27:38.087684 - Finished fold 3 of 5. AUROC 0.899.
2021-04-13 14:27:39.495161 - Finished fold 4 of 5. AUROC 0.887.
2021-04-13 14:27:40.975378 - Finished fold 5 of 5. AUROC 0.858.
2021-04-13 14:27:41.107021 - Finished fold 1 of 5. AUROC 0.870.
2021-04-13 14:27:41.191346 - Finished fold 2 of 5. AUROC 0.921.
2021-04-13 14:27:41.291119 - Finished fold 3 of 5. AUROC 0.868.
2021-04-13 14:27:41.402315 - Finished fold 4 of 5. AUROC 0.886.
2021-04-13 14:27:41.505054 - Finished fold 5 of 5. AUROC 0.845.


# Run results for all results

The below code block is identical to the above code block, except it loops over all studies evaluated. This code block takes a while - it is training ~150 models of each type. The final AUROCs are output to the `results.txt` file. The final models/results/predictions/targets are saved in various dictionaries with the suffix `_all`.

In [26]:
mdl_val_all = dict()
results_val_all = dict()
pred_val_all = dict()
tar_val_all = dict()

# Rough timing info:
#     rf - 3 seconds per fold
#    xgb - 30 seconds per fold
# logreg - 4 seconds per fold
#  lasso - 8 seconds per fold
models = OrderedDict([
          ['xgb', xgb.XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.05)],
          #['lasso', LassoCV(cv=5,fit_intercept=True,normalize=True,max_iter=10000)],
          #['rf', RandomForestClassifier()],
          ['logreg', LogisticRegression(fit_intercept=True)]
         ])
    
with open('results.txt','w') as fp:
    fp.write('StudyName,SampleSize,Outcome')
    for mdl in models:
        fp.write(',{}'.format(mdl))
    fp.write('\n')
    
for current_study in exclusions:    
    print('\n==================== {} =========='.format('='*len(current_study)))
    print('========== BEGINNING {} =========='.format(current_study))
    print('==================== {} =========='.format('='*len(current_study)))

    params = exclusions[current_study][0]
    df_data = mp.get_design_matrix(df, params[0], W=params[1], W_extra=params[2])

    # get a list of icustay_id who stayed at least 12 hours
    iid_keep = exclusions[current_study][1](co)
    print('Reducing sample size from {} to {} ({:2.2f}%).'.format(
            df_data.shape[0], iid_keep.shape[0], iid_keep.shape[0]*100.0 / df_data.shape[0]))
    df_data = df_data.reindex(index = iid_keep)
    print('')

    y_outcome_label = exclusions[current_study][2]
    
    # load the data into a numpy array

    # first, the data from static vars from df_static
    X = df_data.merge(df_static.set_index('icustay_id')[var_static], how='left', left_index=True, right_index=True)
    # next, add in the outcome: death in hospital
    X = X.merge(co.set_index('icustay_id')[[y_outcome_label]], left_index=True, right_index=True)

    # map above K-fold indices to this dataset
    X = X.merge(co.set_index('icustay_id')[['subject_id']], left_index=True, right_index=True)
    # get indices which map subject_ids in sid to the X dataframe
    idxMap = np.searchsorted(sid, X['subject_id'].values)
    # use these indices to map the k-fold integers
    idxK = idxK_sid[idxMap]
    # drop the subject_id column
    X.drop('subject_id',axis=1,inplace=True)

    # convert to numpy data (assumes target, death, is the last column)
    X = X.values
    y = X[:,-1]
    X = X[:,0:-1]
    X_header = [x for x in df_data.columns.values] + var_static
    
    mdl_val = dict()
    results_val = dict()
    pred_val = dict()
    tar_val = dict()

    for mdl in models:
        print('=============== {} ==============='.format(mdl))
        mdl_val[mdl] = list()
        results_val[mdl] = list() # initialize list for scores
        pred_val[mdl] = list()
        tar_val[mdl] = list()

        if mdl == 'xgb':
            # no pre-processing of data necessary for xgb
            estimator = Pipeline([(mdl, models[mdl])])

        else:
            estimator = Pipeline([("imputer", SimpleImputer(missing_values=np.nan,
                                              strategy="mean")),
                          ("scaler", StandardScaler()),
                          (mdl, models[mdl])]) 

        for k in range(K):
            # train the model using all but the kth fold
            curr_mdl = estimator.fit(X[idxK != k, :],y[idxK != k])

            # get prediction on this dataset
            if mdl == 'lasso':
                curr_prob = curr_mdl.predict(X[idxK == k, :])
            else:
                curr_prob = curr_mdl.predict_proba(X[idxK == k, :])
                curr_prob = curr_prob[:,1]

            pred_val[mdl].append(curr_prob)
            tar_val[mdl].append(y[idxK == k])

            # calculate score (AUROC)
            curr_score = metrics.roc_auc_score(y[idxK == k], curr_prob)

            # add score to list of scores
            results_val[mdl].append(curr_score)

            # save the current model
            mdl_val[mdl].append(curr_mdl)

            print('{} - Finished fold {} of {}. AUROC {:0.3f}.'.format(dt.datetime.now(), k+1, K, curr_score))

    # create a pointer for above dicts with new var names
    # we will likely re-use the dicts in subsequent calls for getting model perfomances
    mdl_val_all[current_study] = mdl_val
    results_val_all[current_study] = results_val
    pred_val_all[current_study] = pred_val
    tar_val_all[current_study] = tar_val
    
    # print to file
    with open('results.txt','a') as fp:
        # print study name, sample size and frequency of outcome
        fp.write( '{},{},{:2.2f}'.format(current_study, X.shape[0], np.mean(y)*100.0 ) )
        
        for i, mdl in enumerate(models):
            fp.write(',{:0.6f}'.format( np.mean(results_val[mdl]) ))
        
        fp.write('\n')


Reducing sample size from 52050 to 11648 (22.38%).

2021-04-13 14:30:18.298676 - Finished fold 1 of 5. AUROC 0.896.
2021-04-13 14:30:26.415379 - Finished fold 2 of 5. AUROC 0.897.
2021-04-13 14:30:33.893557 - Finished fold 3 of 5. AUROC 0.912.
2021-04-13 14:30:41.450978 - Finished fold 4 of 5. AUROC 0.910.
2021-04-13 14:30:48.719429 - Finished fold 5 of 5. AUROC 0.915.
2021-04-13 14:30:49.357442 - Finished fold 1 of 5. AUROC 0.885.
2021-04-13 14:30:49.902703 - Finished fold 2 of 5. AUROC 0.887.
2021-04-13 14:30:50.453647 - Finished fold 3 of 5. AUROC 0.880.
2021-04-13 14:30:51.004196 - Finished fold 4 of 5. AUROC 0.894.
2021-04-13 14:30:51.527146 - Finished fold 5 of 5. AUROC 0.901.

Reducing sample size from 52050 to 11648 (22.38%).

2021-04-13 14:31:15.352539 - Finished fold 1 of 5. AUROC 0.916.
2021-04-13 14:31:22.581182 - Finished fold 2 of 5. AUROC 0.914.
2021-04-13 14:31:30.401039 - Finished fold 3 of 5. AUROC 0.927.
2021-04-13 14:31:38.373824 - Finished fold 4 of 5. AUROC 0.925

2021-04-13 14:33:05.741675 - Finished fold 2 of 5. AUROC 0.974.
2021-04-13 14:33:07.531263 - Finished fold 3 of 5. AUROC 0.947.
2021-04-13 14:33:09.385060 - Finished fold 4 of 5. AUROC 0.934.
2021-04-13 14:33:11.129404 - Finished fold 5 of 5. AUROC 0.976.
2021-04-13 14:33:11.289527 - Finished fold 1 of 5. AUROC 0.902.
2021-04-13 14:33:11.445064 - Finished fold 2 of 5. AUROC 0.959.
2021-04-13 14:33:11.594654 - Finished fold 3 of 5. AUROC 0.940.
2021-04-13 14:33:11.788241 - Finished fold 4 of 5. AUROC 0.887.
2021-04-13 14:33:11.981968 - Finished fold 5 of 5. AUROC 0.950.

Reducing sample size from 52042 to 18396 (35.35%).

2021-04-13 14:33:28.797555 - Finished fold 1 of 5. AUROC 0.925.
2021-04-13 14:33:37.647490 - Finished fold 2 of 5. AUROC 0.941.
2021-04-13 14:33:46.946340 - Finished fold 3 of 5. AUROC 0.933.
2021-04-13 14:33:56.214988 - Finished fold 4 of 5. AUROC 0.936.
2021-04-13 14:34:05.954224 - Finished fold 5 of 5. AUROC 0.932.
2021-04-13 14:34:06.667489 - Finished fold 1 of 5. 

2021-04-13 14:35:08.706570 - Finished fold 5 of 5. AUROC 0.858.
2021-04-13 14:35:08.830331 - Finished fold 1 of 5. AUROC 0.870.
2021-04-13 14:35:08.926652 - Finished fold 2 of 5. AUROC 0.921.
2021-04-13 14:35:09.021795 - Finished fold 3 of 5. AUROC 0.868.
2021-04-13 14:35:09.146690 - Finished fold 4 of 5. AUROC 0.886.
2021-04-13 14:35:09.249136 - Finished fold 5 of 5. AUROC 0.845.

Reducing sample size from 52050 to 51986 (99.88%).

2021-04-13 14:35:54.728799 - Finished fold 1 of 5. AUROC 0.988.
2021-04-13 14:36:25.613908 - Finished fold 2 of 5. AUROC 0.989.
2021-04-13 14:36:55.422743 - Finished fold 3 of 5. AUROC 0.984.
2021-04-13 14:37:26.309458 - Finished fold 4 of 5. AUROC 0.983.
2021-04-13 14:37:56.632258 - Finished fold 5 of 5. AUROC 0.986.
2021-04-13 14:37:58.497261 - Finished fold 1 of 5. AUROC 0.969.
2021-04-13 14:38:00.296672 - Finished fold 2 of 5. AUROC 0.967.
2021-04-13 14:38:02.092000 - Finished fold 3 of 5. AUROC 0.965.
2021-04-13 14:38:03.902921 - Finished fold 4 of 5. 

2021-04-13 14:39:27.763875 - Finished fold 1 of 5. AUROC 0.874.
2021-04-13 14:39:38.547446 - Finished fold 2 of 5. AUROC 0.871.
2021-04-13 14:39:50.325168 - Finished fold 3 of 5. AUROC 0.883.
2021-04-13 14:40:02.211890 - Finished fold 4 of 5. AUROC 0.888.
2021-04-13 14:40:14.159293 - Finished fold 5 of 5. AUROC 0.892.
2021-04-13 14:40:15.000911 - Finished fold 1 of 5. AUROC 0.859.
2021-04-13 14:40:15.868141 - Finished fold 2 of 5. AUROC 0.865.
2021-04-13 14:40:16.738417 - Finished fold 3 of 5. AUROC 0.862.
2021-04-13 14:40:17.636319 - Finished fold 4 of 5. AUROC 0.875.
2021-04-13 14:40:18.553728 - Finished fold 5 of 5. AUROC 0.877.

Reducing sample size from 52050 to 28169 (54.12%).

2021-04-13 14:40:41.487518 - Finished fold 1 of 5. AUROC 0.873.
2021-04-13 14:40:55.673674 - Finished fold 2 of 5. AUROC 0.874.
2021-04-13 14:41:09.956697 - Finished fold 3 of 5. AUROC 0.882.
2021-04-13 14:41:23.876297 - Finished fold 4 of 5. AUROC 0.889.
2021-04-13 14:41:36.784953 - Finished fold 5 of 5. 

2021-04-13 14:43:56.709691 - Finished fold 3 of 5. AUROC 0.841.
2021-04-13 14:44:11.036566 - Finished fold 4 of 5. AUROC 0.835.
2021-04-13 14:44:25.019480 - Finished fold 5 of 5. AUROC 0.861.
2021-04-13 14:44:26.123332 - Finished fold 1 of 5. AUROC 0.811.
2021-04-13 14:44:27.260537 - Finished fold 2 of 5. AUROC 0.821.
2021-04-13 14:44:28.331306 - Finished fold 3 of 5. AUROC 0.816.
2021-04-13 14:44:29.393740 - Finished fold 4 of 5. AUROC 0.813.
2021-04-13 14:44:30.427896 - Finished fold 5 of 5. AUROC 0.837.

Reducing sample size from 52050 to 21969 (42.21%).

2021-04-13 14:44:52.111777 - Finished fold 1 of 5. AUROC 0.866.
2021-04-13 14:45:03.419658 - Finished fold 2 of 5. AUROC 0.865.
2021-04-13 14:45:14.609244 - Finished fold 3 of 5. AUROC 0.877.
2021-04-13 14:45:25.795261 - Finished fold 4 of 5. AUROC 0.881.
2021-04-13 14:45:37.266604 - Finished fold 5 of 5. AUROC 0.888.
2021-04-13 14:45:38.074501 - Finished fold 1 of 5. AUROC 0.853.
2021-04-13 14:45:38.902621 - Finished fold 2 of 5. 

2021-04-13 14:48:05.773377 - Finished fold 5 of 5. AUROC 0.983.
2021-04-13 14:48:06.339811 - Finished fold 1 of 5. AUROC 0.975.
2021-04-13 14:48:06.853697 - Finished fold 2 of 5. AUROC 0.979.
2021-04-13 14:48:07.357760 - Finished fold 3 of 5. AUROC 0.981.
2021-04-13 14:48:07.932989 - Finished fold 4 of 5. AUROC 0.978.
2021-04-13 14:48:08.532475 - Finished fold 5 of 5. AUROC 0.978.

Reducing sample size from 52050 to 29572 (56.81%).

2021-04-13 14:48:28.984283 - Finished fold 1 of 5. AUROC 0.955.
2021-04-13 14:48:44.444452 - Finished fold 2 of 5. AUROC 0.960.
2021-04-13 14:48:59.550305 - Finished fold 3 of 5. AUROC 0.963.
2021-04-13 14:49:14.110533 - Finished fold 4 of 5. AUROC 0.960.
2021-04-13 14:49:28.594849 - Finished fold 5 of 5. AUROC 0.960.
2021-04-13 14:49:29.170628 - Finished fold 1 of 5. AUROC 0.948.
2021-04-13 14:49:29.706171 - Finished fold 2 of 5. AUROC 0.954.
2021-04-13 14:49:30.248919 - Finished fold 3 of 5. AUROC 0.957.
2021-04-13 14:49:30.766288 - Finished fold 4 of 5. 

2021-04-13 14:53:19.166306 - Finished fold 1 of 5. AUROC 0.869.
2021-04-13 14:53:27.898567 - Finished fold 2 of 5. AUROC 0.870.
2021-04-13 14:53:35.604524 - Finished fold 3 of 5. AUROC 0.879.
2021-04-13 14:53:43.305065 - Finished fold 4 of 5. AUROC 0.884.
2021-04-13 14:53:51.491496 - Finished fold 5 of 5. AUROC 0.892.
2021-04-13 14:53:51.920346 - Finished fold 1 of 5. AUROC 0.853.
2021-04-13 14:53:52.291438 - Finished fold 2 of 5. AUROC 0.861.
2021-04-13 14:53:52.645914 - Finished fold 3 of 5. AUROC 0.860.
2021-04-13 14:53:53.021583 - Finished fold 4 of 5. AUROC 0.869.
2021-04-13 14:53:53.386712 - Finished fold 5 of 5. AUROC 0.876.

Reducing sample size from 52050 to 10696 (20.55%).

2021-04-13 14:54:04.656760 - Finished fold 1 of 5. AUROC 0.854.
2021-04-13 14:54:11.474481 - Finished fold 2 of 5. AUROC 0.852.
2021-04-13 14:54:17.987513 - Finished fold 3 of 5. AUROC 0.868.
2021-04-13 14:54:24.144907 - Finished fold 4 of 5. AUROC 0.832.
2021-04-13 14:54:30.531012 - Finished fold 5 of 5. 

2021-04-13 14:55:08.978330 - Finished fold 4 of 5. AUROC 0.853.
2021-04-13 14:55:11.389365 - Finished fold 5 of 5. AUROC 0.855.
2021-04-13 14:55:11.499591 - Finished fold 1 of 5. AUROC 0.798.
2021-04-13 14:55:11.598471 - Finished fold 2 of 5. AUROC 0.834.
2021-04-13 14:55:11.688900 - Finished fold 3 of 5. AUROC 0.845.
2021-04-13 14:55:11.789871 - Finished fold 4 of 5. AUROC 0.850.
2021-04-13 14:55:11.876001 - Finished fold 5 of 5. AUROC 0.833.

Reducing sample size from 52050 to 10696 (20.55%).

2021-04-13 14:55:22.127485 - Finished fold 1 of 5. AUROC 0.882.
2021-04-13 14:55:28.311407 - Finished fold 2 of 5. AUROC 0.870.
2021-04-13 14:55:34.789483 - Finished fold 3 of 5. AUROC 0.887.
2021-04-13 14:55:41.039938 - Finished fold 4 of 5. AUROC 0.879.
2021-04-13 14:55:46.996305 - Finished fold 5 of 5. AUROC 0.920.
2021-04-13 14:55:47.250821 - Finished fold 1 of 5. AUROC 0.850.
2021-04-13 14:55:47.432228 - Finished fold 2 of 5. AUROC 0.871.
2021-04-13 14:55:47.651961 - Finished fold 3 of 5. 

Reducing sample size from 52050 to 20961 (40.27%).

2021-04-13 14:58:06.985737 - Finished fold 1 of 5. AUROC 0.862.
2021-04-13 14:58:15.674799 - Finished fold 2 of 5. AUROC 0.858.
2021-04-13 14:58:24.132838 - Finished fold 3 of 5. AUROC 0.871.
2021-04-13 14:58:32.943391 - Finished fold 4 of 5. AUROC 0.864.
2021-04-13 14:58:41.441998 - Finished fold 5 of 5. AUROC 0.865.
2021-04-13 14:58:41.825960 - Finished fold 1 of 5. AUROC 0.843.
2021-04-13 14:58:42.219717 - Finished fold 2 of 5. AUROC 0.849.
2021-04-13 14:58:42.639574 - Finished fold 3 of 5. AUROC 0.853.
2021-04-13 14:58:43.065632 - Finished fold 4 of 5. AUROC 0.850.
2021-04-13 14:58:43.457361 - Finished fold 5 of 5. AUROC 0.853.

Reducing sample size from 52050 to 20961 (40.27%).

2021-04-13 14:58:57.828036 - Finished fold 1 of 5. AUROC 0.845.
2021-04-13 14:59:06.311274 - Finished fold 2 of 5. AUROC 0.834.
2021-04-13 14:59:15.658611 - Finished fold 3 of 5. AUROC 0.833.
2021-04-13 14:59:24.377476 - Finished fold 4 of 5. AUROC 0.816.

2021-04-13 15:01:09.622558 - Finished fold 3 of 5. AUROC 0.869.
2021-04-13 15:01:18.859408 - Finished fold 4 of 5. AUROC 0.870.
2021-04-13 15:01:28.328054 - Finished fold 5 of 5. AUROC 0.870.
2021-04-13 15:01:28.794507 - Finished fold 1 of 5. AUROC 0.843.
2021-04-13 15:01:29.229035 - Finished fold 2 of 5. AUROC 0.848.
2021-04-13 15:01:29.673381 - Finished fold 3 of 5. AUROC 0.848.
2021-04-13 15:01:30.097838 - Finished fold 4 of 5. AUROC 0.855.
2021-04-13 15:01:30.531820 - Finished fold 5 of 5. AUROC 0.856.

Reducing sample size from 52050 to 21738 (41.76%).

2021-04-13 15:01:49.329288 - Finished fold 1 of 5. AUROC 0.875.
2021-04-13 15:01:58.956621 - Finished fold 2 of 5. AUROC 0.882.
2021-04-13 15:02:08.122186 - Finished fold 3 of 5. AUROC 0.886.
2021-04-13 15:02:17.581365 - Finished fold 4 of 5. AUROC 0.889.
2021-04-13 15:02:26.499138 - Finished fold 5 of 5. AUROC 0.893.
2021-04-13 15:02:26.933535 - Finished fold 1 of 5. AUROC 0.859.
2021-04-13 15:02:27.337458 - Finished fold 2 of 5. 

2021-04-13 15:05:03.818562 - Finished fold 1 of 5. AUROC 0.876.
2021-04-13 15:05:04.363163 - Finished fold 2 of 5. AUROC 0.880.
2021-04-13 15:05:04.911172 - Finished fold 3 of 5. AUROC 0.874.
2021-04-13 15:05:05.451170 - Finished fold 4 of 5. AUROC 0.879.
2021-04-13 15:05:06.009181 - Finished fold 5 of 5. AUROC 0.877.

Reducing sample size from 52050 to 8931 (17.16%).

2021-04-13 15:05:15.013268 - Finished fold 1 of 5. AUROC 0.823.
2021-04-13 15:05:19.603343 - Finished fold 2 of 5. AUROC 0.834.
2021-04-13 15:05:24.205083 - Finished fold 3 of 5. AUROC 0.832.
2021-04-13 15:05:29.162875 - Finished fold 4 of 5. AUROC 0.801.
2021-04-13 15:05:33.907797 - Finished fold 5 of 5. AUROC 0.849.
2021-04-13 15:05:34.139437 - Finished fold 1 of 5. AUROC 0.799.
2021-04-13 15:05:34.321283 - Finished fold 2 of 5. AUROC 0.854.
2021-04-13 15:05:34.503144 - Finished fold 3 of 5. AUROC 0.831.
2021-04-13 15:05:34.664800 - Finished fold 4 of 5. AUROC 0.791.
2021-04-13 15:05:34.836600 - Finished fold 5 of 5. A

2021-04-13 15:07:30.423690 - Finished fold 1 of 5. AUROC 0.796.
2021-04-13 15:07:40.709015 - Finished fold 2 of 5. AUROC 0.785.
2021-04-13 15:07:51.012445 - Finished fold 3 of 5. AUROC 0.796.
2021-04-13 15:08:01.044787 - Finished fold 4 of 5. AUROC 0.815.
2021-04-13 15:08:11.951810 - Finished fold 5 of 5. AUROC 0.791.
2021-04-13 15:08:12.466709 - Finished fold 1 of 5. AUROC 0.780.
2021-04-13 15:08:12.921103 - Finished fold 2 of 5. AUROC 0.786.
2021-04-13 15:08:13.345421 - Finished fold 3 of 5. AUROC 0.789.
2021-04-13 15:08:13.818061 - Finished fold 4 of 5. AUROC 0.805.
2021-04-13 15:08:14.262605 - Finished fold 5 of 5. AUROC 0.809.


Move the results from the above dictionaries into a single dataframe.


In [27]:
from IPython.display import display, HTML

In [31]:
mdl_list = models.keys()

study_data = pd.read_csv('./data/study_data.csv')
study_data.set_index('Cohort',inplace=True)

# add in reproduction stats from earlier
study_data_merged = study_data.merge(repro_stats, how='left',
                left_index=True, right_index=True)

# add in AUROCs
for current_study in results_val_all:
    results_val = results_val_all[current_study]
    for mdl in results_val:
        study_data_merged.loc[current_study, mdl] = np.mean(results_val[mdl])
    
columns = ['Outcome','N_Study','N_Repro','Y_Study','Y_Repro','AUROC_Study', 'xgb', 'logreg']
display(HTML(study_data_merged[columns].to_html()))

study_data_merged.to_csv('results_final.csv')

Unnamed: 0_level_0,Outcome,N_Study,N_Repro,Y_Study,Y_Repro,AUROC_Study,xgb,logreg
Cohort,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
caballero2015dynamically_a,death_in_hospital,11648,11648.0,-,13.006525,0.8657,0.90606,0.889313
caballero2015dynamically_b,death_in_hospital,11648,11648.0,-,13.006525,0.7985,0.921992,0.905757
caballero2015dynamically_c,death_in_hospital,11648,11648.0,-,13.006525,0.7385,0.931524,0.916911
calvert2016computational,death_in_hospital,3054,1985.0,12.84,13.803526,0.934,0.957939,0.927694
calvert2016using,death_in_hospital,9683,18396.0,10.68,14.70972,0.88,0.933606,0.913535
celi2012database_a,death_in_hospital,1400,4741.0,30.7,23.919004,0.875,0.881971,0.877639
celi2012database_b,death_in_hospital,223,1070.0,25.6,19.158879,0.958,0.902528,0.8781
che2016recurrent_a,death_in_hospital,4000,51986.0,13.85,3.095064,0.8424,0.985854,0.96617
ding2016mortality,death_in_hospital,4000,4000.0,13.85,14.35,0.8177,0.8438,0.831932
ghassemi2014unfolding_a,death_in_hospital,19308,23442.0,10.84,12.916987,0.84,0.881625,0.867505


# Appendix

## Baseline model 1

The below code block builds a "baseline" model. This model has no exclusions past the base cohort exclusions. K-fold validation is done on the patient level to ensure no information leakage between training/validation sets. The outcome is in-hospital mortality, and the data window used is the first 24 hours. Labs are extracted from up to 24 hours before the ICU admission (this is defined by `W_extra=24`).

In [32]:
W = 24 # window size
W_extra = 24 # extra time backward for labs
y_outcome_label = 'death_in_hospital'

# admission+W hours
df_tmp=co.copy().set_index('icustay_id')
time_dict = df_tmp.copy()
time_dict['windowtime'] = W
time_dict = time_dict['windowtime'].to_dict()


# Rough timing info:
#     rf - 3 seconds per fold
#    xgb - 30 seconds per fold
# logreg - 4 seconds per fold
#  lasso - 8 seconds per fold
models = OrderedDict([
          ['xgb', xgb.XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.05)],
          #['lasso', LassoCV(cv=5,fit_intercept=True,normalize=True,max_iter=10000)],
          #['rf', RandomForestClassifier()],
          ['logreg', LogisticRegression(fit_intercept=True)]
         ])

In [35]:
CENSOR_FLAG=False
current_study = 'baseline'

print('')
print('====================={}==========='.format('='*len(current_study)))
print('========== BEGINNING {} =========='.format(current_study))
print('====================={}==========='.format('='*len(current_study)))

# optionally remove patients who were DNR in first 24hrs
if CENSOR_FLAG:
    exclFcn = lambda x: x.loc[x['inclusion_stay_ge_24hr']& ( (x['censortime_hours'].isnull()) | (x['censortime_hours']>=24) ) ,'icustay_id'].values
else:
    exclFcn = lambda x: x.loc[x['inclusion_stay_ge_24hr']& ( (x['censortime_hours'].isnull()) | (x['censortime_hours']>=24) ) ,'icustay_id'].values
    
df_data = mp.get_design_matrix(df, time_dict, W=W, W_extra=W_extra)

iid_keep = exclFcn(co)

N_NEW=df_data.reindex(index = iid_keep).shape[0]
print('Reducing sample size from {} to {} ({:2.2f}%).'.format(
        co.shape[0], N_NEW, N_NEW*100.0 / df_data.shape[0]))
df_data = df_data.reindex(index = iid_keep)
print('')

# load the data into a numpy array

# first, the data from static vars from df_static
X = df_data.merge(df_static.set_index('icustay_id')[var_static], how='left', left_index=True, right_index=True)
# next, add in the outcome: death in hospital
X = X.merge(co.set_index('icustay_id')[[y_outcome_label]], left_index=True, right_index=True)

# map above K-fold indices to this dataset
X = X.merge(co.set_index('icustay_id')[['subject_id']], left_index=True, right_index=True)
# get indices which map subject_ids in sid to the X dataframe
idxMap = np.searchsorted(sid, X['subject_id'].values)
# use these indices to map the k-fold integers
idxK = idxK_sid[idxMap]
# drop the subject_id column
X.drop('subject_id',axis=1,inplace=True)

# convert to numpy data (assumes target, death, is the last column)
X = X.values
y = X[:,-1]
X = X[:,0:-1]
X_header = [x for x in df_data.columns.values] + var_static

mdl_val = dict()
results_val = dict()
pred_val = dict()
tar_val = dict()

for mdl in models:
    print('=============== {} ==============='.format(mdl))
    mdl_val[mdl] = list()
    results_val[mdl] = list() # initialize list for scores
    pred_val[mdl] = list()
    tar_val[mdl] = list()

    if mdl == 'xgb':
        # no pre-processing of data necessary for xgb
        estimator = Pipeline([(mdl, models[mdl])])

    else:
        estimator = Pipeline([("imputer", SimpleImputer(missing_values=np.nan,
                                          strategy="mean")),
                      ("scaler", StandardScaler()),
                      (mdl, models[mdl])]) 

    for k in range(K):
        # train the model using all but the kth fold
        curr_mdl = estimator.fit(X[idxK != k, :],y[idxK != k])

        # get prediction on this dataset
        if mdl == 'lasso':
            curr_prob = curr_mdl.predict(X[idxK == k, :])
        else:
            curr_prob = curr_mdl.predict_proba(X[idxK == k, :])
            curr_prob = curr_prob[:,1]

        pred_val[mdl].append(curr_prob)
        tar_val[mdl].append(y[idxK == k])

        # calculate score (AUROC)
        curr_score = metrics.roc_auc_score(y[idxK == k], curr_prob)

        # add score to list of scores
        results_val[mdl].append(curr_score)

        # save the current model
        mdl_val[mdl].append(curr_mdl)

        print('{} - Finished fold {} of {}. AUROC {:0.3f}.'.format(dt.datetime.now(), k+1, K, curr_score))
        


# print final results
print('')
print('StudyName,SampleSize',end='')
for mdl in models:
    print(',{}'.format(mdl),end='')
print('')

print( '{},{}'.format(current_study, X.shape[0] ), end='' )

for i, mdl in enumerate(models):
    print(',{:0.6f}'.format( np.mean(results_val[mdl]) ), end='')
print('\n')


Reducing sample size from 52085 to 38687 (74.33%).

2021-04-13 15:13:38.842053 - Finished fold 1 of 5. AUROC 0.880.
2021-04-13 15:13:51.591022 - Finished fold 2 of 5. AUROC 0.889.
2021-04-13 15:14:04.812585 - Finished fold 3 of 5. AUROC 0.888.
2021-04-13 15:14:17.173868 - Finished fold 4 of 5. AUROC 0.886.
2021-04-13 15:14:29.277001 - Finished fold 5 of 5. AUROC 0.897.
2021-04-13 15:14:30.042586 - Finished fold 1 of 5. AUROC 0.869.
2021-04-13 15:14:30.793295 - Finished fold 2 of 5. AUROC 0.872.
2021-04-13 15:14:31.520911 - Finished fold 3 of 5. AUROC 0.864.
2021-04-13 15:14:32.268598 - Finished fold 4 of 5. AUROC 0.863.
2021-04-13 15:14:33.015523 - Finished fold 5 of 5. AUROC 0.882.

StudyName,SampleSize,xgb,logreg
baseline,38687,0.887931,0.870068



## Baseline model 2: No care withdrawal patients

Patients who choose to have their care withdrawn will receive palliative measures in the ICU. These patients show markedly different physiology than those undergoing full interventions and a model which synthesizes severity should not incorporate their data. Here we remove data for patients at the time of their withdrawal of care. If this is before the end of the first 24 hours of their ICU admission, we remove the patient entirely.

In [38]:
CENSOR_FLAG = True
current_study = 'baseline_withdrawal'

print('====================={}==========='.format('='*len(current_study)))
print('========== BEGINNING {} =========='.format(current_study))
print('====================={}==========='.format('='*len(current_study)))

# optionally remove patients who were DNR in first 24hrs
if CENSOR_FLAG:
    exclFcn = lambda x: x.loc[x['inclusion_stay_ge_24hr']& ( (x['censortime_hours'].isnull()) | (x['censortime_hours']>=24) ) ,'icustay_id'].values
else:
    exclFcn = lambda x: x.loc[x['inclusion_stay_ge_24hr']& ( (x['censortime_hours'].isnull()) | (x['censortime_hours']>=24) ) ,'icustay_id'].values
    
df_data = mp.get_design_matrix(df, time_dict, W=W, W_extra=W_extra)

iid_keep = exclFcn(co)
N_NEW=df_data.reindex(index = iid_keep).shape[0]

print('Reducing sample size from {} to {} ({:2.2f}%).'.format(
        co.shape[0], N_NEW, N_NEW*100.0 / df_data.shape[0]))
df_data = df_data.reindex(index = iid_keep)
print('')

# load the data into a numpy array

# first, the data from static vars from df_static
X = df_data.merge(df_static.set_index('icustay_id')[var_static], how='left', left_index=True, right_index=True)
# next, add in the outcome: death in hospital
X = X.merge(co.set_index('icustay_id')[[y_outcome_label]], left_index=True, right_index=True)

# map above K-fold indices to this dataset
X = X.merge(co.set_index('icustay_id')[['subject_id']], left_index=True, right_index=True)
# get indices which map subject_ids in sid to the X dataframe
idxMap = np.searchsorted(sid, X['subject_id'].values)
# use these indices to map the k-fold integers
idxK = idxK_sid[idxMap]
# drop the subject_id column
X.drop('subject_id',axis=1,inplace=True)

# convert to numpy data (assumes target, death, is the last column)
X = X.values
y = X[:,-1]
X = X[:,0:-1]
X_header = [x for x in df_data.columns.values] + var_static

mdl_val = dict()
results_val = dict()
pred_val = dict()
tar_val = dict()

for mdl in models:
    print('=============== {} ==============='.format(mdl))
    mdl_val[mdl] = list()
    results_val[mdl] = list() # initialize list for scores
    pred_val[mdl] = list()
    tar_val[mdl] = list()

    if mdl == 'xgb':
        # no pre-processing of data necessary for xgb
        estimator = Pipeline([(mdl, models[mdl])])

    else:
        estimator = Pipeline([("imputer", SimpleImputer(missing_values=np.nan,
                                          strategy="mean")),
                      ("scaler", StandardScaler()),
                      (mdl, models[mdl])]) 

    for k in range(K):
        # train the model using all but the kth fold
        curr_mdl = estimator.fit(X[idxK != k, :],y[idxK != k])

        # get prediction on this dataset
        if mdl == 'lasso':
            curr_prob = curr_mdl.predict(X[idxK == k, :])
        else:
            curr_prob = curr_mdl.predict_proba(X[idxK == k, :])
            curr_prob = curr_prob[:,1]

        pred_val[mdl].append(curr_prob)
        tar_val[mdl].append(y[idxK == k])

        # calculate score (AUROC)
        curr_score = metrics.roc_auc_score(y[idxK == k], curr_prob)

        # add score to list of scores
        results_val[mdl].append(curr_score)

        # save the current model
        mdl_val[mdl].append(curr_mdl)

        print('{} - Finished fold {} of {}. AUROC {:0.3f}.'.format(dt.datetime.now(), k+1, K, curr_score))
        


# print final results
print('')
print('StudyName,SampleSize',end='')
for mdl in models:
    print(',{}'.format(mdl),end='')
print('')

print( '{},{}'.format(current_study, X.shape[0] ), end='' )

for i, mdl in enumerate(models):
    print(',{:0.6f}'.format( np.mean(results_val[mdl]) ), end='')

print('\n')

Reducing sample size from 52085 to 38687 (74.33%).

2021-04-13 15:17:49.245338 - Finished fold 1 of 5. AUROC 0.880.
2021-04-13 15:18:01.303536 - Finished fold 2 of 5. AUROC 0.889.
2021-04-13 15:18:13.936205 - Finished fold 3 of 5. AUROC 0.888.
2021-04-13 15:18:27.188823 - Finished fold 4 of 5. AUROC 0.886.
2021-04-13 15:18:40.073697 - Finished fold 5 of 5. AUROC 0.897.
2021-04-13 15:18:40.901380 - Finished fold 1 of 5. AUROC 0.869.
2021-04-13 15:18:41.740106 - Finished fold 2 of 5. AUROC 0.872.
2021-04-13 15:18:42.520162 - Finished fold 3 of 5. AUROC 0.864.
2021-04-13 15:18:43.362078 - Finished fold 4 of 5. AUROC 0.863.
2021-04-13 15:18:44.199939 - Finished fold 5 of 5. AUROC 0.882.

StudyName,SampleSize,xgb,logreg
baseline_withdrawal,38687,0.887931,0.870068

