
## Data Preprocessing (used from https://github.com/MLforHealth/MIMIC_Extract)

In [None]:
from __future__ import print_function, division
import copy, math, os, pickle, time, pandas as pd, numpy as np, scipy.stats as ss

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import average_precision_score, roc_auc_score, accuracy_score, f1_score

import torch, torch.utils.data as utils, torch.nn as nn, torch.nn.functional as F, torch.optim as optim
from torch.autograd import Variable
from torch.nn.parameter import Parameter


from mmd_grud_utils import *

In [3]:
DATA_FILEPATH     = 'all_hourly_data.h5'
RAW_DATA_FILEPATH = 'all_hourly_data.h5'
GAP_TIME          = 6  # In hours
WINDOW_SIZE       = 24 # In hours
SEED              = 1
ID_COLS           = ['subject_id', 'hadm_id', 'icustay_id']
GPU               = '2'

os.environ['CUDA_VISIBLE_DEVICES'] = GPU
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x143d4ae1570>

In [4]:
class DictDist():
    def __init__(self, dict_of_rvs): self.dict_of_rvs = dict_of_rvs
    def rvs(self, n):
        a = {k: v.rvs(n) for k, v in self.dict_of_rvs.items()}
        out = []
        for i in range(n): out.append({k: vs[i] for k, vs in a.items()})
        return out
    
class Choice():
    def __init__(self, options): self.options = options
    def rvs(self, n): return [self.options[i] for i in ss.randint(0, len(self.options)).rvs(n)]

In [5]:
%%time
data_full_lvl2 = pd.read_hdf(DATA_FILEPATH, 'vitals_labs')
data_full_raw  = pd.read_hdf(RAW_DATA_FILEPATH, 'vitals_labs') 
statics        = pd.read_hdf(DATA_FILEPATH, 'patients')

Wall time: 58.7 s


In [6]:
statics = statics[statics.age<= 100]
data_full_lvl2 = data_full_lvl2.loc[statics.index.get_level_values(0).unique()]
data_full_raw = data_full_raw.loc[statics.index.get_level_values(0).unique()]

statics.drop(27994,axis = 0,inplace = True)
data_full_lvl2.drop(27994,axis = 0,inplace = True)
data_full_raw.drop(27994,axis = 0,inplace = True)

In [10]:
def simple_imputer(df):
    idx = pd.IndexSlice
    df = df.copy()
    if len(df.columns.names) > 2: df.columns = df.columns.droplevel(('label', 'LEVEL1', 'LEVEL2'))
    
    df_out = df.loc[:, idx[:, ['mean', 'count']]]
    icustay_means = df_out.loc[:, idx[:, 'mean']].groupby(ID_COLS).mean()
    
    df_out.loc[:,idx[:,'mean']] = df_out.loc[:,idx[:,'mean']].groupby(ID_COLS).fillna(
        method='ffill'
    ).groupby(ID_COLS).fillna(icustay_means).fillna(0)
    
    df_out.loc[:, idx[:, 'count']] = (df.loc[:, idx[:, 'count']] > 0).astype(float)
    df_out.rename(columns={'count': 'mask'}, level='Aggregation Function', inplace=True)
    
    is_absent = (1 - df_out.loc[:, idx[:, 'mask']])
    hours_of_absence = is_absent.cumsum()
    time_since_measured = hours_of_absence - hours_of_absence[is_absent==0].fillna(method='ffill')
    time_since_measured.rename(columns={'mask': 'time_since_measured'}, level='Aggregation Function', inplace=True)

    df_out = pd.concat((df_out, time_since_measured), axis=1)
    df_out.loc[:, idx[:, 'time_since_measured']] = df_out.loc[:, idx[:, 'time_since_measured']].fillna(100)
    
    df_out.sort_index(axis=1, inplace=True)
    return df_out

In [11]:
Ys = statics[statics.max_hours > WINDOW_SIZE + GAP_TIME][['mort_hosp', 'mort_icu', 'los_icu']]
Ys['los_3'] = Ys['los_icu'] > 3
Ys['los_7'] = Ys['los_icu'] > 7
Ys.drop(columns=['los_icu'], inplace=True)
Ys.astype(float)

lvl2, raw = [df[
    (df.index.get_level_values('icustay_id').isin(set(Ys.index.get_level_values('icustay_id')))) &
    (df.index.get_level_values('hours_in') < WINDOW_SIZE)
] for df in (data_full_lvl2, data_full_raw)]

#raw.columns = raw.columns.droplevel(level=['label', 'LEVEL1', 'LEVEL2'])

train_frac, dev_frac, test_frac = 0.7, 0.1, 0.2
lvl2_subj_idx, raw_subj_idx, Ys_subj_idx = [df.index.get_level_values('subject_id') for df in (lvl2, raw, Ys)]
lvl2_subjects = set(lvl2_subj_idx)
assert lvl2_subjects == set(Ys_subj_idx), "Subject ID pools differ!"
assert lvl2_subjects == set(raw_subj_idx), "Subject ID pools differ!"

np.random.seed(SEED)
subjects, N = np.random.permutation(list(lvl2_subjects)), len(lvl2_subjects)
N_train, N_dev, N_test = int(train_frac * N), int(dev_frac * N), int(test_frac * N)
train_subj = subjects[:N_train]
dev_subj   = subjects[N_train:N_train + N_dev]
test_subj  = subjects[N_train+N_dev:]

[(lvl2_train, lvl2_dev, lvl2_test), (raw_train, raw_dev, raw_test), (Ys_train, Ys_dev, Ys_test)] = [
    [df[df.index.get_level_values('subject_id').isin(s)] for s in (train_subj, dev_subj, test_subj)] \
    for df in (lvl2, raw, Ys)
]

idx = pd.IndexSlice
lvl2_means, lvl2_stds = lvl2_train.loc[:, idx[:,'mean']].mean(axis=0), lvl2_train.loc[:, idx[:,'mean']].std(axis=0)
raw_means, raw_stds = raw_train.loc[:, idx[:,'mean']].mean(axis=0), raw_train.loc[:, idx[:,'mean']].std(axis=0)

lvl2_train.loc[:, idx[:,'mean']] = (lvl2_train.loc[:, idx[:,'mean']] - lvl2_means)/lvl2_stds
lvl2_dev.loc[:, idx[:,'mean']] = (lvl2_dev.loc[:, idx[:,'mean']] - lvl2_means)/lvl2_stds
lvl2_test.loc[:, idx[:,'mean']] = (lvl2_test.loc[:, idx[:,'mean']] - lvl2_means)/lvl2_stds

raw_train.loc[:, idx[:,'mean']] = (raw_train.loc[:, idx[:,'mean']] - raw_means)/raw_stds
raw_dev.loc[:, idx[:,'mean']] = (raw_dev.loc[:, idx[:,'mean']] - raw_means)/raw_stds
raw_test.loc[:, idx[:,'mean']] = (raw_test.loc[:, idx[:,'mean']] - raw_means)/raw_stds

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, val, pi)


In [12]:
raw_train, raw_dev, raw_test, lvl2_train, lvl2_dev, lvl2_test = [
    simple_imputer(df) for df in (raw_train, raw_dev, raw_test, lvl2_train, lvl2_dev, lvl2_test)
]
raw_flat_train, raw_flat_dev, raw_flat_test, lvl2_flat_train, lvl2_flat_dev, lvl2_flat_test = [
    df.pivot_table(index=['subject_id', 'hadm_id', 'icustay_id'], columns=['hours_in']) for df in (
        raw_train, raw_dev, raw_test, lvl2_train, lvl2_dev, lvl2_test
    )
]

for df in lvl2_train, lvl2_dev, lvl2_test, raw_train, raw_dev, raw_test: assert not df.isnull().any().any()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


### Add los>7 and los>3 classes

In [13]:
Ys = statics[statics.max_hours > WINDOW_SIZE + GAP_TIME][['mort_hosp', 'mort_icu', 'los_icu']]
Ys['los_3'] = Ys['los_icu'] > 3
Ys['los_7'] = Ys['los_icu'] > 7
Ys.drop(columns=['los_icu'], inplace=True)
Ys.astype(float)
[(Ys_train, Ys_dev, Ys_test)] = [
    [df[df.index.get_level_values('subject_id').isin(s)] for s in (train_subj, dev_subj, test_subj)] \
    for df in (Ys,)
]

train_subject_id = lvl2_train.index.get_level_values(0)
dev_subject_id = lvl2_dev.index.get_level_values(0)
test_subject_id = lvl2_test.index.get_level_values(0)

intervention = pd.read_hdf('all_hourly_data.h5', 'interventions')
patients = pd.read_hdf('all_hourly_data.h5', 'patients')

### Normalize patient age and categorise ethnicities



In [17]:
patients_max_age = patients.age.max()
patients['age'] = patients.age/patients_max_age

def categorize_ethnicity(ethnicity):
    if 'ASIAN' in ethnicity:
        ethnicity = 'ASIAN'
    elif 'WHITE' in ethnicity:
        ethnicity = 'WHITE'
    elif 'HISPANIC' in ethnicity:
        ethnicity = 'HISPANIC/LATINO'
    elif 'BLACK' in ethnicity:
        ethnicity = 'BLACK'
#    elif 'AMERICAN INDIAN' in ethnicity:
#        ethnicity = 'AMERICAN INDIAN'
    else: 
        ethnicity = 'OTHER'
    return ethnicity

patients['ethnicity'] = patients.ethnicity.apply(lambda x : categorize_ethnicity(x))

### Save files as pickle

In [22]:
intervention_train = intervention.loc[train_subject_id.unique()]
patients_train = patients.loc[train_subject_id.unique()]

intervention_test = intervention.loc[test_subject_id.unique()]
patients_test = patients.loc[test_subject_id.unique()]

intervention_dev = intervention.loc[dev_subject_id.unique()]
patients_dev = patients.loc[dev_subject_id.unique()]

lvl2_train.to_pickle('vital_train.pkl')
lvl2_dev.to_pickle('vital_dev.pkl')
lvl2_test.to_pickle('vital_test.pkl')

intervention_train.to_pickle('intervention_train.pkl')
intervention_test.to_pickle('intervention_test.pkl')
intervention_dev.to_pickle('intervention_dev.pkl')

patients_train = patients_train[['gender','ethnicity','age','insurance']]
patients_test = patients_test[['gender','ethnicity','age','insurance']]
patients_dev = patients_dev[['gender','ethnicity','age','insurance']]

patients_train.to_pickle('patients_train.pkl')
patients_test.to_pickle('patients_test.pkl')
patients_dev.to_pickle('patients_dev.pkl')

Ys_train.to_pickle('y_train.pkl')
Ys_test.to_pickle('y_test.pkl')
Ys_dev.to_pickle('y_dev.pkl')

### Save as .h5 file

In [None]:
hdf_file = 'mimic3Processed.h5'

patients_train.to_hdf(hdf_file,format = "table",key = "patients_train")
patients_dev.to_hdf(hdf_file,format = "table",key = "patients_dev")
patients_test.to_hdf(hdf_file,format = "table",key = "patients_test")

vitals_train.to_hdf(hdf_file,format = "fixed",key = "vitals_train")
vitals_dev.to_hdf(hdf_file,format = "fixed",key = "vitals_dev")
vitals_test.to_hdf(hdf_file,format = "fixed",key = "vitals_test")

intervention_train.to_hdf(hdf_file,format = "table",key = "intervention_train")
intervention_dev.to_hdf(hdf_file,format = "table",key = "intervention_dev")
intervention_test.to_hdf(hdf_file,format = "table",key = "intervention_test")

y_train.to_hdf(hdf_file,format = "table",key = "y_train")
y_dev.to_hdf(hdf_file,format = "table",key = "y_dev")
y_test.to_hdf(hdf_file,format = "table",key = "y_test")