### This is the first script to run after the pre-processing done in R

#### Train-val-test splits are already provided in csv files, so this script only creates the required .npy files

In [2]:
import pandas as pd
import numpy as np
import os
import pickle 
# Seed NumPy's global RNG so that the np.random.shuffle calls on the split ids are reproducible.
np.random.seed(1234)

from sklearn.preprocessing import OneHotEncoder

In [3]:
# Output directory for the generated .npy arrays and the pickled split ids.
save_path = "../Data/miiv_fullstays"
# makedirs also creates a missing parent directory ("../Data"), and with
# exist_ok=True it is a no-op when the directory is already there, so no
# separate (race-prone) existence check is needed.
os.makedirs(save_path, exist_ok=True)

In [4]:
# Folder holding the input CSV files (presumably written by the R pre-processing step).
load_path = "../load_ICU_datasets/miiv/"
# Show the folder contents as a quick check that the expected files are present.
available_files = os.listdir(load_path)
print(available_files)

['ts_data_mask.csv', 'static_data_scaled.csv', 'ts_data_scaled.csv', 'admissions.csv']


In [5]:
# Read the four input tables; the files are read in the same order as the
# names on the left-hand side.
df_adm, df_ts, df_ts_mask, df_static = (
    pd.read_csv(os.path.join(load_path, csv_name))
    for csv_name in (
        "admissions.csv",
        "ts_data_scaled.csv",
        "ts_data_mask.csv",
        "static_data_scaled.csv",
    )
)

In [8]:
# Label the rows of both per-stay tables by stay_id so they can be selected with .loc[ids].
df_adm = df_adm.set_index("stay_id")
df_static = df_static.set_index("stay_id")

In [9]:
# Stay ids of each split, taken from the splitType column of the admissions table.
train_ids = df_adm.loc[df_adm["splitType"] == "train"].index.values
val_ids = df_adm.loc[df_adm["splitType"] == "val"].index.values
test_ids = df_adm.loc[df_adm["splitType"] == "test"].index.values

# Randomise the order inside each split. The shuffles are in place and run in
# train -> val -> test order, which fixes how the seeded RNG stream is consumed.
np.random.shuffle(train_ids)
np.random.shuffle(val_ids)
np.random.shuffle(test_ids)

In [14]:
# Mortality labels: hospital_expire_flag of every stay, in the (shuffled) order of the split ids.
mortality_train = df_adm.loc[train_ids, "hospital_expire_flag"].values
mortality_val = df_adm.loc[val_ids, "hospital_expire_flag"].values
mortality_test = df_adm.loc[test_ids, "hospital_expire_flag"].values

In [17]:
def organize_static_features(enc, df, split_ids):
    """
    Build the static feature matrix for one split, one-hot encoding ``adm_categ``.

    The output columns are, in order: every column of ``df`` located before
    ``adm_categ``, the one-hot expansion of ``adm_categ``, then every column
    located after it.

    Parameters
    ----------
    enc : fitted encoder
        Object whose ``transform`` accepts an ``(n, 1)`` array and returns a
        sparse matrix exposing ``toarray()`` (e.g. a fitted ``OneHotEncoder``).
    df : pandas.DataFrame
        Static features; must contain an ``adm_categ`` column and be indexed
        by the labels used in ``split_ids``.
    split_ids : array-like
        Index labels of the rows to extract; the output rows keep this order.

    Returns
    -------
    numpy.ndarray
        2-D array with one row per entry of ``split_ids``.
    """
    # Look the column up in the frame that was passed in. Using the
    # module-level ``df_static`` here would give the wrong split point (or a
    # NameError) for any other frame.
    col_index_adm = np.where(df.columns == "adm_categ")[0][0]

    subset_df = df.loc[split_ids]

    before_adm = subset_df.iloc[:, :col_index_adm].values
    after_adm = subset_df.iloc[:, (col_index_adm + 1):].values

    # The encoder expects a 2-D column vector and returns a sparse matrix.
    adm_categ = subset_df.adm_categ.values.reshape(-1, 1)
    adm_onehot = enc.transform(adm_categ).toarray()

    all_static_features = np.c_[before_adm, adm_onehot, after_adm]
    return all_static_features

In [18]:
# Static features

# adm_categ is one-hot encoded. The encoder is fitted on every integer code
# from 0 up to the largest code found in the static table, so each split is
# expanded to the same number of columns.
max_adm_type = df_static["adm_categ"].max()
adm_encoder = OneHotEncoder()
adm_encoder.fit(np.arange(max_adm_type + 1)[:, np.newaxis])

# Build the static matrix of every split with the shared encoder.
static_train, static_val, static_test = (
    organize_static_features(adm_encoder, df_static, ids_of_split)
    for ids_of_split in (train_ids, val_ids, test_ids)
)

In [26]:
from tqdm import tqdm


def _collect_stay_timeseries(df, stay_ids, show_progress=False):
    """
    Return the per-stay time-series arrays of ``df`` in the order of ``stay_ids``.

    For every id the rows of that stay are sorted by ``charttime`` and all
    columns from the third one onwards are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format table with ``stay_id`` and ``charttime`` columns.
    stay_ids : iterable
        Stay ids to extract; an id without rows yields an empty array.
    show_progress : bool
        Wrap the iteration in a tqdm progress bar.

    Returns
    -------
    numpy.ndarray
        A regular stacked array when every stay has the same shape, otherwise
        a 1-D object array holding one 2-D array per stay.
    """
    # Group once instead of scanning the whole frame for every id; ``indices``
    # maps each stay_id to the positional row numbers of its rows, which keeps
    # the original row order inside each stay.
    rows_by_stay = df.groupby("stay_id", sort=False).indices
    no_rows = np.empty(0, dtype=np.intp)

    per_stay = []
    for id_ in (tqdm(stay_ids) if show_progress else stay_ids):
        stay_rows = df.iloc[rows_by_stay.get(id_, no_rows)]
        # Skip the first two columns (presumably stay_id and charttime -- TODO confirm).
        per_stay.append(stay_rows.sort_values(by="charttime").iloc[:, 2:].values)

    if len({seq.shape for seq in per_stay}) <= 1:
        return np.array(per_stay)

    # Stays of different length: build the object array explicitly. Implicit
    # ragged conversion via np.array(...) is deprecated and raises ValueError
    # on recent NumPy versions.
    ragged = np.empty(len(per_stay), dtype=object)
    for position, seq in enumerate(per_stay):
        ragged[position] = seq
    return ragged


#Time series features
timeseries_train = _collect_stay_timeseries(df_ts, train_ids, show_progress=True)
timeseries_val = _collect_stay_timeseries(df_ts, val_ids, show_progress=True)
timeseries_test = _collect_stay_timeseries(df_ts, test_ids, show_progress=True)

100%|██████████| 34546/34546 [02:16<00:00, 252.41it/s]
  
100%|██████████| 7403/7403 [00:29<00:00, 254.31it/s]
  
100%|██████████| 7402/7402 [00:26<00:00, 276.69it/s]


In [31]:
def _collect_stay_masks(df, stay_ids):
    """
    Return the per-stay mask arrays of ``df`` in the order of ``stay_ids``.

    For every id the rows of that stay are sorted by ``charttime`` and all
    columns from the third one onwards are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format table with ``stay_id`` and ``charttime`` columns.
    stay_ids : iterable
        Stay ids to extract; an id without rows yields an empty array.

    Returns
    -------
    numpy.ndarray
        A regular stacked array when every stay has the same shape, otherwise
        a 1-D object array holding one 2-D array per stay.
    """
    # Group once instead of scanning the whole frame for every id; ``indices``
    # maps each stay_id to the positional row numbers of its rows, which keeps
    # the original row order inside each stay.
    rows_by_stay = df.groupby("stay_id", sort=False).indices
    no_rows = np.empty(0, dtype=np.intp)

    per_stay = []
    for id_ in stay_ids:
        stay_rows = df.iloc[rows_by_stay.get(id_, no_rows)]
        # Skip the first two columns (presumably stay_id and charttime -- TODO confirm).
        per_stay.append(stay_rows.sort_values(by="charttime").iloc[:, 2:].values)

    if len({seq.shape for seq in per_stay}) <= 1:
        return np.array(per_stay)

    # Stays of different length: build the object array explicitly. Implicit
    # ragged conversion via np.array(...) is deprecated and raises ValueError
    # on recent NumPy versions.
    ragged = np.empty(len(per_stay), dtype=object)
    for position, seq in enumerate(per_stay):
        ragged[position] = seq
    return ragged


#Time series masks
timeseries_train_mask = _collect_stay_masks(df_ts_mask, train_ids)
timeseries_val_mask = _collect_stay_masks(df_ts_mask, val_ids)
timeseries_test_mask = _collect_stay_masks(df_ts_mask, test_ids)

  
  if sys.path[0] == '':


In [32]:
# Write every array to save_path, one .npy file per array.
# Files are written in the order listed: labels, static features,
# time series, then time-series masks.
arrays_to_save = {
    "mortality_train.npy": mortality_train,
    "mortality_val.npy": mortality_val,
    "mortality_test.npy": mortality_test,
    "static_train.npy": static_train,
    "static_val.npy": static_val,
    "static_test.npy": static_test,
    "timeseries_train.npy": timeseries_train,
    "timeseries_val.npy": timeseries_val,
    "timeseries_test.npy": timeseries_test,
    "timeseries_train_mask.npy": timeseries_train_mask,
    "timeseries_val_mask.npy": timeseries_val_mask,
    "timeseries_test_mask.npy": timeseries_test_mask,
}
for file_name, array in arrays_to_save.items():
    np.save(os.path.join(save_path, file_name), array)

In [33]:
# Map each split name to its (shuffled) array of stay ids.
dict_splits = {
    "train": train_ids,
    "val": val_ids,
    "test": test_ids,
}

In [34]:
import pickle

# Store the split -> stay ids mapping next to the arrays, so the row order of
# the saved .npy files can be mapped back to stay ids.
split_ids_file = os.path.join(save_path, "split_stay_ids.pkl")
with open(split_ids_file, mode="wb") as handle:
    pickle.dump(dict_splits, handle, protocol=pickle.HIGHEST_PROTOCOL)