In [1]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm

In [2]:
# Names of the structured (non-text) feature columns.
# The length of this list is used further down to compute n_dim, so adding or
# removing an entry changes the width of the mean/std arrays built later.
struc_columns = [
    'heartrate', 'spo2', 'tempc',
    'sysbp', 'meanbp', 'diasbp',
    'resprate', 'baseexcess', 'bicarbonate',
    'fio2', 'pf', 'pco2',
    'so2', 'bun', 'calcium',
    'chloride', 'creatinine', 'glucose',
    'lactate', 'potassium', 'bilirubin_total',
    'hct', 'hgb', 'ptt',
    'wbc', 'platelets', 'admission_age',
]



In [3]:
# Discover the ICU stays from the per-stay CSV files in `inputs/`.
# Only `*.csv` entries are considered: the loading loop later reads
# 'inputs/<id>.csv', and stray entries such as `.ipynb_checkpoints` or
# `.DS_Store` would otherwise make int('') raise ValueError.
# A CSV whose name does not start with an integer still raises, on purpose.
icustays = sorted(
    int(file_name.split('.')[0])
    for file_name in os.listdir('inputs')
    if file_name.endswith('.csv')
)

# Map each icustay id to its position in the sorted list; that position is the
# row index used for every per-stay array built in this notebook.
icustay_id_2_index = {icustay_id: i for i, icustay_id in enumerate(icustays)}

# Persist the mapping as '<icustay_id> <index>' lines, in ascending id order.
with open('ids/icustay_id_2_index.txt', 'w') as fp:
    for icu, index in icustay_id_2_index.items():
        fp.write(f'{icu} {index}\n')

In [4]:
# Width of the BERT embedding part of the feature vector, named here so the
# number is not an unexplained literal.
bert_dim = 768

# Total feature width: structured variables + BERT dimension.
n_dim = len(struc_columns) + bert_dim

# Number of prediction targets.
n_class = 1

In [6]:
num_folds = 8

# Number of structured statistics expected in each means/stds file, and the
# number of trailing dimensions that have no statistics of their own.
# Derived from n_dim so the padding always matches the arrays allocated below.
n_struct = len(struc_columns)
n_pad = n_dim - n_struct


def _read_stay_indices(path):
    """Read one icustay id per line from `path` and return their row indices.

    Each id is translated through `icustay_id_2_index`; an id missing from that
    mapping raises KeyError. Blank lines (e.g. a trailing newline) are skipped.
    """
    with open(path, 'r') as file:
        return [icustay_id_2_index[int(line.strip())] for line in file if line.strip()]


def _read_padded_stats(path, pad_value):
    """Read per-feature statistics from `path` and pad them to length n_dim.

    The second comma-separated field of every non-blank line is parsed as a
    float. The result is extended with `n_pad` copies of `pad_value`.

    Raises:
        ValueError: if the file does not contain exactly one value per
            structured column, which would otherwise only surface later as an
            opaque broadcasting error.
    """
    with open(path, 'r') as file:
        values = [float(line.split(',')[1].strip()) for line in file if line.strip()]
    if len(values) != n_struct:
        raise ValueError(
            '%s has %d values, expected %d (one per structured column)'
            % (path, len(values), n_struct)
        )
    return np.array(values + [float(pad_value)] * n_pad)


# The held-out test split is shared by every fold.
test_ids = _read_stay_indices('ids/test_ids.txt')

fold_dict = {
    fold: {
        'ids': {'train': None, 'validation': None, 'test': np.array(test_ids)},
        'means': None,
        'stds': None,
    }
    for fold in range(num_folds)
}

for fold in range(num_folds):
    fold_dict[fold]['ids']['train'] = np.array(_read_stay_indices('ids/fold_%i_train.txt' % fold))
    # The per-fold "test" file is used as the validation split.
    fold_dict[fold]['ids']['validation'] = np.array(_read_stay_indices('ids/fold_%i_test.txt' % fold))

    # The padded tail gets mean 0 and std 1; presumably so that standardising
    # with these vectors leaves those dimensions unchanged -- confirm downstream.
    fold_dict[fold]['means'] = _read_padded_stats('ids/fold_%i_means.csv' % fold, 0)
    fold_dict[fold]['stds'] = _read_padded_stats('ids/fold_%i_stds.csv' % fold, 1)

# (num_folds, 3) object array; columns are train / validation / test index arrays.
# Filled slot by slot: np.array([...], dtype=object) would instead build a 2-D
# array whenever the three splits happen to have the same length.
fold_taskname = np.empty((num_folds, 3), dtype=object)
for fold in range(num_folds):
    for slot, split in enumerate(('train', 'validation', 'test')):
        fold_taskname[fold, slot] = fold_dict[fold]['ids'][split]

# Only slot 0 of the second axis is filled below; slots 1 and 2 stay NaN.
mean_taskname = np.full((num_folds, 3, n_dim), np.nan)
std_taskname = np.full((num_folds, 3, n_dim), np.nan)

for fold in range(num_folds):
    mean_taskname[fold, 0] = fold_dict[fold]['means']
    std_taskname[fold, 0] = fold_dict[fold]['stds']

In [7]:
# Write the fold index arrays and the per-fold mean/std arrays into one archive.
# fold_taskname is an object array, so it is stored pickled and reading it back
# needs np.load(..., allow_pickle=True).
np.savez(
    'fold.npz',
    fold_taskname=fold_taskname,
    mean_taskname=mean_taskname,
    std_taskname=std_taskname,
)

In [8]:
# Per-stay containers; position k corresponds to icustays[k].
inputs = []
maskings = []
timestamps = []
labels = []

# Outcome-related columns that are removed from the feature matrix.
outcome_columns = ['sepsis_in_4_hours', 'sepsis4_or_discharge_hr', 'sepsis_onset']

for icustay_id in tqdm(icustays):

    file_path = os.path.join('inputs', str(icustay_id) + '.csv')
    X = pd.read_csv(file_path, index_col=0)

    # Stay-level label: the maximum of the 'sepsis_onset' column over all rows.
    y = X['sepsis_onset'].max()
    X = X.drop(columns=outcome_columns)

    # The CSV's first column (the index) is kept as the per-row timestamp.
    s = np.array(X.index)

    # Keep only columns that are not text-embedding columns.
    # NOTE(review): n_dim and the padded means/stds include 768 extra
    # dimensions, but the embedding columns are dropped here -- confirm that
    # this mismatch is intended.
    X = X.loc[:, [col for col in X.columns if '_clinical_kb_bert' not in col]]

    # 1 where a value is observed, 0 where it is missing. This vectorised cast
    # replaces DataFrame.applymap(int), which calls Python once per cell and is
    # deprecated in recent pandas.
    M = X.notna().values.astype(np.int64)
    X = X.values

    inputs.append(X)
    maskings.append(M)
    timestamps.append(s)
    labels.append(y)

  0%|          | 0/28102 [00:00<?, ?it/s]

100%|██████████| 28102/28102 [30:20<00:00, 15.44it/s]


In [9]:
def _to_object_array(items):
    """Return a 1-D object array whose k-th element is items[k].

    np.array(items, dtype=object) only gives this layout when the element
    shapes differ. If every element has the same shape it builds one N-D
    array instead, and if only the leading dimensions agree it can raise
    ValueError. Filling a pre-allocated array avoids both cases.
    """
    packed = np.empty(len(items), dtype=object)
    for position, item in enumerate(items):
        packed[position] = item
    return packed


# One entry per ICU stay, in the order of `icustays`. The Python lists are
# deleted as soon as they have been packed.
# Note: `input` shadows the builtin of the same name; the name is kept because
# it is also the key stored in data.npz.
input = _to_object_array(inputs)
del inputs
masking = _to_object_array(maskings)
del maskings
timestamp = _to_object_array(timestamps)
del timestamps
label_taskname = np.array(labels)
del labels

# The first three arrays are object arrays, so they are stored pickled and
# reading them back needs np.load(..., allow_pickle=True).
np.savez('data.npz', input = input, masking = masking, timestamp = timestamp, label_taskname = label_taskname)