In [1]:
import pandas as pd
from itertools import islice
import random
import numpy as np
import json
import os
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedShuffleSplit

# Seed Python's and NumPy's global random generators once, up front, so that
# later steps that draw from them behave the same on a fresh top-to-bottom run.
seed = 250
random.seed(seed)
np.random.seed(seed)

# seaborn is imported under the alias `sn`; here it is only used to set the plot theme.
import seaborn as sn
sn.set_theme(style="white", palette="rocket_r")

# Colab-only: mount Google Drive at /content/gdrive so the data files are reachable.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
cd ..

/


In [3]:
cd 'content/gdrive/MyDrive/DynaGraph/HIRID'

/content/gdrive/MyDrive/DynaGraph/HIRID


In [4]:
# Load the index table from the current working directory.
# NOTE(review): `index` is not referenced again in the cells visible here — confirm it is still needed.
index = pd.read_csv('imputed_stage_index.csv')

In [5]:
# Human-readable names of the 18 monitored signals.
# NOTE(review): presumably listed in the same order as the vm*/pm* columns of `frame` — confirm.
variable_names = [
    # vital signs / monitoring
    'heart rate',
    'ABP systolic',
    'ABP diastolic',
    'ABP mean',
    'CO',
    'SpO2',
    'RASS',
    'Ventilator peak pressure',
    # laboratory values
    'Lactate (blood gas)',
    'venous lactate',
    'INR',
    'Blood Glucose',
    'C-reactive protein',
    # medication
    'Dobutamine',
    'Milrinone',
    'Levosimendan',
    'Theophyllin',
    'Other pain killers',
]

In [6]:
import glob
import os

# Directory (relative to the working directory) holding the per-part CSV exports.
# NOTE(review): `dir` shadows the built-in of the same name; the name is kept in case
# other cells read it.
dir = 'csv'

li = []

# Sort the listing: os.listdir() returns entries in arbitrary order, which would make
# the row order of `frame` (and everything derived from it) vary between runs.
for file in sorted(os.listdir(dir)):
  if file.endswith(".csv"):
    # Read from the directory that was listed rather than a second hard-coded path.
    df = pd.read_csv(os.path.join(dir, file), index_col=None, header=0)
    li.append(df)

# Fail with a clear message instead of pd.concat's generic "No objects to concatenate".
if not li:
  raise FileNotFoundError(f"no .csv files found in {os.path.abspath(dir)}")

# Stack all parts into a single frame with a fresh 0..n-1 row index.
frame = pd.concat(li, axis=0, ignore_index=True)

In [7]:
# Move the running row number into an 'index' column, key the rows by
# (patientid, reldatetime), then rename the two index levels to (patient, time).
frame = (
    frame
    .reset_index()
    .set_index(['patientid', 'reldatetime'])
    .rename_axis(['patient', 'time'])
)

In [8]:
# Discard the leftover 'index' column.
frame = frame.drop(columns=['index'])

In [9]:
# Static per-patient table, indexed by patient id; the index level is renamed to 'patient'.
static = (
    pd.read_csv('general_table.csv', index_col=['patientid'])
    .rename_axis(['patient'])
)

In [10]:
# Discard the 'admissiontime' column.
static = static.drop(columns=['admissiontime'])

In [11]:
# Encode sex as 1 (M) / 0 (F). The result is assigned back to the column because
# static['sex'].replace(..., inplace=True) is a chained assignment: recent pandas
# warns about it and, with Copy-on-Write enabled, it leaves `static` unchanged.
static['sex'] = static['sex'].replace(['M', 'F'], [1, 0])

In [12]:
# Encode the outcome as 0 (alive) / 1 (dead). The result is assigned back to the column
# because static[...].replace(..., inplace=True) is a chained assignment: recent pandas
# warns about it and, with Copy-on-Write enabled, it leaves `static` unchanged.
static['discharge_status'] = static['discharge_status'].replace(['alive', 'dead'], [0, 1])

In [13]:
# Display the static table after encoding (sex, age, discharge_status per patient).
static

Unnamed: 0_level_0,sex,age,discharge_status
patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,75,0.0
2,1,80,0.0
3,1,55,0.0
4,1,75,0.0
5,1,20,0.0
...,...,...,...
33901,1,70,0.0
33902,0,75,0.0
33903,1,65,0.0
33904,0,70,0.0


In [14]:
# Keep only patients whose static row has no missing value.
static = static.dropna()

In [15]:
# Display the time-series table, indexed by (patient, time).
frame

Unnamed: 0_level_0,Unnamed: 1_level_0,vm1,vm3,vm4,vm5,vm13,vm20,vm28,vm62,vm136,vm146,vm172,vm174,vm176,pm41,pm42,pm43,pm44,pm87
patient,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
190,0.0,70.0,125.0,75.0,90.0,5.2,100.0,1.0,0.00,1.0,1.0,0.95,5.0,4.0,0.0,0.0,0.0,0.0,0.0
190,0.0,70.0,125.0,75.0,90.0,5.3,100.0,1.0,0.00,1.0,1.0,0.95,5.0,4.0,0.0,0.0,0.0,0.0,0.0
190,300.0,69.0,125.0,75.0,90.0,5.2,100.0,1.0,0.00,1.0,1.0,0.95,5.0,4.0,0.0,0.0,0.0,0.0,0.0
190,300.0,69.0,125.0,75.0,90.0,5.3,100.0,1.0,0.00,1.0,1.0,0.95,5.0,4.0,0.0,0.0,0.0,0.0,0.0
190,600.0,69.0,125.0,75.0,90.0,5.3,99.0,1.0,0.00,1.0,1.0,0.95,5.0,4.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33088,161100.0,82.0,128.0,65.0,87.0,5.2,98.0,0.0,16.25,0.8,2.1,1.20,8.1,176.0,0.0,0.0,0.0,0.0,0.0
33088,161400.0,79.0,127.0,65.0,85.0,5.2,98.0,0.0,16.25,0.8,2.1,1.20,8.1,176.0,0.0,0.0,0.0,0.0,0.0
33088,161700.0,75.0,117.0,60.0,79.0,5.2,98.0,0.0,16.25,0.8,2.1,1.20,8.1,176.0,0.0,0.0,0.0,0.0,0.0
33088,162000.0,75.0,111.0,58.0,76.0,5.2,97.0,0.0,16.25,0.8,2.1,1.20,8.1,176.0,0.0,0.0,0.0,0.0,1.0


In [16]:
# Pull 'time' out of the index so it can be edited as an ordinary column.
frame = frame.reset_index(level=1)

# Convert the time-stamps into minutes
frame['time'] = frame['time'].astype(int) / 60

# Rebuild the (patient, time) MultiIndex from the converted column.
frame = frame.reset_index().set_index(['patient', 'time'])

In [17]:
# Number of distinct patient ids in the time-series table.
len(frame.index.get_level_values(0).unique())

33905

In [18]:
# Start with every patient labelled 0 (no circulatory failure); positives are set afterwards.
static = static.assign(circulatory_failure=0)

In [19]:
# Flag every row with vm5 < 65 and vm136 > 2 and collect the (patient, time) index
# tuples of those rows.
# NOTE(review): presumably vm5 is mean arterial pressure and vm136 is lactate — confirm.
circulatory_failure_rows = (frame['vm5'] < 65) & (frame['vm136'] > 2)
patients_with_circulatory_failure = frame.index[circulatory_failure_rows].tolist()

In [20]:
# Keep only the patient id (first element) of each (patient, time) tuple.
patients_with_circulatory_failure = [
    patient for patient, _time in patients_with_circulatory_failure
]

In [21]:
# Collapse repeated ids so each patient appears once.
patients_with_circulatory_failure = [*set(patients_with_circulatory_failure)]

In [22]:
# Number of distinct patients with at least one flagged row.
len(patients_with_circulatory_failure)

11178

In [23]:
# Set the label to 1 for every patient in `static` whose id is in the list;
# listed ids that are absent from `static` are ignored.
has_circulatory_failure = static.index.isin(patients_with_circulatory_failure)
static.loc[has_circulatory_failure, 'circulatory_failure'] = 1

In [24]:
# Attach the static per-patient columns to every time-series row by joining on the index.
# merge() defaults to an inner join, so rows whose patient is missing from `static`
# (for example, removed by the earlier dropna) are dropped here.
# NOTE(review): the row order of the result depends on how pandas joins a MultiIndex with a
# single-level index — confirm before relying on patient order downstream.
final = frame.merge(static, left_index=True, right_index=True)

In [25]:
# Cast every column to floating point.
final = final.astype(float)

In [26]:
!pip install scikit-multilearn
from skmultilearn.model_selection import iterative_train_test_split



In [27]:
# One label row per patient: take the two outcome columns, drop the time level,
# and keep the first row seen for each patient.
label_columns = ['discharge_status', 'circulatory_failure']
y = final[label_columns].droplevel(1)
y = y.loc[~y.index.duplicated(keep='first')]

In [28]:
# Model inputs: everything in `final` except the two label columns.
X = final.drop(columns=['discharge_status', 'circulatory_failure'])

In [29]:
# Unique patient ids in order of first appearance, repeated into two identical columns
# so the ids form a 2-D array of shape (n_patients, 2).
patient_ids = X.index.get_level_values(0).drop_duplicates(keep='first')
X_ind = np.tile(patient_ids, (2, 1)).transpose()

In [30]:
# Split the patient ids (not the raw rows) into train, validation and test sets.

# Labels as a plain 2-D array; rows are in the same order as X_ind.
y = y.to_numpy()

# First hold out 20% of the patients as the test set, then 20% of the remainder as the
# validation set (roughly 64% / 16% / 20% train / val / test overall).
# NOTE(review): no random_state is passed; reproducibility presumably relies on the global
# NumPy seed set at the top of the notebook — confirm.
X_train_ind, y_train, X_test_ind, y_test = iterative_train_test_split(X_ind, y, test_size = 0.2)
X_train_ind, y_train, X_val_ind, y_val = iterative_train_test_split(X_train_ind, y_train, test_size = 0.2)

In [31]:
# Drop the intermediate tables that are no longer referenced.
del frame, static, final

In [32]:
# Index X by patient only, then pull out the rows belonging to each split's patients.
X = X.reset_index(level=1)
X_train, X_val, X_test = (
    X.loc[split_ids[:, 0].tolist()]
    for split_ids in (X_train_ind, X_val_ind, X_test_ind)
)

In [33]:
# Put each split back on a (patient, time) MultiIndex.
split_index_levels = ['patient', 'time']
X_train = X_train.reset_index().set_index(split_index_levels)
X_val = X_val.reset_index().set_index(split_index_levels)
X_test = X_test.reset_index().set_index(split_index_levels)

In [34]:
# create time-series input for LSTM of shape [n, timestep, features]
def split_sequence(dataframe, n_steps):
    """Turn a per-patient time-series table into a fixed-length 3-D array.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Rows indexed by a MultiIndex whose first level identifies the patient.
        Every column is used as a feature; rows of one patient are taken in the
        order they appear in the frame.
    n_steps : int
        Number of time steps each output sequence must have.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n_patients, n_steps, n_features)``. Patients are
        ordered by ascending first-level index value. A patient with more than
        ``n_steps`` rows keeps only the last ``n_steps``; a patient with fewer rows
        is padded at the front with copies of their first row.
    """
    # Take the feature count from the data instead of assuming a fixed number of columns.
    n_features = dataframe.shape[1]

    # One group per patient actually present, iterated in ascending id order.
    per_patient = dataframe.groupby(level=0, sort=True)

    lstm_input = np.full((per_patient.ngroups, n_steps, n_features), np.nan)
    for row, (_, sample) in enumerate(per_patient):
        sequence = sample.to_numpy()
        time_length = sequence.shape[0]

        if n_steps > time_length:
            # Too short: repeat the earliest observation in front of the sequence.
            padding = np.repeat(sequence[:1], n_steps - time_length, axis=0)
            sequence = np.vstack((padding, sequence))
        else:
            # Long enough: keep the most recent n_steps observations.
            sequence = sequence[-n_steps:]
        lstm_input[row] = sequence

    return lstm_input

In [35]:
# Extract data into LSTM timeseries format with 288 5-minute timesteps
X_train = split_sequence(X_train, 288)
X_val = split_sequence(X_val, 288)
X_test = split_sequence(X_test, 288)

# split_sequence emits patients in ascending id order, whereas y_* (and X_*_ind) are still
# in the order returned by the split. Sort the labels and id arrays by patient id so that
# row i of X_* and row i of y_* always describe the same patient (a no-op if already sorted).
train_order = np.argsort(X_train_ind[:, 0], kind='stable')
val_order = np.argsort(X_val_ind[:, 0], kind='stable')
test_order = np.argsort(X_test_ind[:, 0], kind='stable')
X_train_ind, y_train = X_train_ind[train_order], y_train[train_order]
X_val_ind, y_val = X_val_ind[val_order], y_val[val_order]
X_test_ind, y_test = X_test_ind[test_order], y_test[test_order]

# Every input sequence must have exactly one label row.
assert len(X_train) == len(y_train)
assert len(X_val) == len(y_val)
assert len(X_test) == len(y_test)

In [36]:
# Sanity check across all three splits (not only the test set): True if any array still holds a NaN.
any(np.isnan(split).any() for split in (X_train, X_val, X_test))

False

In [37]:
# Write every split to the working directory; np.save appends '.npy' to each name.
for filename, array in (
    ('X_train_multilabel_full_250_HIRID', X_train),
    ('X_val_multilabel_full_250_HIRID', X_val),
    ('X_test_multilabel_full_250_HIRID', X_test),
    ('y_train_multilabel_full_250_HIRID', y_train),
    ('y_val_multilabel_full_250_HIRID', y_val),
    ('y_test_multilabel_full_250_HIRID', y_test),
):
    np.save(filename, array)

ML

In [4]:
cd HIRID-ICU-Benchmark/

/content/gdrive/MyDrive/DynaGraph/HIRID/HIRID-ICU-Benchmark


In [5]:
!pip install pathos==0.2.9
!pip install scikit-fda==0.5
!pip install tqdm==4.60.0

Collecting pathos==0.2.9
  Downloading pathos-0.2.9-py3-none-any.whl (76 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/76.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.9/76.9 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ppft>=1.7.6.5 (from pathos==0.2.9)
  Downloading ppft-1.7.6.8-py3-none-any.whl (56 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/56.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.8/56.8 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dill>=0.3.5.1 (from pathos==0.2.9)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pox>=0.3.1 (from pathos==0.2.9)
  Downloading pox-0.3.4-py3-none-any.whl (29 kB)
Collecting multiprocess>=0.

In [12]:
import warnings

warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

In [19]:
# Run the benchmark's `preprocess` command in a shell: raw data is read from Data/,
# results go to Preprocessed_Data, variable reference and split definitions come from the
# repository's preprocessing/resources folder, and 8 workers are requested.
# NOTE: the run recorded below stopped with an AssertionError raised in
# icu_benchmarks/preprocessing/merge.py (combine_obs_and_pharma_tables).
!python icu_benchmarks/run.py preprocess --hirid-data-root 'Data/' \
                          --work-dir 'Preprocessed_Data' \
                          --var-ref-path ./preprocessing/resources/varref.tsv \
                          --split-path ./preprocessing/resources/split.tsv \
                          --nr-workers 8

2024-04-07 14:58:20,295 - INFO: Using extended general table in Preprocessed_Data/general_table_extended.parquet
2024-04-07 14:58:20,297 - INFO: Running merge step...
2024-04-07 14:58:20,416 - INFO: Reading general table from Preprocessed_Data/general_table_extended.parquet
2024-04-07 14:58:20,470 - INFO: start processing using 8 worker
0it [08:08, ?it/s]
multiprocess.pool.RemoteTraceback: 
"""
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/multiprocess/pool.py", line 125, in worker
    result = (True, func(*args, **kwds))
  File "/content/gdrive/MyDrive/DynaGraph/HIRID/HIRID-ICU-Benchmark/icu_benchmarks/common/processing.py", line 24, in _process_parts
    df_ret = combine_fn(dfs_mapped)
  File "/content/gdrive/MyDrive/DynaGraph/HIRID/HIRID-ICU-Benchmark/icu_benchmarks/preprocessing/merge.py", line 242, in combine_obs_and_pharma_tables
    assert ((df_pid.iloc[:, 2:].notnull().sum(axis=1) == 0).sum() == 0)
AssertionError
"""

The above exception was

In [26]:
# Load one part of the raw observation tables for inspection.
# NOTE(review): presumably used to investigate the failed merge step — confirm.
obs1 = pd.read_parquet('Data/observation_tables/parquet/part-1.parquet')

In [27]:
# Display the loaded observation part.
obs1

Unnamed: 0,datetime,entertime,patientid,status,stringvalue,type,value,variableid
0,2122-10-01 13:55:00,2122-10-01 15:12:50.710,243,8,,,165.0,10000450
1,2122-10-01 13:55:00,2122-10-01 15:12:50.710,243,8,,,80.0,10000400
2,2122-10-01 14:00:00,2122-10-01 14:59:47.780,243,8,,,0.0,30005080
3,2122-10-01 14:00:00,2122-10-01 14:59:47.770,243,8,,,0.0,30005075
4,2122-10-01 14:00:00,2122-10-01 14:59:47.590,243,8,,,0.0,30005010
...,...,...,...,...,...,...,...,...
3682928,2154-12-27 15:35:00,2154-12-27 16:18:00.140,33365,8,31,F,31.0,24000160
3682929,2154-12-27 15:35:00,2154-12-28 01:33:25.496,33365,8,31,F,31.0,24000160
3682930,2154-12-27 15:35:00,2154-12-27 16:18:00.220,33365,8,341,F,341.0,24000170
3682931,2154-12-27 15:35:00,2154-12-28 01:33:25.556,33365,8,341,F,341.0,24000170
