In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import math
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.model_selection import train_test_split, StratifiedKFold

In [None]:
# Load the raw table and discard the columns this notebook does not use.
unused_columns = ['sepsis_time', 'intime', 'outtime', 'gender']
df = pd.read_csv('table.csv', low_memory=False)
df = df.drop(columns=unused_columns)
print(len(df))

# Keep only the ICU stays that have at least 12 non-null `hr` values.
rows_by_stay = df.groupby('icustay_id')
df = rows_by_stay.filter(lambda stay: stay['hr'].count() >= 12)
print(len(df))

In [None]:
# Take the per-stay maximum of `sepsis_onset`, then count how many stays have each value.
df.groupby('icustay_id')['sepsis_onset'].max().value_counts()

In [None]:
# Display the column labels of `df` as it stands after loading and filtering.
df.columns

In [None]:
## Make sure every septic stay has a row at hr == sepsis4_or_discharge_hr.
## Septic stays whose largest observed hr differs from that hour get one extra row
## placed at that hour; the row carries only icustay_id and hr, everything else is NaN.

septic_by_stay = df[df.sepsis_onset == 1].groupby('icustay_id')
target_hr = septic_by_stay.sepsis4_or_discharge_hr.max()
lacks_last_obs = (septic_by_stay.hr.max() != target_hr)
max_obs = target_hr[lacks_last_obs].reset_index().rename({'sepsis4_or_discharge_hr':'hr'}, axis = 1)

# Empty frame with df's columns and dtypes, included in the concat below.
df_ = pd.DataFrame(columns = df.columns).astype(df.dtypes)
df = pd.concat([max_obs, df_, df]).set_index(['icustay_id', 'hr']).sort_index()

# Fill the per-stay columns of the inserted rows from earlier rows of the SAME stay.
# Grouping by stay prevents a value from the previous icustay_id leaking into a stay
# whose first row is NaN, which a plain frame-wide ffill on the sorted index would do.
static_columns = ['admission_age', 'sepsis_onset', 'sepsis4_or_discharge_hr']
df[static_columns] = df.groupby(level = 'icustay_id')[static_columns].ffill()
df.head()

In [None]:
# Ids of the stays that have at least one non-null `ckbe_embedding` value.
notes_per_stay = df.groupby(level = 'icustay_id')['ckbe_embedding'].count()
icustays_with_notes = notes_per_stay[notes_per_stay > 0].index
icustays_with_notes

In [16]:
# Keep only the rows that belong to a stay listed in `icustays_with_notes`.
stay_of_row = df.index.get_level_values('icustay_id')
df = df.loc[stay_of_row.isin(icustays_with_notes)]

In [None]:
# Label column: 1 on the row whose hr (index level 1) equals the stay's
# sepsis4_or_discharge_hr, 0 on every other row.
# NOTE(review): the comparison does not consult sepsis_onset, so any stay with a row
# at that hour gets a 1 there. Presumably non-septic stays are handled downstream
# via is_septic; confirm.
hr_of_row = df.index.get_level_values(level = 1)
at_target_hr = (hr_of_row == df['sepsis4_or_discharge_hr'])
df['sepsis_in_4_hours'] = at_target_hr.map(int)

# Preview only: the result is not assigned, so both columns stay in `df`
# (sepsis_onset is still read further down the notebook).
df.drop(columns = ['sepsis_onset', 'sepsis4_or_discharge_hr'])

In [None]:
def _expand_embedding(column, suffix, dim = 768):
    """Expand a column of stringified embedding vectors into `dim` numeric columns.

    Parameters
    ----------
    column : pd.Series
        Strings of the form '[v0, v1, ...]'; missing entries are NaN.
    suffix : str
        Appended to the position number to build each output column name.
    dim : int
        Number of output columns (one per vector component).

    Returns
    -------
    pd.DataFrame
        Float columns '0<suffix>' ... '<dim-1><suffix>' on `column`'s index.
        Rows where `column` was NaN are NaN in every output column.
    """
    present = column.dropna()
    pieces = [text.strip('[]').split(',') for text in present]
    names = [str(i) + suffix for i in range(dim)]
    expanded = pd.DataFrame(pieces, columns = names, index = present.index)
    # split(',') yields text fragments (with leading spaces); convert them to floats so
    # the columns are numeric rather than object dtype holding strings.
    return expanded.astype(float).reindex(column.index)


clinical_kb_bert = _expand_embedding(df.ckbe_embedding, '_clinical_kb_bert')
clinical_bert = _expand_embedding(df.cbe_embedding, '_clinical_bert')
df = pd.concat([df, clinical_kb_bert, clinical_bert], axis = 1)
# The expanded frames are now part of `df`; release the standalone copies.
del clinical_kb_bert, clinical_bert
df = df.drop(['ckbe_embedding', 'cbe_embedding'], axis = 1)
df.head()

In [19]:
# Per-stay maximum of `sepsis_onset`, and the ids of the stays where that maximum is 1.
is_septic = df.groupby('icustay_id')['sepsis_onset'].max()
septic_mask = (is_septic == 1)
septic_icu_ids = is_septic[septic_mask].index

In [20]:
# Distinct icustay_id values present in the index of `df`.
stay_level = df.index.get_level_values(level = 'icustay_id')
icu_uniques = stay_level.unique()

In [None]:
# Write one CSV per ICU stay into the 'inputs' directory, creating it if absent.
inputs_dir_exists = os.path.exists('inputs')
if not inputs_dir_exists:
    os.mkdir('inputs')

stay_level = df.index.get_level_values(level = 'icustay_id')
icu_uniques = stay_level.unique()

for icu in tqdm(icu_uniques):
    # Select all rows of this stay via the first index level.
    stay_rows = df.loc[(icu,)]
    stay_rows.to_csv('inputs/%i.csv'%icu)

In [21]:
# Output directory for the id lists written below (no error if it already exists).
os.makedirs('ids', exist_ok = True)

# Columns over which the mean / std statistics are computed later in the notebook.
columns = ['heartrate', 'spo2', 'tempc', 'sysbp', 'meanbp', 'diasbp', 'resprate',
       'baseexcess', 'bicarbonate', 'fio2', 'pf', 'pco2', 'so2', 'bun',
       'calcium', 'chloride', 'creatinine', 'glucose', 'lactate', 'potassium',
       'bilirubin_total', 'hct', 'hgb', 'ptt', 'wbc', 'platelets', 'admission_age']

# 80/20 split of the stays, stratified on the per-stay label, with a fixed seed.
train_labels, test_labels = train_test_split(is_septic, train_size=0.8, random_state=5521, stratify=is_septic)
train_idx = train_labels.index
test_idx = test_labels.index

# One stay id per line. The loop variable is `stay_id`, not `id`, so the builtin
# id() is not shadowed in the notebook namespace.
with open('ids/train_ids.txt', 'w') as fp:
    fp.writelines(str(stay_id) + '\n' for stay_id in train_idx)

with open('ids/test_ids.txt', 'w') as fp:
    fp.writelines(str(stay_id) + '\n' for stay_id in test_idx)

# '<stay id> <label>' per line, for every stay in is_septic.
with open('ids/is_septic.txt', 'w') as fp:
    for stay_id, label in is_septic.items():
        fp.write(str(stay_id) + ' ' + str(label) + '\n')

In [22]:
# Build stratified cross-validation folds over the TRAINING stays only.
# folds[k] is a tuple (train stay ids, held-out stay ids) of numpy arrays.
n_folds = 8
folds = np.empty(n_folds, dtype = object)

train_septic = is_septic[train_idx]
splitter = StratifiedKFold(n_folds, shuffle=True, random_state=5521)
for i_split, (fit_pos, held_out_pos) in enumerate(splitter.split(X = train_septic.index, y = train_septic)):
    # split() returns POSITIONS within the data it was given (the training subset), so
    # they must be mapped back through the training index. Indexing the full
    # is_septic.index with them would pick unrelated stays, including test-set stays.
    folds[i_split] = (np.array(train_septic.index[fit_pos]), np.array(train_septic.index[held_out_pos]))

In [23]:
def _feature_stats(stay_ids):
    """Return {'mean', 'std'} of the `columns` features over rows whose icustay_id is in stay_ids."""
    in_selection = df.index.get_level_values('icustay_id').isin(stay_ids)
    selected = df.loc[in_selection, columns]
    return {'mean': selected.mean(), 'std': selected.std()}


# Keys 0..n_folds-1 hold the statistics of each fold's training part;
# key -1 holds the statistics of the whole training split.
stats_per_fold = {fold: _feature_stats(folds[fold][0]) for fold in range(n_folds)}
stats_per_fold[-1] = _feature_stats(train_idx)

In [None]:
# For every fold, write its train / test stay ids (one per line) and the
# mean / std statistics computed on the fold's training part.
for i in tqdm(range(n_folds)):
    fold_train_idx, fold_test_idx = folds[i]

    # The loop variable is `stay_id`, not `id`, so the builtin id() is not shadowed.
    with open('ids/fold_%i_train.txt'%i, 'w') as fp:
        fp.writelines(str(stay_id) + '\n' for stay_id in fold_train_idx)
    with open('ids/fold_%i_test.txt'%i, 'w') as fp:
        fp.writelines(str(stay_id) + '\n' for stay_id in fold_test_idx)

    stats_per_fold[i]['mean'].to_csv('ids/fold_%i_means.csv'%i, header = False)
    stats_per_fold[i]['std'].to_csv('ids/fold_%i_stds.csv'%i, header = False)

# Statistics of the full training split are stored under key -1.
stats_per_fold[-1]['mean'].to_csv('ids/train_means.csv', header = False)
stats_per_fold[-1]['std'].to_csv('ids/train_stds.csv', header = False)

In [None]:
# Column labels of the two embedding families, selected by the name fragment they contain.
is_clinical_bert = df.columns.str.contains('_clinical_bert')
is_clinical_kb_bert = df.columns.str.contains('_clinical_kb_bert')
clinical_bert_columns = df.columns[is_clinical_bert]
clinical_kb_bert_columns = df.columns[is_clinical_kb_bert]