<a href="https://colab.research.google.com/github/lqh52/Early-prediction-of-sepsis/blob/main/Feature_engineering_lstm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#library
import pandas as pd
import numpy as np

#load data
#NOTE(review): the path is absolute under /content/drive - presumably Google Drive must be mounted in Colab before this cell runs; confirm
#NOTE(review): unpickling can execute arbitrary code, so only load a data.pkl that comes from a trusted source
df = pd.read_pickle('/content/drive/MyDrive/Sepsis_predict/data.pkl')

In [None]:
#drop Unit2 because Unit1 and Unit2 are mutually exclusive
#drop ICULOS as it's basically just an index
cols_to_drop = ['Unit2', 'ICULOS']
#errors='ignore' keeps this cell re-runnable: df is reassigned here, so on a second
#run the columns are already gone and a plain drop would raise KeyError
df = df.drop(cols_to_drop, axis=1, errors='ignore')

#get data missing ratio: fraction (0-1) of NaN entries per column, stored as a
#plain {column: ratio} dict so the following cells can filter columns on it
print('Data Missing Percentage:')
missing_rate = df.isna().mean().to_dict()
missing_rate

Data Missing Percentage:


{'AST': 0.9837782640972004,
 'Age': 0.0,
 'Alkalinephos': 0.9839340751743378,
 'BUN': 0.9313484804878819,
 'BUN/CR': 0.9392896946424811,
 'BaseExcess': 0.945778389003601,
 'Bilirubin_direct': 0.9980748961956987,
 'Bilirubin_total': 0.985094288236328,
 'Calcium': 0.9411748799063588,
 'Chloride': 0.9545933041155371,
 'Creatinine': 0.9390463203153823,
 'DBP': 0.3136393282611033,
 'EtCO2': 0.9628912097442445,
 'FiO2': 0.9166481560531637,
 'Fibrinogen': 0.993404426966133,
 'Gender': 0.0,
 'Glucose': 0.8289741643343294,
 'HCO3': 0.9580945464191459,
 'HR': 0.09881963451357005,
 'Hct': 0.9114420057650099,
 'Hgb': 0.9261674402493493,
 'HospAdmTime': 5.150779409501771e-06,
 'Lactate': 0.9733060857102571,
 'MAP': 0.12450077680191969,
 'Magnesium': 0.9368971576067676,
 'O2Sat': 0.1306153958084245,
 'PTT': 0.9705465556416165,
 'PaCO2': 0.9443908978001665,
 'Phosphate': 0.9598651268411622,
 'Platelets': 0.9405941295279375,
 'Potassium': 0.9068880729195841,
 'Resp': 0.15352091184247887,
 'SBP': 0.145

In [None]:
#The LSTM is trained on continuous data with a missing rate below 0.16.
#First list every column that passes the missing-rate threshold (for inspection only)
low_missing_stats = [name for name, rate in missing_rate.items() if rate < 0.16]
print(low_missing_stats)

#Then keep just the continuous columns from that list as the LSTM input
to_LSTM = ['HR','O2Sat','SBP','MAP','Resp','sofa','ShockIndex','qsofa']

['HR', 'O2Sat', 'SBP', 'MAP', 'Resp', 'Age', 'Gender', 'HospAdmTime', 'SepsisLabel', 'patient', 'qsofa', 'sofa', 'SIRS', 'ShockIndex', 'sofa_det']


In [None]:
#The masking model is trained on every column with a missing rate >= 0.16,
#plus the discrete columns that were left out of the LSTM input
discrete_data = ['Age','Gender','HospAdmTime','SIRS','sofa_det']
high_missing_stats = [name for name, rate in missing_rate.items() if rate >= 0.16]
to_mask_model = high_missing_stats + discrete_data
to_mask_model

['Temp',
 'DBP',
 'EtCO2',
 'BaseExcess',
 'HCO3',
 'FiO2',
 'pH',
 'PaCO2',
 'SaO2',
 'AST',
 'BUN',
 'Alkalinephos',
 'Calcium',
 'Chloride',
 'Creatinine',
 'Bilirubin_direct',
 'Glucose',
 'Lactate',
 'Magnesium',
 'Phosphate',
 'Potassium',
 'Bilirubin_total',
 'TroponinI',
 'Hct',
 'Hgb',
 'PTT',
 'WBC',
 'Fibrinogen',
 'Platelets',
 'Unit1',
 'BUN/CR',
 'Age',
 'Gender',
 'HospAdmTime',
 'SIRS',
 'sofa_det']

In [None]:
#Randomly hold out N_TEST_PATIENTS patients as the test set; the remaining patients
#form the training set, whose mean and standard deviation are used for standardization
SPLIT_SEED = 42          #fixed seed so Restart & Run All reproduces the same split
N_TEST_PATIENTS = 8000   #number of patients held out for testing

all_patients = df['patient'].unique()
#with too few patients the slice below would silently produce an empty training set
assert len(all_patients) > N_TEST_PATIENTS, 'not enough patients to hold out the test set'

#shuffle in place with a private, seeded generator (global numpy RNG state is left alone)
np.random.RandomState(SPLIT_SEED).shuffle(all_patients)
patients_training_data = all_patients[:-N_TEST_PATIENTS]

#statistics come from training patients only; rows are 'mean' and 'std', columns are
#the numeric columns of df
df_mean_std = df[df['patient'].isin(patients_training_data)].describe().loc[['mean', 'std']]
df_mean_std

numpy.ndarray

In [None]:
#Build sliding-window samples for every patient, then split them into train/test pickles.
#Each sample (one row of windowed_df) holds:
#  - one column per stat in to_mask_model: the median of that stat over the window
#    (NaN when the stat was never measured inside the window)
#  - X_lstm : array of shape (win_len, len(to_LSTM)) with the standardized to_LSTM columns
#  - label  : 1 if SepsisLabel is truthy anywhere in the pred_len rows right after the window
win_len = 10    #rows of history per sample
pred_len = 1    #rows after the window that are used for the label
unscaled_mask_stats = ['Gender', 'Unit1', 'SIRS', 'sofa_det']   #kept on their raw scale

#training-set statistics, looked up once instead of inside every window
train_mean = df_mean_std.loc['mean']
train_std = df_mean_std.loc['std']

#loop through each patient at a time
save_count = 0
windowed_df_list = []
grouped_by_patient = df.groupby('patient')
for patient, group in grouped_by_patient:
    group = group.reset_index(drop=True)

    for stat in to_LSTM:
        #fill gaps inside the stay: backfill first, then forward-fill what is left at the end
        #NOTE(review): backfilling copies later measurements into earlier rows, so a window
        #can contain values recorded after its own prediction time - confirm this is intended
        filled = group[stat].bfill().ffill()

        #standardize with the training-set mean/std
        scaled = (filled - train_mean[stat]) / train_std[stat]

        #a stat that was never measured for this patient is still all-NaN here; impute the
        #training mean, which is 0 on the standardized scale (filling with the raw mean
        #would inject a large outlier into an otherwise z-scored column)
        group[stat] = scaled.fillna(0.0)

    #generate windows of win_len rows, predicting pred_len rows into the future;
    #patients with fewer than win_len + pred_len rows yield no windows
    windowed_data = []
    n_windows = len(group) - win_len - pred_len + 1
    for start in range(n_windows):
        stop = start + win_len
        tmp_data = group.iloc[start:stop]
        tmp_label = int(any(group['SepsisLabel'].iloc[stop:stop + pred_len]))

        #summarize each sparsely measured variable by its median over the window
        sample = {}
        for stat in to_mask_model:
            tmp_val = tmp_data[stat].median()
            if stat not in unscaled_mask_stats:
                tmp_val = (tmp_val - train_mean[stat]) / train_std[stat]
            sample[stat] = tmp_val

        #package it all into a dictionary
        sample['X_lstm'] = tmp_data[to_LSTM].values
        sample['label'] = tmp_label
        sample['patient'] = patient
        windowed_data.append(sample)

    #append the dataframe to the list of dataframes
    windowed_df_list.append(pd.DataFrame(windowed_data))

    #progress report every 2000 patients (nothing is written to disk at this point)
    if save_count % 2000 == 0:
        print('Saved: ', save_count)
    save_count += 1

#split by patient so that all windows of one patient land on the same side
windowed_df = pd.concat(windowed_df_list).reset_index(drop=True)
in_train = windowed_df['patient'].isin(patients_training_data)
train = windowed_df[in_train].drop('patient', axis=1)
test = windowed_df[~in_train].drop('patient', axis=1)
train.to_pickle('train.pkl')
test.to_pickle('test.pkl')


Saved:  0
Saved:  2000
Saved:  4000
Saved:  6000
Saved:  8000
Saved:  10000
Saved:  12000
Saved:  14000
Saved:  16000
Saved:  18000
Saved:  20000
Saved:  22000
Saved:  24000
Saved:  26000
Saved:  28000
Saved:  30000
Saved:  32000
Saved:  34000
Saved:  36000
Saved:  38000
Saved:  40000


In [None]:
#preview the training samples: one row per window, with the 'patient' column already dropped
train

Unnamed: 0,Temp,DBP,EtCO2,BaseExcess,HCO3,FiO2,pH,PaCO2,SaO2,AST,BUN,Alkalinephos,Calcium,Chloride,Creatinine,Bilirubin_direct,Glucose,Lactate,Magnesium,Phosphate,Potassium,Bilirubin_total,TroponinI,Hct,Hgb,PTT,WBC,Fibrinogen,Platelets,Unit1,BUN/CR,Age,Gender,HospAdmTime,SIRS,sofa_det,X_lstm,label
0,-1.124568,,,5.776292,,-0.022663,-0.255657,6.387048,,,,,,,,,,,,,,,,,,,,,,,,1.29122,0.0,0.376373,1.0,0.0,"[[0.7131288338642755, -0.7463472113735677, -1....",0
1,-0.436586,,,5.659349,,-0.022663,0.013438,5.629547,-0.41779,,,,,,,,,,,,,,,,,,,,,,,1.29122,0.0,0.376373,1.0,0.0,"[[0.7131288338642755, -0.7463472113735677, -1....",0
2,-0.436586,,,5.659349,,-0.022663,0.013438,5.629547,-0.41779,,,,,,,,,,,,,,,,,,,,,,,1.29122,0.0,0.376373,1.0,0.0,"[[0.25234028422231, 0.615055061604524, -0.0740...",0
3,-0.436586,,,5.659349,,-0.022663,0.013438,5.629547,-0.41779,,,,,,,,,,,,,,,,,,,,,,,1.29122,0.0,0.376373,1.0,0.0,"[[0.3099388529275557, -0.7463472113735677, -0....",0
4,-0.436586,,,5.542407,,-0.022663,0.282533,4.872046,-0.41779,,,,,,,,,,,,,,,,,,,,,,,1.29122,0.0,0.376373,1.0,0.0,"[[1.0587202460957497, -2.958625904962967, -0.0...",0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1150490,-0.488509,0.228372,,,,,,,,,,,,,,,0.840298,,,,,,,,,,,,,,,0.00040,0.0,0.376579,0.0,0.0,"[[-0.1508496967144099, 0.615055061604524, 0.31...",0
1150491,-0.488509,0.120736,,,,,,,,,,,,,,,0.840298,,,,,,,,,,,,,,,0.00040,0.0,0.376579,0.0,0.0,"[[-0.3812439715353927, 0.615055061604524, 0.91...",0
1150492,-0.618317,0.120736,,,,,,,,,,,,,,,0.859784,,,,,,,,,,,,,,,0.00040,0.0,0.376579,0.0,0.0,"[[-0.3812439715353927, 0.615055061604524, 0.31...",0
1150493,-0.618317,0.120736,,,,,,,,,,,,,,,0.859784,,,,,,,,,,,,,,,0.00040,0.0,0.376579,0.0,0.0,"[[-0.49644110894588406, 0.274704493360001, 0.2...",0
