<a href="https://colab.research.google.com/github/lqh52/Early-prediction-of-sepsis/blob/main/Feature_engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#library
import pandas as pd
import numpy as np

#load the input table from its pickle file
#NOTE(review): unpickling can execute arbitrary code -- only read files you trust
input_pickle_path = '/content/drive/MyDrive/Sepsis_predict/Pickle_input/input_added.pkl'
df = pd.read_pickle(input_pickle_path)

In [None]:
#drop Unit2 because Unit1 and Unit2 are mutually exclusive
#drop ICULOS as it's basically just an index
cols_to_drop = ['Unit2', 'ICULOS']
#errors='ignore' keeps this cell re-runnable: df is reassigned here, so on a
#second run the columns are already gone and a plain drop would raise KeyError
df = df.drop(columns=cols_to_drop, errors='ignore')

#get data missing ratio: fraction of NaN rows per column
#(0.0 = fully observed, 1.0 = entirely missing), stored as {column: fraction}
print('Data Missing Percentage:')
missing_rate = df.isna().mean().to_dict()
missing_rate

Data Missing Percentage:


{'AST': 0.9822784810126582,
 'Age': 0.0,
 'Alkalinephos': 0.9822784810126582,
 'BUN': 0.9367088607594937,
 'BUN/CR': 0.9367088607594937,
 'BaseExcess': 1.0,
 'Bilirubin_direct': 0.9873417721518988,
 'Bilirubin_total': 0.9822784810126582,
 'Calcium': 0.9189873417721519,
 'Chloride': 0.9873417721518988,
 'Creatinine': 0.9367088607594937,
 'DBP': 0.060759493670886074,
 'EtCO2': 0.9215189873417722,
 'FiO2': 0.9746835443037974,
 'Fibrinogen': 0.9873417721518988,
 'Gender': 0.0,
 'Glucose': 0.7189873417721518,
 'HCO3': 1.0,
 'HR': 0.06329113924050633,
 'Hct': 0.9443037974683545,
 'Hgb': 0.9443037974683545,
 'HospAdmTime': 0.0,
 'Lactate': 0.9822784810126582,
 'MAP': 0.05822784810126582,
 'Magnesium': 0.9392405063291139,
 'O2Sat': 0.08354430379746836,
 'PTT': 0.9848101265822785,
 'PaCO2': 0.9772151898734177,
 'Phosphate': 0.9544303797468354,
 'Platelets': 0.9443037974683545,
 'Potassium': 0.9164556962025316,
 'Resp': 0.18227848101265823,
 'SBP': 0.060759493670886074,
 'SIRS': 0.0,
 'SaO2': 0.

In [None]:
#Candidate LSTM inputs: every column with a missing rate below 0.16.
#This list is printed for inspection only; it is not what the model uses.
low_missing_stats = [stat for stat, rate in missing_rate.items() if rate < 0.16]
print(low_missing_stats)

#Hand-picked continuous columns that actually feed the LSTM.
#NOTE(review): 'Resp' is listed here although it is absent from the printed
#"< 0.16" candidates, and 'DBP' is a candidate but is left out -- confirm intent.
to_LSTM = ['HR','O2Sat','SBP','MAP','Resp','sofa','ShockIndex','qsofa']

['HR', 'O2Sat', 'SBP', 'MAP', 'DBP', 'Age', 'Gender', 'HospAdmTime', 'SepsisLabel', 'patient', 'qsofa', 'sofa', 'SIRS', 'ShockIndex', 'mews', 'sofa_det']


In [None]:
#Masking-model inputs: every column with a missing rate of 0.16 or more,
#followed by the discrete columns listed below.
discrete_data = ['Age','Gender','HospAdmTime','SIRS','sofa_det']
high_missing_stats = [stat for stat, rate in missing_rate.items() if rate >= 0.16]
to_mask_model = high_missing_stats + discrete_data
to_mask_model

['Temp',
 'Resp',
 'EtCO2',
 'BaseExcess',
 'HCO3',
 'FiO2',
 'pH',
 'PaCO2',
 'SaO2',
 'AST',
 'BUN',
 'Alkalinephos',
 'Calcium',
 'Chloride',
 'Creatinine',
 'Bilirubin_direct',
 'Glucose',
 'Lactate',
 'Magnesium',
 'Phosphate',
 'Potassium',
 'Bilirubin_total',
 'TroponinI',
 'Hct',
 'Hgb',
 'PTT',
 'WBC',
 'Fibrinogen',
 'Platelets',
 'Unit1',
 'BUN/CR',
 'Age',
 'Gender',
 'HospAdmTime',
 'SIRS',
 'sofa_det']

In [None]:
#Two-row table (index 'mean' and 'std') with one column per column that
#describe() summarizes; used further down to standardize features.
summary_stats = df.describe()
df_mean_std = summary_stats.loc[['mean', 'std']]

In [None]:
#Build sliding-window samples, one patient at a time.
#Each sample (one row of the final table) holds:
#  - one value per column in to_mask_model: the median of that column over the
#    window, standardized unless the column is listed in unscaled_stats
#  - 'X_lstm': the window's rows of the to_LSTM columns (gap-filled, standardized)
#  - 'label': 1 if any SepsisLabel in the rows right after the window is truthy
#  - 'patient': the groupby key of the patient the window came from
win_len = 10     #rows per input window
pred_len = 1     #rows after the window that decide the label
unscaled_stats = ['Gender', 'Unit1', 'SIRS', 'sofa_det']   #kept on their original scale

#look the statistics up once instead of on every window
col_mean = df_mean_std.loc['mean']
col_std = df_mean_std.loc['std']

save_count = 0
windowed_df_list = []
grouped_by_patient = df.groupby('patient')
for patient, group in grouped_by_patient:
    #raw rows of this patient; used for the label and the mask-model medians
    group = group.reset_index(drop=True)

    #separate copy for the LSTM inputs so filling/standardizing them does not
    #leak into the mask-model features (a column may appear in both lists;
    #reading the processed copy would standardize such a column twice)
    lstm_group = group.copy()
    for stat in to_LSTM:
        #fill gaps inside this patient: backward fill first, then forward fill
        #(.bfill()/.ffill() replace the deprecated fillna(method=...))
        filled = lstm_group[stat].bfill().ffill()
        #standardize the data
        scaled = (filled - col_mean[stat]) / col_std[stat]
        #anything still missing is set to the mean, which is 0 on the
        #standardized scale (filling in the raw mean here would mix scales)
        lstm_group[stat] = scaled.fillna(0.0)

    #generate windows of win_len rows, labelled from the next pred_len rows
    windowed_data = []
    N = len(group)
    for i in range(N - win_len - pred_len + 1):   #empty when the patient has too few rows
        tmp_data = group.iloc[i:i+win_len]
        tmp_label = group.iloc[i+win_len:i+win_len+pred_len]

        #one summary value per mask-model column, taken from the raw window
        tmp_dict = {}
        for stat in to_mask_model:
            tmp_val = tmp_data[stat].median()
            if stat not in unscaled_stats:
                tmp_val = (tmp_val - col_mean[stat]) / col_std[stat]
            tmp_dict[stat] = tmp_val

        #package it all into a dictionary
        tmp_dict['X_lstm'] = lstm_group.iloc[i:i+win_len][to_LSTM].values
        tmp_dict['label'] = int(any(tmp_label['SepsisLabel']))
        tmp_dict['patient'] = patient
        windowed_data.append(tmp_dict)

    #append the dataframe to the list of dataframes
    windowed_data_df = pd.DataFrame(windowed_data)
    windowed_df_list.append(windowed_data_df)
    #progress marker only: nothing is written to disk until after the loop
    if save_count % 2000 == 0:
        print('Saved: ',save_count)

    save_count += 1

windowed_df = pd.concat(windowed_df_list).reset_index(drop=True)
train = windowed_df
train.to_pickle('/content/drive/MyDrive/Sepsis_predict/Pickle_input/input_lstm.pkl')

Saved:  0


In [None]:
#display the windowed table built and pickled by the loop above the output
train

Unnamed: 0,Temp,Resp,EtCO2,BaseExcess,HCO3,FiO2,pH,PaCO2,SaO2,AST,BUN,Alkalinephos,Calcium,Chloride,Creatinine,Bilirubin_direct,Glucose,Lactate,Magnesium,Phosphate,Potassium,Bilirubin_total,TroponinI,Hct,Hgb,PTT,WBC,Fibrinogen,Platelets,Unit1,BUN/CR,Age,Gender,HospAdmTime,SIRS,sofa_det,X_lstm,label,patient
0,-1.665209,-4.669111,0.265261,,,-0.316228,0.073287,-1.076121,,,1.147141,,-1.842960,-0.447214,0.857163,,-0.198343,0.995440,1.403246,0.099409,1.253872,,,-0.222485,-0.237878,,0.168185,,-0.819219,0.0,0.128984,-0.021247,1.0,-1.208782,0.0,0.0,"[[-0.24102904670989783, -0.24621676008744917, ...",0,100008
1,-1.711118,-4.619310,0.226104,,,-0.316228,0.238182,-0.934526,,,1.147141,,-1.839631,-0.447214,0.857163,,-0.208918,0.814451,1.403246,0.099409,1.127874,,,-0.222485,-0.237878,,0.168185,,-0.819219,0.0,0.128984,-0.021247,1.0,-1.208782,0.0,0.0,"[[-0.24102904670989783, -0.24621676008744917, ...",0,100008
2,-1.757026,-4.619310,0.186946,,,-0.316228,0.238182,-0.934526,,,1.147141,,-1.839631,-0.447214,0.857163,,-0.219492,0.814451,1.403246,0.099409,1.127874,,,-0.222485,-0.237878,,0.168185,,-0.819219,0.0,0.128984,-0.021247,1.0,-1.208782,0.0,0.0,"[[0.049461986784116056, 0.431949837805588, 0.4...",0,100008
3,-1.757026,-4.619310,0.186946,,,-0.316228,0.238182,-0.934526,,,1.147141,,-1.827979,-0.447214,0.931398,,-0.367536,0.814451,1.403246,0.099409,1.001876,,,-0.707963,-0.707073,,-0.272299,,-1.011951,0.0,0.005723,-0.021247,1.0,-1.208782,0.0,0.0,"[[0.39805122697693274, 0.04442606758099533, -0...",0,100008
4,-1.802934,-4.619310,0.186946,,,-0.316228,0.073287,-0.792932,,,1.147141,,-1.827979,-0.447214,1.005633,,-0.536728,0.633462,1.403246,,0.938877,,,-1.193441,-1.176267,,-0.712784,,-1.204683,0.0,-0.117538,-0.021247,1.0,-1.208782,0.0,0.0,"[[0.6013949504227425, 0.23818795269329165, -1....",0,100008
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
280,-0.747049,-4.884915,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-1.044748,1.0,0.555488,0.0,0.0,"[[0.6885422604709467, 0.1413070101371435, 1.31...",0,100172
281,-0.563417,-4.851714,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-1.044748,1.0,0.555488,0.0,0.0,"[[0.049461986784116056, 0.431949837805588, 0.1...",0,100172
282,-0.747049,-4.851714,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-1.044748,1.0,0.555488,0.0,0.0,"[[0.6304440537721439, 0.431949837805588, 0.279...",0,100172
283,-0.747049,-4.851714,,,,,,,,,,,,,,,-0.219492,,,,,,,,,,,,,,,-1.044748,1.0,0.555488,0.0,0.0,"[[0.16565840018172162, 0.23818795269329165, 0....",0,100172
