In [1]:
import pandas as pd

eicu = pd.read_csv("eicu_dataset.csv")
eicu.head()

Unnamed: 0,patientunitstayid,hospitalid,death,acetamin,biotene,compazine,ferrous,imdur,lidocaine,milk of magnesia,...,sex_is_male,sex_is_female,< 30,30 - 39,40 - 49,50 - 59,60 - 69,70 - 79,80 - 89,> 89
0,141168.0,59.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,141178.0,60.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,141179.0,60.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,141194.0,73.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,141196.0,67.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [2]:
lab = pd.read_csv("data/eicu/lab.csv")

In [3]:
lab.head()

Unnamed: 0,labid,patientunitstayid,labresultoffset,labtypeid,labname,labresult,labresulttext,labmeasurenamesystem,labmeasurenameinterface,labresultrevisedoffset
0,52307161,141168,2026,3,fibrinogen,177.0,177.0,mg/dL,mg/dL,2219
1,50363251,141168,1133,3,PT - INR,2.5,2.5,ratio,,1208
2,49149139,141168,2026,1,magnesium,2.0,2.0,mg/dL,mg/dL,2090
3,50363250,141168,1133,3,PT,26.6,26.6,sec,sec,1208
4,66695374,141168,2141,7,pH,7.2,7.2,,Units,2155


In [4]:
# key: lab_name, value: lab_name in eicu
lab_name_mapping = {
    "o2sat": "O2 Sat (%)",
    "pao2": "paO2",
    "paco2": "paCO2",
    "ph": "pH",
    "albu_lab": "albumin",
    "bands": "-bands",
    "bun": "BUN",
    "hct": "Hct",
    "inr": "PT - INR",
    "lactate": "lactate",
    "platelets": "platelets x 1000",
    "wbc": "WBC x 1000"
}

In [5]:
lab_name_mapping_values = list(lab_name_mapping.values())
unique_lab_names = lab['labname'].unique()
for value in lab_name_mapping_values:
    print(value, value in unique_lab_names)
    print()

O2 Sat (%) True

paO2 True

paCO2 True

pH True

albumin True

-bands True

BUN True

Hct True

PT - INR True

lactate True

platelets x 1000 True

WBC x 1000 True


In [6]:
selected_lab = lab[lab['labname'].isin(lab_name_mapping_values)].copy()

for value in lab_name_mapping_values:

    lab_values = selected_lab[selected_lab['labname'] == value]['labresult']
    
    # Filter out outliers using IQR
    Q1 = lab_values.quantile(0.25)
    Q3 = lab_values.quantile(0.75)
    IQR = Q3 - Q1

    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    lab_values_adjusted = lab_values.clip(lower_bound, upper_bound)
    
    mean_value = lab_values_adjusted.mean()
    std = lab_values_adjusted.std()
    
    # Normalize lab values using standard normal distribution
    lab_values_normalized = (lab_values_adjusted - mean_value) / std
    
    selected_lab.loc[selected_lab['labname'] == value, 'labresult'] = lab_values_normalized

In [7]:
eicu.columns

Index(['patientunitstayid', 'hospitalid', 'death', 'acetamin', 'biotene',
       'compazine', 'ferrous', 'imdur', 'lidocaine', 'milk of magnesia',
       ...
       'sex_is_male', 'sex_is_female', '< 30', '30 - 39', '40 - 49', '50 - 59',
       '60 - 69', '70 - 79', '80 - 89', '> 89'],
      dtype='object', length=259)

In [8]:
eicu_columns = list(eicu.columns) + list(lab_name_mapping.keys())

In [9]:
lab

Unnamed: 0,labid,patientunitstayid,labresultoffset,labtypeid,labname,labresult,labresulttext,labmeasurenamesystem,labmeasurenameinterface,labresultrevisedoffset
0,52307161,141168,2026,3,fibrinogen,177.00,177,mg/dL,mg/dL,2219
1,50363251,141168,1133,3,PT - INR,2.50,2.5,ratio,,1208
2,49149139,141168,2026,1,magnesium,2.00,2.0,mg/dL,mg/dL,2090
3,50363250,141168,1133,3,PT,26.60,26.6,sec,sec,1208
4,66695374,141168,2141,7,pH,7.20,7.20,,Units,2155
...,...,...,...,...,...,...,...,...,...,...
39132526,824772675,3353263,-7,3,WBC x 1000,6.40,6.4,K/mcL,K/uL,6
39132527,826470517,3353263,1733,3,RBC,4.67,4.67,M/mcL,M/uL,1774
39132528,824772678,3353263,-7,3,-monos,10.00,10,%,%,6
39132529,826470516,3353263,1733,3,WBC x 1000,6.60,6.6,K/mcL,K/uL,1774


In [10]:
from tqdm import tqdm

eicu_lab = pd.DataFrame(columns=eicu_columns)

bin_time = 12 * 60

for index, row in tqdm(eicu.iterrows(), total=eicu.shape[0]):
    
    eicu_row = pd.DataFrame(0.0, index=[0], columns=eicu_columns)
    
    # copy row values to eicu_row
    for column in eicu.columns:
        eicu_row[column] = row[column]
    
    # get patient lab data
    patient_id = row['patientunitstayid']
    lab_patient = selected_lab[selected_lab['patientunitstayid'] == patient_id]
    
    for lab_name, eicu_lab_name in lab_name_mapping.items():
        lab_value = lab_patient[lab_patient['labname'] == eicu_lab_name]['labresult']
        
        # average lab values
        if lab_value.shape[0] > 0:
            eicu_row[lab_name] = lab_value.mean()   
        
    eicu_lab = pd.concat([eicu_lab, eicu_row], ignore_index=True)

eicu_lab.head()

  eicu_lab = pd.concat([eicu_lab, eicu_row], ignore_index=True)
  0%|          | 10/200859 [00:00<1:20:22, 41.65it/s]


Unnamed: 0,patientunitstayid,hospitalid,death,acetamin,biotene,compazine,ferrous,imdur,lidocaine,milk of magnesia,...,paco2,ph,albu_lab,bands,bun,hct,inr,lactate,platelets,wbc
0,141168.0,59.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.274784,-2.192923,0.483001,0.0,0.019298,1.176835,1.399485,2.210935,-0.079049,0.703849
1,141178.0,60.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.680403,0.0,-0.872351,1.883082,0.0,0.0,0.58375,-0.667376
2,141179.0,60.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,-0.472018,0.746946,0.0,0.0,0.072448,-0.571709
3,141194.0,73.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.483424,-0.620623,-0.184781,0.0,0.020598,-0.809869,-0.764793,-0.534791,-0.197406,-0.60615
4,141196.0,67.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.217625,0.637216,-0.392024,0.0,-0.381034,0.265879,0.0,-0.953986,2.392245,1.112027


In [ ]:
eicu_lab.to_csv("eicu_drug_lab.csv", index=False)