# Lab

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import random
random.seed(1) 
np.random.seed(1)

### Load Data

In [2]:
file_path = r"E:\EICU\eicu-collaborative-research-database-2.0\lab.csv.gz"
lab_df = pd.read_csv(file_path, compression="gzip", low_memory=False,)

In [3]:
lab_df.head()

Unnamed: 0,labid,patientunitstayid,labresultoffset,labtypeid,labname,labresult,labresulttext,labmeasurenamesystem,labmeasurenameinterface,labresultrevisedoffset
0,52307161,141168,2026,3,fibrinogen,177.0,177.0,mg/dL,mg/dL,2219
1,50363251,141168,1133,3,PT - INR,2.5,2.5,ratio,,1208
2,49149139,141168,2026,1,magnesium,2.0,2.0,mg/dL,mg/dL,2090
3,50363250,141168,1133,3,PT,26.6,26.6,sec,sec,1208
4,66695374,141168,2141,7,pH,7.2,7.2,,Units,2155


In [4]:
len(lab_df)

39132531

In [5]:
lab_df['lab_units'] = lab_df['labmeasurenamesystem'].fillna(lab_df['labmeasurenameinterface'])
lab_df = lab_df.drop(columns=['labmeasurenamesystem', 'labmeasurenameinterface'])

In [6]:
lab_df = lab_df.drop(columns=['labid', 'labresultrevisedoffset', 'labresulttext'])

In [7]:
lab_df.isnull().sum()

patientunitstayid         0
labresultoffset           0
labtypeid                 0
labname                   0
labresult            229518
lab_units            901554
dtype: int64

In [8]:
lab_df.head()

Unnamed: 0,patientunitstayid,labresultoffset,labtypeid,labname,labresult,lab_units
0,141168,2026,3,fibrinogen,177.0,mg/dL
1,141168,1133,3,PT - INR,2.5,ratio
2,141168,2026,1,magnesium,2.0,mg/dL
3,141168,1133,3,PT,26.6,sec
4,141168,2141,7,pH,7.2,Units


In [9]:
lab_df.labname.value_counts()

labname
bedside glucose         3175835
potassium               1493261
sodium                  1393205
glucose                 1319496
Hgb                     1298708
                         ...   
HSV 1&2 IgG AB titer         12
NAPA                         10
Procainamide                 10
HIV 1&2 AB                    7
RPR titer                     3
Name: count, Length: 158, dtype: int64

In [10]:
lab_df.lab_units.value_counts()

lab_units
mg/dL       10280249
mmol/L       6662690
%            6348999
g/dL         3342580
K/mcL        2284440
              ...   
SG                 2
OD Ratio           2
dils               2
IU                 1
IV                 1
Name: count, Length: 104, dtype: int64

In [11]:
lab_df.labtypeid.value_counts()

labtypeid
1    16357492
3    15363671
7     3728154
4     3563830
2      119319
6          65
Name: count, dtype: int64

In [12]:
selected_labs = ['total bilirubin', 'Hgb', 'WBC x 1000', 'platelets x 1000','albumin', 'creatinine', 'anion gap','pH']
filtered_lab_df = lab_df[lab_df['labname'].isin(selected_labs)]


In [13]:
filtered_lab_df.head(50)

Unnamed: 0,patientunitstayid,labresultoffset,labtypeid,labname,labresult,lab_units
4,141168,2141,7,pH,7.2,Units
29,141168,1133,3,platelets x 1000,213.0,K/mcL
31,141168,2026,1,anion gap,25.0,mmol/L
48,141168,2010,7,pH,7.14,Units
51,141168,2026,7,pH,7.16,Units
52,141168,2026,3,WBC x 1000,19.8,K/mcL
53,141168,1805,7,pH,7.14,Units
54,141168,2026,1,creatinine,2.95,mg/dL
56,141168,2026,3,Hgb,11.4,g/dL
66,141168,2026,1,total bilirubin,5.2,mg/dL


In [14]:
filtered_lab_df.loc[filtered_lab_df['labname'] == 'creatinine', 'labresult'] *= 88.4
filtered_lab_df.loc[filtered_lab_df['labname'] == 'total bilirubin', 'labresult'] *= 17.1

filtered_lab_df.loc[filtered_lab_df['labname'] == 'creatinine', 'lab_units'] = 'umol/L'
filtered_lab_df.loc[filtered_lab_df['labname'] == 'total bilirubin', 'lab_units'] = 'umol/L'



In [15]:
filtered_lab_df[(filtered_lab_df['labname'] == 'creatinine') | (filtered_lab_df['labname'] == 'total bilirubin')].head()

Unnamed: 0,patientunitstayid,labresultoffset,labtypeid,labname,labresult,lab_units
54,141168,2026,1,creatinine,260.78,umol/L
66,141168,2026,1,total bilirubin,88.92,umol/L
91,141168,1133,1,creatinine,203.32,umol/L
102,141168,516,1,total bilirubin,44.46,umol/L
104,141168,1133,1,total bilirubin,70.11,umol/L


In [16]:
lab_stats = filtered_lab_df[filtered_lab_df['labname'].isin(['platelets x 1000', 'total bilirubin', 'creatinine'])] \
    .groupby('labname')['labresult'] \
    .agg(['min', 'max', 'mean', 'median', 'std', 'count'])


In [17]:
lab_stats

Unnamed: 0_level_0,min,max,mean,median,std,count
labname,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
creatinine,5.304,11403.6,137.311018,88.4,144.724229,1275796
platelets x 1000,0.0,3092.0,215.752286,197.0,120.896298,1139619
total bilirubin,0.0,1692.9,26.646057,10.26,61.326035,425009


In [18]:
lab_df_wide = filtered_lab_df.copy()
lab_df_wide = lab_df_wide.drop(columns=['labresultoffset', 'labtypeid', 'lab_units'])
lab_df_wide = lab_df_wide.pivot_table(index='patientunitstayid', columns='labname', values='labresult', aggfunc='first')


lab_df_wide = lab_df_wide.reset_index()



In [20]:
lab_df_wide.head()

labname,patientunitstayid,Hgb,WBC x 1000,albumin,anion gap,creatinine,pH,platelets x 1000,total bilirubin
0,141168,11.4,19.8,3.0,25.0,260.78,7.2,213.0,88.92
1,141178,15.5,7.6,4.0,17.0,61.88,,273.0,6.84
2,141179,12.5,8.1,,9.0,61.88,,219.0,
3,141194,8.4,4.4,2.3,11.0,154.7,7.31,139.0,6.84
4,141196,10.9,14.2,2.5,11.0,73.372,7.43,504.0,5.13


In [21]:
len(lab_df_wide)

192730

In [22]:
lab_df_wide['patientunitstayid'].nunique()

192730

In [23]:
lab_df_wide.isnull().sum()

labname
patientunitstayid         0
Hgb                    3718
WBC x 1000             3570
albumin               49500
anion gap             40937
creatinine             2340
pH                   108978
platelets x 1000       6136
total bilirubin       57717
dtype: int64

In [24]:
lab_df_wide.to_csv("processed_lab.csv", index=False)