In [1]:
import pandas as pd
import numpy as np
from datetime import timedelta

import warnings
warnings.filterwarnings("ignore")

## General functions

In [2]:
def createTimeDataFrame(df_to_work, frecuencySample, featureToCreateTimeSteps):
    '''
    This function create a temporal dataframe
    with rows in based on the number of time step that we define. 
    '''

    expanded_df = pd.DataFrame(columns=df_to_work.columns)
    expanded_df['timeStep'] = 0

    for index, row in df_to_work.iterrows():
        admission_date = row['unitadmittime24']
        discharge_offset = int(np.ceil(row[featureToCreateTimeSteps]/frecuencySample))

        for minute in range(discharge_offset):
            new_row = row.copy()  
            new_row['timeStep'] = minute + 1
            new_row = pd.DataFrame([list(new_row.values)], columns=expanded_df.columns) 
            expanded_df = pd.concat([expanded_df, new_row], ignore_index=True)
            
    return expanded_df


def addTemporalData(df, df_temporal, offsetfeature, importantKeys):
    '''
    This function afford add new temporal data to original temporal data. 
    '''
    # Add to lab features date of started ICU
    print("# of patients:", len(df_temporal.patientunitstayid.unique()))
    data = df_temporal[['patientunitstayid', 'unitadmittime24']].drop_duplicates().reset_index(drop=True)
    df_merge = pd.merge(df, data, on='patientunitstayid', how='inner').drop_duplicates().reset_index(drop=True)
    print("# of patients:", len(df_merge.patientunitstayid.unique()))

    # For each sample, get the date of started icu + minutes from lab test/results
    df_merge['unitadmittime24'] = pd.to_datetime(df_merge['unitadmittime24'])  
    df_merge['date_event'] = df_merge.apply(lambda row: row['unitadmittime24'] + \
                                            timedelta(minutes=row[offsetfeature]), axis=1)
    # Get the timeStep for each sample in base on days from started icu + minutes from lab test/results
    timesteps = np.ceil(((df_merge['date_event'] - df_merge['unitadmittime24']).dt.days*24 + \
                         (df_merge['date_event'] - df_merge['unitadmittime24']).dt.seconds/(3600))/24)
    df_merge['timeStep'] =  timesteps
    # Select the most important features
    df_merge = df_merge[importantKeys]
    print("# of patients finally:", len(df_merge.patientunitstayid.unique()), "- Dimensiones of dataset to add:", df_merge.shape)
    # Merge init temporal dataset with new dataset (new features)
    df_temporal_f1 = pd.merge(df_temporal, df_merge, on=['patientunitstayid','timeStep'], how='left')
    print("Old dimensiones of df_temporal:", df_temporal.shape)
    print("New dimensiones of df_temporal:", df_temporal_f1.shape)
    
    return df_temporal_f1

## General parameteres

In [3]:
frecuencySample = 60*24
filterPat = True

### Step 0: Load patients to analyse

In [4]:
df_pat = pd.read_csv("../datasets/physionet-eicu-crb/patient.csv")
id_pat = df_pat.patientunitstayid.unique()
df_pat.keys()

Index(['patientunitstayid', 'patienthealthsystemstayid', 'gender', 'age',
       'ethnicity', 'hospitalid', 'wardid', 'apacheadmissiondx',
       'admissionheight', 'hospitaladmittime24', 'hospitaladmitoffset',
       'hospitaladmitsource', 'hospitaldischargeyear',
       'hospitaldischargetime24', 'hospitaldischargeoffset',
       'hospitaldischargelocation', 'hospitaldischargestatus', 'unittype',
       'unitadmittime24', 'unitadmitsource', 'unitvisitnumber', 'unitstaytype',
       'admissionweight', 'dischargeweight', 'unitdischargetime24',
       'unitdischargeoffset', 'unitdischargelocation', 'unitdischargestatus',
       'uniquepid'],
      dtype='object')

### step 1: Generate time dimension

In [5]:
# 'hospitaladmittime24': time in 24 hour format of the hospital admit
#     started_hosp
# 'hospitaladmitoffset': number of minutes from unit admit time that the patient was admitted to the hospital
#     timeToStarted_icu - time hospital admission to icu admission

# 'hospitaldischargetime24': time in 24 hour format of when the hospital discharge event occurred
# 'hospitaldischargeoffset': number of minutes from unit admit time that the patient was discharged from the hospital

# 'unitAdmitTime24': time in 24 hour format of when the unit admit event occurred
#     started_icu
# 'unitdischargetime24': time in 24 hour format of when the unit discharge event occurred 
#     ended_icu
# 'unitdischargeoffset': number of minutes from unit admit time that the patient was discharged from the unit
#     timeToEnded_icu
# 'hospitaldischargeyear': year of the hospital discharge date

print("Some statistics (median-minutes):")
print("\t Time in-H to in-ICU:", np.abs(df_pat.hospitaladmitoffset).median())
print("\t Time in-ICU to out-ICU:", df_pat.unitdischargeoffset.median())
timeto_inH_outICU = df_pat.unitdischargeoffset - df_pat.hospitaladmitoffset
timeto_outICU_outH = df_pat.hospitaldischargeoffset - timeto_inH_outICU
print("\t Time out-ICU to out-H:", timeto_outICU_outH.median())
print("\t Time in-H to out-H:", df_pat.hospitaldischargeoffset.median())

Some statistics (median-minutes):
	 Time in-H to in-ICU: 308.0
	 Time in-ICU to out-ICU: 2266.0
	 Time out-ICU to out-H: 2094.0
	 Time in-H to out-H: 6545.0


In [6]:
df_to_work = df_pat[['patientunitstayid','hospitaladmittime24',
        'hospitaladmitoffset', 
        'hospitaldischargetime24', 
        'hospitaldischargeoffset',
        'unitadmittime24', 
        'unitdischargetime24',
        'unitdischargeoffset',
        'hospitaldischargetime24',
        'hospitaldischargeoffset',
        'hospitaldischargeyear',
         'gender', 'age',
       ]]

print("# of patients:", df_pat.shape[0])

# of patients: 200859


In [7]:
# Filter the number of patients
df_to_work_filt = df_to_work[df_to_work.unitdischargeoffset > frecuencySample]
if filterPat:
    df_to_work_filt = df_to_work_filt[df_to_work_filt.patientunitstayid.isin(df_to_work_filt.patientunitstayid.unique()[0:100])]
print("# of patients (after filter):", df_to_work_filt.shape[0])

# Started ICU
df_to_work_filt['unitadmittime24'] = pd.to_datetime(df_to_work_filt['unitadmittime24'])
# synchronization: in-ICU to out-ICU
df_temporal = createTimeDataFrame(df_to_work_filt, frecuencySample, 'unitdischargeoffset')
print("# of patients (after filter - temporal dataframe):", len(df_temporal.patientunitstayid.unique()))
print("Dimensions:", df_temporal.shape)

# Drop static features of temporal dataset
df_temporal_f0 = df_temporal.drop(['age', 'gender'], axis=1)

# of patients (after filter): 100
# of patients (after filter - temporal dataframe): 100
Dimensions: (390, 14)


### LAB FEATURES

In [8]:
# Load new nada
df_lab = pd.read_csv("../datasets/physionet-eicu-crb/lab.csv")
if filterPat:
    df_lab = df_lab[df_lab.patientunitstayid.isin(df_temporal_f0.patientunitstayid.unique())]

keys = ['patientunitstayid','labname', 'labresult','labresulttext', 'labmeasurenamesystem', 'timeStep']
df_temporal_f1 = addTemporalData(df_lab, df_temporal_f0, 'labresultoffset', keys)

# of patients: 100
# of patients: 99
# of patients finally: 99 - Dimensiones of dataset to add: (21800, 6)
Old dimensiones of df_temporal: (390, 12)
New dimensiones of df_temporal: (13041, 16)


### MICRO LAB FEATURES

In [31]:
# WE DON'T HAVE INFORMATION FOR ICU PATIENTS...
df_mlab = pd.read_csv("../datasets/physionet-eicu-crb/microLab.csv")
if filterPat:
    df_mlab = df_mlab[df_mlab.patientunitstayid.isin(df_temporal_f0.patientunitstayid.unique())]
df_mlab.shape

(0, 7)

### RESPIRATORY CARE

In [32]:
# There are problemas with ventendoffset
df_vent = pd.read_csv("../datasets/physionet-eicu-crb/respiratoryCare.csv")
df_vent.keys()

# 'ventstartoffset': number of minutes from unit admit time that the vent was started
# 'ventendoffset': number of minutes from unit admit time that the vent was ended
# 'priorVentStartTime24': time in 24 hour format of when the prior vent start event occurred
df_vent['ventendoffset'].value_counts()

0       865224
1620       157
Name: ventendoffset, dtype: int64

### DIAGNOSIS

In [33]:
df_diag = pd.read_csv("../datasets/physionet-eicu-crb/diagnosis.csv")
if filterPat:
    df_diag = df_diag[df_diag.patientunitstayid.isin(df_temporal_f0.patientunitstayid.unique())]
print("# of patients:", len(df_diag.patientunitstayid.unique()))
print(df_diag.keys())

keys = ['patientunitstayid','diagnosisoffset', 'diagnosisstring', 'icd9code', 'diagnosispriority', 'timeStep']
df_temporal_f2 = addTemporalData(df_diag, df_temporal_f1, 'diagnosisoffset', keys)

# of patients: 54
Index(['diagnosisid', 'patientunitstayid', 'activeupondischarge',
       'diagnosisoffset', 'diagnosisstring', 'icd9code', 'diagnosispriority'],
      dtype='object')
# of patients: 100
# of patients: 54
# of patients finally: 54 - Dimensiones of dataset to add: (513, 6)
Old dimensiones of df_temporal: (13041, 16)
New dimensiones of df_temporal: (34502, 20)


In [34]:
df_temporal_f2

Unnamed: 0,patientunitstayid,hospitaladmittime24,hospitaladmitoffset,hospitaldischargetime24,hospitaldischargeoffset,unitadmittime24,unitdischargetime24,unitdischargeoffset,hospitaldischargetime24.1,hospitaldischargeoffset.1,hospitaldischargeyear,timeStep,labname,labresult,labresulttext,labmeasurenamesystem,diagnosisoffset,diagnosisstring,icd9code,diagnosispriority
0,141168,15:54:00,0,03:50:00,3596,2024-05-08 15:54:00,03:50:00,3596,03:50:00,3596,2015,1,PT - INR,2.5,2.5,ratio,72.0,cardiovascular|chest pain / ASHD|coronary arte...,"414.00, I25.10",Other
1,141168,15:54:00,0,03:50:00,3596,2024-05-08 15:54:00,03:50:00,3596,03:50:00,3596,2015,1,PT - INR,2.5,2.5,ratio,118.0,cardiovascular|ventricular disorders|cardiomyo...,,Other
2,141168,15:54:00,0,03:50:00,3596,2024-05-08 15:54:00,03:50:00,3596,03:50:00,3596,2015,1,PT - INR,2.5,2.5,ratio,72.0,pulmonary|disorders of the airways|COPD,"491.20, J44.9",Other
3,141168,15:54:00,0,03:50:00,3596,2024-05-08 15:54:00,03:50:00,3596,03:50:00,3596,2015,1,PT - INR,2.5,2.5,ratio,118.0,pulmonary|disorders of the airways|COPD,"491.20, J44.9",Other
4,141168,15:54:00,0,03:50:00,3596,2024-05-08 15:54:00,03:50:00,3596,03:50:00,3596,2015,1,PT - INR,2.5,2.5,ratio,118.0,cardiovascular|ventricular disorders|congestiv...,"428.0, I50.9",Other
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34497,142482,22:47:00,-8,20:15:00,8480,2024-05-08 22:55:00,17:43:00,5448,20:15:00,8480,2015,4,MCH,30.3,30.3,pg,,,,
34498,142482,22:47:00,-8,20:15:00,8480,2024-05-08 22:55:00,17:43:00,5448,20:15:00,8480,2015,4,WBC x 1000,8.5,8.5,K/mcL,,,,
34499,142482,22:47:00,-8,20:15:00,8480,2024-05-08 22:55:00,17:43:00,5448,20:15:00,8480,2015,4,MCV,95.2,95.2,fL,,,,
34500,142482,22:47:00,-8,20:15:00,8480,2024-05-08 22:55:00,17:43:00,5448,20:15:00,8480,2015,4,RDW,12.8,12.8,%,,,,


### INFUSION DRUG

In [35]:
df_drug= pd.read_csv("../datasets/physionet-eicu-crb/infusionDrug.csv")
if filterPat:
    df_drug = df_drug[df_drug.patientunitstayid.isin(df_temporal_f0.patientunitstayid.unique())]
print("# of patients:", len(df_drug.patientunitstayid.unique()))
print(df_drug.keys())

# of patients: 0
Index(['infusiondrugid', 'patientunitstayid', 'infusionoffset', 'drugname',
       'drugrate', 'infusionrate', 'drugamount', 'volumeoffluid',
       'patientweight'],
      dtype='object')


### VITAL APERIODIC & VITAL PERIODIC

In [36]:
df_va = pd.read_csv("../datasets/physionet-eicu-crb/vitalPeriodic.csv")
if filterPat:
    df_va = df_va[df_va.patientunitstayid.isin(df_temporal_f0.patientunitstayid.unique())]
print("# of patients:", len(df_va.patientunitstayid.unique()))
print(df_va.keys())

columnas = ['observationoffset',
       'temperature', 'sao2', 'heartrate', 'respiration', 'cvp', 'etco2',
       'systemicsystolic', 'systemicdiastolic', 'systemicmean', 'pasystolic',
       'padiastolic', 'pamean', 'st1', 'st2', 'st3', 'icp']

for columna in df_va.columns:
    if columna in columnas:
        nuevo_nombre = columna + '_va'
        df_va.rename(columns={columna: nuevo_nombre}, inplace=True)
        

keys = ['patientunitstayid','observationoffset_va',
       'temperature_va', 'sao2_va', 'heartrate_va', 'respiration_va', 'cvp_va', 'etco2_va', 'timeStep']
#        'systemicsystolic_va', 'systemicdiastolic_va', 'systemicmean_va', 'pasystolic_va',
#        'padiastolic_va', 'pamean_va', 'st1_va', 'st2_va', 'st3_va', 'icp_va', 'timeStep']

df_temporal_f3 = addTemporalData(df_va, df_temporal_f0, 'observationoffset_va', keys)

# of patients: 100
Index(['vitalperiodicid', 'patientunitstayid', 'observationoffset',
       'temperature', 'sao2', 'heartrate', 'respiration', 'cvp', 'etco2',
       'systemicsystolic', 'systemicdiastolic', 'systemicmean', 'pasystolic',
       'padiastolic', 'pamean', 'st1', 'st2', 'st3', 'icp'],
      dtype='object')
# of patients: 100
# of patients: 100
# of patients finally: 100 - Dimensiones of dataset to add: (95319, 9)
Old dimensiones of df_temporal: (390, 12)
New dimensiones of df_temporal: (95298, 19)


In [47]:
df_temporal_f3[df_temporal_f3.patientunitstayid == 141168][keys[1:]].groupby("timeStep").median()

Unnamed: 0_level_0,observationoffset_va,temperature_va,sao2_va,heartrate_va,respiration_va,cvp_va,etco2_va
timeStep,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,779.0,,92.0,126.0,,,
2,1846.5,,92.0,80.0,,31.5,
3,,,,,,,


In [48]:
df_temporal_f3[df_temporal_f3.patientunitstayid == 141168][keys[1:]]

Unnamed: 0,observationoffset_va,temperature_va,sao2_va,heartrate_va,respiration_va,cvp_va,etco2_va,timeStep
0,1289.0,,,118.0,,,,1
1,1374.0,,90.0,118.0,,,,1
2,419.0,,,132.0,,,,1
3,754.0,,,128.0,,,,1
4,1389.0,,97.0,78.0,,,,1
...,...,...,...,...,...,...,...,...
415,1624.0,,97.0,76.0,,,,2
416,2299.0,,78.0,100.0,,,,2
417,1629.0,,96.0,76.0,,,,2
418,1669.0,,,78.0,,,,2


In [None]:
total_missing = df_temporal_f3.isna().sum()
print(total_missing)

In [None]:
df_vp = pd.read_csv("../datasets/physionet-eicu-crb/vitalPeriodic.csv")
if filterPat:
    df_vp = df_vp[df_vp.patientunitstayid.isin(df_temporal_f0.patientunitstayid.unique())]
print("# of patients:", len(df_vp.patientunitstayid.unique()))
print(df_vp.keys())

columnas = ['observationoffset',
       'temperature', 'sao2', 'heartrate', 'respiration', 'cvp', 'etco2',
       'systemicsystolic', 'systemicdiastolic', 'systemicmean', 'pasystolic',
       'padiastolic', 'pamean', 'st1', 'st2', 'st3', 'icp']

for columna in df_vp.columns:
    if columna in columnas:
        nuevo_nombre = columna + '_vp'
        df_vp.rename(columns={columna: nuevo_nombre}, inplace=True)
        

keys = ['patientunitstayid','observationoffset_vp',
       'temperature_vp', 'sao2_vp', 'heartrate_vp', 'respiration_vp', 'cvp_vp', 'etco2_vp',
       'systemicsystolic_vp', 'systemicdiastolic_vp', 'systemicmean_vp', 'pasystolic_vp',
       'padiastolic_vp', 'pamean_vp', 'st1_vp', 'st2_vp', 'st3_vp', 'icp_vp', 'timeStep']

df_temporal_f4 = addTemporalData(df_vp, df_temporal_f3, 'observationoffset_vp', keys)

### Missing values

In [None]:
total_missing = df_temporal_f4.isna().sum()
print(total_missing)

### Step 2: Static features
#### Apache score

In [None]:
# values for differents features at the time of the APACHE value registered
df_apachePatientResult = pd.read_csv("../datasets/physionet-eicu-crb/apacheApsVar.csv")

# Get the date of admitted icu
df_aux = df_temporal[['patientunitstayid', 'gender', 'age']]
df_aux = df_aux.drop_duplicates().reset_index(drop=True)
print("# of patients temporal data:", len(df_aux.patientunitstayid.unique()))

# Select the same patients that I've used in temporal data
df_apachePatientResult = df_apachePatientResult[df_apachePatientResult.patientunitstayid.isin(df_aux.patientunitstayid.unique())]
print("# of patients df_apachePatientResult:", len(df_apachePatientResult.patientunitstayid.unique()))

# Merge with previous dataframe
df_static = pd.merge(df_apachePatientResult, df_aux, on='patientunitstayid', how='inner').drop_duplicates().reset_index(drop=True)
print("# of patients after merge:", len(df_static.patientunitstayid.unique()))
df_static.keys()