In [3]:
import pandas as pd
import numpy as np
from datetime import timedelta

import warnings
warnings.filterwarnings("ignore")

## General functions

In [4]:
def createTimeDataFrame(df_to_work, frecuencySample, featureToCreateTimeSteps):
    '''
    This function create a temporal dataframe
    with rows in based on the number of time step that we define. 
    '''

    expanded_df = pd.DataFrame(columns=df_to_work.columns)
    expanded_df['timeStep'] = 0

    for index, row in df_to_work.iterrows():
        admission_date = row['unitadmittime24']
        discharge_offset = int(np.ceil(row[featureToCreateTimeSteps]/frecuencySample))

        for minute in range(discharge_offset):
            new_row = row.copy()  
            new_row['timeStep'] = minute + 1
            new_row = pd.DataFrame([list(new_row.values)], columns=expanded_df.columns) 
            expanded_df = pd.concat([expanded_df, new_row], ignore_index=True)
            
    return expanded_df


def addTemporalData(df, df_temporal, offsetfeature, importantKeys):
    '''
    This function afford add new temporal data to original temporal data. 
    '''
    # Add to lab features date of started ICU
    print("# of patients:", len(df_temporal.patientunitstayid.unique()))
    data = df_temporal[['patientunitstayid', 'unitadmittime24']].drop_duplicates().reset_index(drop=True)
    df_merge = pd.merge(df, data, on='patientunitstayid', how='inner').drop_duplicates().reset_index(drop=True)
    print("# of patients:", len(df_merge.patientunitstayid.unique()))

    # For each sample, get the date of started icu + minutes from lab test/results
    df_merge['unitadmittime24'] = pd.to_datetime(df_merge['unitadmittime24'])  
    df_merge['date_event'] = df_merge.apply(lambda row: row['unitadmittime24'] + \
                                            timedelta(minutes=row[offsetfeature]), axis=1)
    # Get the timeStep for each sample in base on days from started icu + minutes from lab test/results
    timesteps = np.ceil(((df_merge['date_event'] - df_merge['unitadmittime24']).dt.days*24 + \
                         (df_merge['date_event'] - df_merge['unitadmittime24']).dt.seconds/(3600))/24)
    df_merge['timeStep'] =  timesteps
    # Select the most important features
    df_merge = df_merge[importantKeys]
    print("# of patients finally:", len(df_merge.patientunitstayid.unique()), "- Dimensiones of dataset to add:", df_merge.shape)
    # Merge init temporal dataset with new dataset (new features)
    df_temporal_f1 = pd.merge(df_temporal, df_merge, on=['patientunitstayid','timeStep'], how='left')
    print("Old dimensiones of df_temporal:", df_temporal.shape)
    print("New dimensiones of df_temporal:", df_temporal_f1.shape)
    
    return df_temporal_f1

## General parameteres

In [5]:
frecuencySample = 60*24
filterPat = True

### Step 0: Load patients to analyse

In [6]:
df_pat = pd.read_csv("../datasets/physionet-eicu-crb/patient.csv")
id_pat = df_pat.patientunitstayid.unique()
df_pat.keys()

Index(['patientunitstayid', 'patienthealthsystemstayid', 'gender', 'age',
       'ethnicity', 'hospitalid', 'wardid', 'apacheadmissiondx',
       'admissionheight', 'hospitaladmittime24', 'hospitaladmitoffset',
       'hospitaladmitsource', 'hospitaldischargeyear',
       'hospitaldischargetime24', 'hospitaldischargeoffset',
       'hospitaldischargelocation', 'hospitaldischargestatus', 'unittype',
       'unitadmittime24', 'unitadmitsource', 'unitvisitnumber', 'unitstaytype',
       'admissionweight', 'dischargeweight', 'unitdischargetime24',
       'unitdischargeoffset', 'unitdischargelocation', 'unitdischargestatus',
       'uniquepid'],
      dtype='object')

### step 1: Generate time dimension

In [7]:
# 'hospitaladmittime24': time in 24 hour format of the hospital admit
#     started_hosp
# 'hospitaladmitoffset': number of minutes from unit admit time that the patient was admitted to the hospital
#     timeToStarted_icu - time hospital admission to icu admission

# 'hospitaldischargetime24': time in 24 hour format of when the hospital discharge event occurred
# 'hospitaldischargeoffset': number of minutes from unit admit time that the patient was discharged from the hospital

# 'unitAdmitTime24': time in 24 hour format of when the unit admit event occurred
#     started_icu
# 'unitdischargetime24': time in 24 hour format of when the unit discharge event occurred 
#     ended_icu
# 'unitdischargeoffset': number of minutes from unit admit time that the patient was discharged from the unit
#     timeToEnded_icu
# 'hospitaldischargeyear': year of the hospital discharge date

print("Some statistics (median-minutes):")
print("\t Time in-H to in-ICU:", np.abs(df_pat.hospitaladmitoffset).median())
print("\t Time in-ICU to out-ICU:", df_pat.unitdischargeoffset.median())
timeto_inH_outICU = df_pat.unitdischargeoffset - df_pat.hospitaladmitoffset
timeto_outICU_outH = df_pat.hospitaldischargeoffset - timeto_inH_outICU
print("\t Time out-ICU to out-H:", timeto_outICU_outH.median())
print("\t Time in-H to out-H:", df_pat.hospitaldischargeoffset.median())

Some statistics (median-minutes):
	 Time in-H to in-ICU: 308.0
	 Time in-ICU to out-ICU: 2266.0
	 Time out-ICU to out-H: 2094.0
	 Time in-H to out-H: 6545.0


In [8]:
df_to_work = df_pat[['patientunitstayid','hospitaladmittime24',
        'hospitaladmitoffset', 
        'hospitaldischargetime24', 
        'hospitaldischargeoffset',
        'unitadmittime24', 
        'unitdischargetime24',
        'unitdischargeoffset',
        'hospitaldischargetime24',
        'hospitaldischargeoffset',
        'hospitaldischargeyear',
         'gender', 'age',
       ]]

print("# of patients:", df_pat.shape[0])

# of patients: 200859


In [9]:
# Filter the number of patients
df_to_work_filt = df_to_work[df_to_work.unitdischargeoffset > frecuencySample]
if filterPat:
    df_to_work_filt = df_to_work_filt[df_to_work_filt.patientunitstayid.isin(df_to_work_filt.patientunitstayid.unique()[0:10])]
print("# of patients (after filter):", df_to_work_filt.shape[0])

# Started ICU
df_to_work_filt['unitadmittime24'] = pd.to_datetime(df_to_work_filt['unitadmittime24'])
# synchronization: in-ICU to out-ICU
df_temporal = createTimeDataFrame(df_to_work_filt, frecuencySample, 'unitdischargeoffset')
print("# of patients (after filter - temporal dataframe):", len(df_temporal.patientunitstayid.unique()))
print("Dimensions:", df_temporal.shape)

# Drop static features of temporal dataset
df_temporal_f0 = df_temporal.drop(['age', 'gender'], axis=1)

# of patients (after filter): 10
# of patients (after filter - temporal dataframe): 10
Dimensions: (36, 14)


### LAB FEATURES

In [8]:
# Load new nada
df_lab = pd.read_csv("../datasets/physionet-eicu-crb/lab.csv")
if filterPat:
    df_lab = df_lab[df_lab.patientunitstayid.isin(df_temporal_f0.patientunitstayid.unique())]

keys = ['patientunitstayid','labname', 'labresult','labresulttext', 'labmeasurenamesystem', 'timeStep']
df_temporal_f1 = addTemporalData(df_lab, df_temporal_f0, 'labresultoffset', keys)

# of patients: 10
# of patients: 10
# of patients finally: 10 - Dimensiones of dataset to add: (2292, 6)
Old dimensiones of df_temporal: (36, 12)
New dimensiones of df_temporal: (1225, 16)


### MICRO LAB FEATURES

In [9]:
# # WE DON'T HAVE INFORMATION FOR ICU PATIENTS...
# df_mlab = pd.read_csv("../datasets/physionet-eicu-crb/microLab.csv")
# if filterPat:
#     df_mlab = df_mlab[df_mlab.patientunitstayid.isin(df_temporal_f0.patientunitstayid.unique())]
# df_mlab.shape

### RESPIRATORY CARE

In [10]:
# # There are problemas with ventendoffset
# df_vent = pd.read_csv("../datasets/physionet-eicu-crb/respiratoryCare.csv")
# df_vent.keys()

# # 'ventstartoffset': number of minutes from unit admit time that the vent was started
# # 'ventendoffset': number of minutes from unit admit time that the vent was ended
# # 'priorVentStartTime24': time in 24 hour format of when the prior vent start event occurred
# df_vent['ventendoffset'].value_counts()

### DIAGNOSIS

In [11]:
df_diag = pd.read_csv("../datasets/physionet-eicu-crb/diagnosis.csv")
if filterPat:
    df_diag = df_diag[df_diag.patientunitstayid.isin(df_temporal_f0.patientunitstayid.unique())]
print("# of patients:", len(df_diag.patientunitstayid.unique()))
print(df_diag.keys())

keys = ['patientunitstayid','diagnosisoffset', 'diagnosisstring', 'icd9code', 'diagnosispriority', 'timeStep']
df_temporal_f2 = addTemporalData(df_diag, df_temporal_f1, 'diagnosisoffset', keys)

# of patients: 4
Index(['diagnosisid', 'patientunitstayid', 'activeupondischarge',
       'diagnosisoffset', 'diagnosisstring', 'icd9code', 'diagnosispriority'],
      dtype='object')
# of patients: 10
# of patients: 4
# of patients finally: 4 - Dimensiones of dataset to add: (41, 6)
Old dimensiones of df_temporal: (1225, 16)
New dimensiones of df_temporal: (3593, 20)


In [12]:
df_temporal_f2

Unnamed: 0,patientunitstayid,hospitaladmittime24,hospitaladmitoffset,hospitaldischargetime24,hospitaldischargeoffset,unitadmittime24,unitdischargetime24,unitdischargeoffset,hospitaldischargetime24.1,hospitaldischargeoffset.1,hospitaldischargeyear,timeStep,labname,labresult,labresulttext,labmeasurenamesystem,diagnosisoffset,diagnosisstring,icd9code,diagnosispriority
0,141168,15:54:00,0,03:50:00,3596,2024-05-08 15:54:00,03:50:00,3596,03:50:00,3596,2015,1,PT - INR,2.5,2.5,ratio,72.0,cardiovascular|chest pain / ASHD|coronary arte...,"414.00, I25.10",Other
1,141168,15:54:00,0,03:50:00,3596,2024-05-08 15:54:00,03:50:00,3596,03:50:00,3596,2015,1,PT - INR,2.5,2.5,ratio,118.0,cardiovascular|ventricular disorders|cardiomyo...,,Other
2,141168,15:54:00,0,03:50:00,3596,2024-05-08 15:54:00,03:50:00,3596,03:50:00,3596,2015,1,PT - INR,2.5,2.5,ratio,72.0,pulmonary|disorders of the airways|COPD,"491.20, J44.9",Other
3,141168,15:54:00,0,03:50:00,3596,2024-05-08 15:54:00,03:50:00,3596,03:50:00,3596,2015,1,PT - INR,2.5,2.5,ratio,118.0,pulmonary|disorders of the airways|COPD,"491.20, J44.9",Other
4,141168,15:54:00,0,03:50:00,3596,2024-05-08 15:54:00,03:50:00,3596,03:50:00,3596,2015,1,PT - INR,2.5,2.5,ratio,118.0,cardiovascular|ventricular disorders|congestiv...,"428.0, I50.9",Other
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3588,141266,21:25:00,-18,15:31:00,11148,2024-05-08 21:43:00,22:44:00,1501,15:31:00,11148,2014,2,albumin,3.0,3.0,g/dL,,,,
3589,141266,21:25:00,-18,15:31:00,11148,2024-05-08 21:43:00,22:44:00,1501,15:31:00,11148,2014,2,platelets x 1000,163.0,163,K/mcL,,,,
3590,141266,21:25:00,-18,15:31:00,11148,2024-05-08 21:43:00,22:44:00,1501,15:31:00,11148,2014,2,MCHC,29.6,29.6,g/dL,,,,
3591,141266,21:25:00,-18,15:31:00,11148,2024-05-08 21:43:00,22:44:00,1501,15:31:00,11148,2014,2,phosphate,2.4,2.4,mg/dL,,,,


### INFUSION DRUG

In [13]:
# df_drug= pd.read_csv("../datasets/physionet-eicu-crb/infusionDrug.csv")
# if filterPat:
#     df_drug = df_drug[df_drug.patientunitstayid.isin(df_temporal_f0.patientunitstayid.unique())]
# print("# of patients:", len(df_drug.patientunitstayid.unique()))
# print(df_drug.keys())

### VITAL APERIODIC & VITAL PERIODIC

In [None]:
import pandas as pd

id_especifico = 141168

# Especifica el tamaño del chunk que deseas leer
chunk_size = 1000  # por ejemplo, 1000 filas por chunk

# Función para filtrar y procesar cada chunk
def filter_and_process(chunk):
    chunk_filtered = chunk[chunk['patientunitstayid'] == id_especifico]

# Iterar sobre los chunks del archivo CSV y aplicar la función de filtrado y procesamiento a cada chunk
for chunk in pd.read_csv("../datasets/physionet-eicu-crb/vitalPeriodic.csv", chunksize=chunk_size):
    filter_and_process(chunk)

In [None]:
df_va = pd.read_csv("../datasets/physionet-eicu-crb/vitalPeriodic.csv",low_memory=False)
print("-")
if filterPat:
    df_va = df_va[df_va.patientunitstayid.isin(df_temporal_f0.patientunitstayid.unique())[0:2]]
print("fin")

print("# of patients:", len(df_va.patientunitstayid.unique()))
print(df_va.keys())

columnas = ['observationoffset',
       'temperature', 'sao2', 'heartrate', 'respiration', 'cvp', 'etco2',
       'systemicsystolic', 'systemicdiastolic', 'systemicmean', 'pasystolic',
       'padiastolic', 'pamean', 'st1', 'st2', 'st3', 'icp']

for columna in df_va.columns:
    if columna in columnas:
        nuevo_nombre = columna + '_va'
        df_va.rename(columns={columna: nuevo_nombre}, inplace=True)
        
keys = ['patientunitstayid','observationoffset_va',
       'temperature_va', 'sao2_va', 'heartrate_va', 'respiration_va', 'cvp_va', 'etco2_va', 'timeStep']
#        'systemicsystolic_va', 'systemicdiastolic_va', 'systemicmean_va', 'pasystolic_va',
#        'padiastolic_va', 'pamean_va', 'st1_va', 'st2_va', 'st3_va', 'icp_va', 'timeStep']

df_temporal_f3 = addTemporalData(df_va, df_temporal_f0, 'observationoffset_va', keys)

In [None]:
df_temporal_f3[df_temporal_f3.patientunitstayid == 141168][keys[1:]].groupby("timeStep").median().reset_index()

In [None]:
df_temporal_f3[df_temporal_f3.patientunitstayid == 141168][keys[1:]]

In [None]:
total_missing = df_temporal_f3.isna().sum()
print(total_missing)

In [None]:
df_vp = pd.read_csv("../datasets/physionet-eicu-crb/vitalPeriodic.csv")
if filterPat:
    df_vp = df_vp[df_vp.patientunitstayid.isin(df_temporal_f0.patientunitstayid.unique())]
print("# of patients:", len(df_vp.patientunitstayid.unique()))
print(df_vp.keys())

columnas = ['observationoffset',
       'temperature', 'sao2', 'heartrate', 'respiration', 'cvp', 'etco2',
       'systemicsystolic', 'systemicdiastolic', 'systemicmean', 'pasystolic',
       'padiastolic', 'pamean', 'st1', 'st2', 'st3', 'icp']

for columna in df_vp.columns:
    if columna in columnas:
        nuevo_nombre = columna + '_vp'
        df_vp.rename(columns={columna: nuevo_nombre}, inplace=True)
        

keys = ['patientunitstayid','observationoffset_vp',
       'temperature_vp', 'sao2_vp', 'heartrate_vp', 'respiration_vp', 'cvp_vp', 'etco2_vp',
       'systemicsystolic_vp', 'systemicdiastolic_vp', 'systemicmean_vp', 'pasystolic_vp',
       'padiastolic_vp', 'pamean_vp', 'st1_vp', 'st2_vp', 'st3_vp', 'icp_vp', 'timeStep']

df_temporal_f4 = addTemporalData(df_vp, df_temporal_f3, 'observationoffset_vp', keys)

### Missing values

In [None]:
total_missing = df_temporal_f4.isna().sum()
print(total_missing)

### Step 2: Static features
#### Apache score

In [24]:
# values for differents features at the time of the APACHE value registered
df_apachePatientResult = pd.read_csv("../datasets/physionet-eicu-crb/apacheApsVar.csv")

# Get the date of admitted icu
df_aux = df_temporal[['patientunitstayid', 'gender', 'age']]
df_aux = df_aux.drop_duplicates().reset_index(drop=True)
print("# of patients temporal data:", len(df_aux.patientunitstayid.unique()))

# Select the same patients that I've used in temporal data
df_apachePatientResult = df_apachePatientResult[df_apachePatientResult.patientunitstayid.isin(df_aux.patientunitstayid.unique())]
print("# of patients df_apachePatientResult:", len(df_apachePatientResult.patientunitstayid.unique()))

# Merge with previous dataframe
df_static = pd.merge(df_apachePatientResult, df_aux, on='patientunitstayid', how='inner').drop_duplicates().reset_index(drop=True)
print("# of patients after merge:", len(df_static.patientunitstayid.unique()))
df_static.keys()

# of patients temporal data: 10
# of patients df_apachePatientResult: 8
# of patients after merge: 8


Index(['apacheapsvarid', 'patientunitstayid', 'intubated', 'vent', 'dialysis',
       'eyes', 'motor', 'verbal', 'meds', 'urine', 'wbc', 'temperature',
       'respiratoryrate', 'sodium', 'heartrate', 'meanbp', 'ph', 'hematocrit',
       'creatinine', 'albumin', 'pao2', 'pco2', 'bun', 'glucose', 'bilirubin',
       'fio2', 'gender', 'age'],
      dtype='object')

## Rename feature

### Temporal dataset

In [16]:
drop_features = ['hospitaladmittime24', 'hospitaladmitoffset',
       'hospitaldischargetime24', 'hospitaldischargeoffset', 'unitadmittime24',
       'unitdischargetime24', 'unitdischargeoffset', 'hospitaldischargetime24',
       'hospitaldischargeoffset', 'hospitaldischargeyear']

rename_features_fv = rename_features.drop(drop_features, axis=1) 

rename_features = {'patientunitstayid':'id', 'labname', 'labresult', 'labresulttext', 'labmeasurenamesystem',
       'diagnosisoffset', 'diagnosisstring', 'icd9code', 'diagnosispriority'}

df_temporal_f2.rename(columns=rename_features, inplace=True)

Unnamed: 0,apacheapsvarid,patientunitstayid,intubated,vent,dialysis,eyes,motor,verbal,meds,urine,...,creatinine,albumin,pao2,pco2,bun,glucose,bilirubin,fio2,gender,age
0,2,141168,0,0,0,4,6,5,0,-1.0,...,2.3,3.1,-1.0,-1.0,27.0,95.0,4.1,-1.0,Female,70
1,167885,141194,0,0,0,3,6,4,0,-1.0,...,2.51,2.3,-1.0,-1.0,31.0,168.0,0.4,-1.0,Male,68
2,2279689,141203,0,1,0,1,3,1,0,-1.0,...,0.56,-1.0,51.0,37.0,9.0,145.0,-1.0,100.0,Female,77
3,172304,141227,0,1,0,3,6,4,0,-1.0,...,1.9,-1.0,65.0,23.0,32.0,145.0,-1.0,21.0,Male,82
4,2315184,141233,1,1,0,4,6,5,0,-1.0,...,-1.0,-1.0,142.0,30.0,-1.0,185.0,-1.0,60.0,Female,81
5,2412806,141244,0,0,0,4,6,5,0,-1.0,...,0.65,-1.0,-1.0,-1.0,8.0,121.0,-1.0,-1.0,Male,59
6,4423,141265,0,0,0,4,6,5,0,-1.0,...,0.71,-1.0,-1.0,-1.0,13.0,156.0,-1.0,-1.0,Male,67
7,167887,141266,0,0,0,4,6,5,0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,215.0,-1.0,-1.0,Male,73


### Static dataset

In [25]:
drop_features = ['apacheapsvarid', 'motor', 'eyes', 'verbal']
df_static_fv = df_static.drop(drop_features, axis=1)

rename_features = {'patientunitstayid': 'id',
       'temperature':'temp',
       'respiratoryrate':'resp',
       'sodium':'na', 'heartrate':'hr',
       'creatinine':'crea', 'albumin':'alb', 
       'glucose':'glu', 'bilirubin':'bili',
       'gender':'sex'}

df_static_fv.rename(columns=rename_features, inplace=True)
df_static_fv.keys()

Index(['id', 'intubated', 'vent', 'dialysis', 'meds', 'urine', 'wbc', 'temp',
       'resp', 'na', 'hr', 'meanbp', 'ph', 'hematocrit', 'crea', 'alb', 'pao2',
       'pco2', 'bun', 'glu', 'bili', 'fio2', 'sex', 'age'],
      dtype='object')