In [None]:
import pandas as pd
import numpy as np
from datetime import timedelta

import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides pandas warnings raised by later cells
# (e.g. chained-assignment or deprecation warnings) — confirm that is intended.
warnings.filterwarnings("ignore")

In [None]:
def readTable(path):
    """Read the CSV located at `path` and return it as a DataFrame.

    Uses pandas' default CSV parsing options.
    """
    return pd.read_csv(path)

def isSepsis(text):
    """Return 1 if `text` mentions "sepsis" (case-insensitive), otherwise 0.

    Non-string values (for example the float NaN pandas uses for an empty
    free-text cell, or None) are treated as "no sepsis" and return 0 instead
    of raising AttributeError on `.lower()`.
    """
    if not isinstance(text, str):
        return 0
    return int('sepsis' in text.lower())
    
def sepsisLabel(path, df_pat):
    """Attach a per-stay `sepsis_diag` flag to the patient table.

    Parameters
    ----------
    path : path of the admission-diagnosis CSV; it must provide the columns
        `patientunitstayid` and `admitdxtext`.
    df_pat : patient DataFrame with a `patientunitstayid` column.

    Returns
    -------
    DataFrame: `df_pat` left-joined with one `sepsis_diag` value per
    `patientunitstayid` (1 when any admission-diagnosis row of that stay
    mentions sepsis, 0 otherwise). Stays without any admission-diagnosis row
    keep NaN because of the left join. Exact duplicate rows are removed and
    the index is reset.
    """
    admission_dx = readTable(path)
    admission_dx['sepsis_diag'] = admission_dx['admitdxtext'].apply(isSepsis)

    # Sorting the flag in descending order puts a positive row first for each
    # stay, so keeping the first row per stay keeps the positive label.
    per_stay = (
        admission_dx[['patientunitstayid', 'sepsis_diag']]
        .sort_values(by=['patientunitstayid', 'sepsis_diag'], ascending=[True, False])
        .drop_duplicates(subset='patientunitstayid', keep='first')
    )

    labelled = pd.merge(df_pat, per_stay, on=['patientunitstayid'], how="left")
    return labelled.drop_duplicates().reset_index(drop=True)

def processTables(df_pat, keys, offsetfeature, pathTable, debug=True):
    """Load an event table and attach the unit-admission time of each stay.

    Parameters
    ----------
    df_pat : patient DataFrame with `patientunitstayid` and `unitadmittime24`.
    keys : list of column names to return.
    offsetfeature : name of the column in the loaded table holding the event
        offset in minutes (it is added to `unitadmittime24`).
    pathTable : path of the CSV to load.
    debug : when true, print the shape of the loaded table and of the
        returned frame.

    Returns
    -------
    DataFrame restricted to `keys`. The intermediate frame also carries
    `unitadmittime24` (parsed to datetime) and `dateEvent`
    (`unitadmittime24` + offset); they are returned only if listed in `keys`.
    """
    df = readTable(pathTable)
    if debug: print("SHAPE:", df.shape)

    # Attach the unit-admission time of each stay to every event row.
    data = df_pat[['patientunitstayid', 'unitadmittime24']].drop_duplicates().reset_index(drop=True)
    df_merge = pd.merge(df, data, on='patientunitstayid', how='left').drop_duplicates().reset_index(drop=True)

    # Event timestamp = admission time + offset. Computed column-wise: this
    # avoids a Python-level loop over rows, works on an empty table, and a
    # missing offset yields NaT instead of raising inside timedelta().
    df_merge['unitadmittime24'] = pd.to_datetime(df_merge['unitadmittime24'])
    df_merge['dateEvent'] = df_merge['unitadmittime24'] + \
        pd.to_timedelta(df_merge[offsetfeature], unit='min')

    result = df_merge[keys]
    # Report the frame that is actually returned (the original printed the
    # shape of the untouched input table a second time).
    if debug: print("FINAL SHAPE:", result.shape)
    return result

In [None]:
## Temporal features
### Table: admissiondx, allergy, laboratory,  diagnosis, vitalPeriodic and vitalAperiodic

# Verbose shape logging for processTables. It is set here because this cell
# runs before the configuration cell that also assigns `debug`; without it a
# fresh-kernel "Run All" stops with a NameError on the first call below.
debug = True

#====================== LOAD DATA ==========================#
df_pat = readTable("../datasets/physionet-eicu-crb/patient.csv")
#====================== ADMISSIONDDX ==========================#
df_pat = sepsisLabel("../datasets/physionet-eicu-crb/admissionDx.csv", df_pat)

#====================== ALLERGY ==========================#
pathTable = "../datasets/physionet-eicu-crb/allergy.csv"
offsetfeature = 'allergyoffset'
keys = ['patientunitstayid', 'allergyoffset', 'allergynotetype', 'specialtytype', 'allergyname']
df_allergy = processTables(df_pat, keys, offsetfeature, pathTable, debug)

#====================== LAB ==========================#
pathTable = "../datasets/physionet-eicu-crb/lab.csv"
offsetfeature = 'labresultoffset'
keys = ['patientunitstayid', 'labresultoffset', 'labname', 'labresult','labresulttext', 'labmeasurenamesystem']
df_lab = processTables(df_pat, keys, offsetfeature, pathTable, debug)

#====================== diagnosis ==========================#
pathTable = "../datasets/physionet-eicu-crb/diagnosis.csv"
offsetfeature = 'diagnosisoffset'
keys = ['patientunitstayid', 'diagnosisoffset', 'diagnosisstring', 'icd9code', 'diagnosispriority']
df_diag = processTables(df_pat, keys, offsetfeature, pathTable, debug)

#====================== vitalPeriodic  ==========================#
# NOTE(review): unlike the tables above, the key lists of both vitals tables
# contain neither 'patientunitstayid' nor the offset column, so the returned
# rows cannot be attributed to a stay or a time — confirm this is intended.
pathTable = "../datasets/physionet-eicu-crb/vitalPeriodic.csv"
offsetfeature = 'observationoffset'
keys = ['temperature', 'sao2', 'heartrate', 'respiration', 'cvp', 'etco2',
       'systemicsystolic', 'systemicdiastolic', 'systemicmean', 'pasystolic',
       'padiastolic', 'pamean', 'st1', 'st2', 'st3', 'icp']
df_vp = processTables(df_pat, keys, offsetfeature, pathTable, debug)

#====================== vitalAPeriodic  ==========================#
pathTable = "../datasets/physionet-eicu-crb/vitalAperiodic.csv"
offsetfeature = 'observationoffset'
keys = ['noninvasivesystolic', 'noninvasivediastolic', 'noninvasivemean',
       'paop', 'cardiacoutput', 'cardiacinput', 'svr', 'svri', 'pvr', 'pvri']
df_va = processTables(df_pat, keys, offsetfeature, pathTable, debug)

## Test pd.Grouper

In [None]:
from datetime import datetime, timedelta
import random

# Scratch check of pd.Grouper: three random timestamps per month of 2024,
# counted per calendar day.
# A dedicated, seeded generator makes the cell reproducible on every run
# without touching the global `random` state.
rng = random.Random(2024)

fechas = [
    datetime(2024, mes, rng.randint(1, 28)) + timedelta(hours=rng.randint(0, 23))
    for mes in range(1, 13)
    for _ in range(3)
]

df = pd.DataFrame({'Fecha': fechas})
# `Fecha` is both the grouping key and the only column, so count() has no
# column left to count; size() gives the number of rows per daily bin.
df_agrupado = df.groupby(pd.Grouper(key='Fecha', freq='1D')).size().to_frame('n_events')
df_agrupado

## Static features

In [None]:
# APACHE APS variables: feature values recorded at the time the APACHE value
# was registered.
# NOTE(review): this cell reads `df_temporal`, which is first assigned in the
# "Generate time dimension" cell further down, and it repeats the
# "Step 2: Static features" / "Static dataset" cells at the end of the
# notebook — confirm whether it should be kept at this position.
df_apachePatientResult = readTable("../datasets/physionet-eicu-crb/apacheApsVar.csv")

# One row per stay with its demographic attributes.
df_aux = df_temporal[['patientunitstayid', 'gender', 'age']].drop_duplicates().reset_index(drop=True)
print("# of patients temporal data:", df_aux['patientunitstayid'].unique().size)

# Keep only the stays that are present in the temporal data.
df_apachePatientResult = df_apachePatientResult.loc[
    df_apachePatientResult['patientunitstayid'].isin(df_aux['patientunitstayid'].unique())
]
print("# of patients df_apachePatientResult:", df_apachePatientResult['patientunitstayid'].unique().size)

# Join the APACHE variables with the demographics.
df_static = (
    df_apachePatientResult
    .merge(df_aux, on='patientunitstayid', how='inner')
    .drop_duplicates()
    .reset_index(drop=True)
)
print("# of patients after merge:", df_static['patientunitstayid'].unique().size)

# Drop the table's own row id and shorten some feature names.
rename_features = {'patientunitstayid': 'id',
       'temperature':'temp',
       'respiratoryrate':'resp',
       'sodium':'na', 'heartrate':'hr',
       'creatinine':'crea', 'albumin':'alb', 
       'glucose':'glu', 'bilirubin':'bili',
       'gender':'sex'}
drop_features = ['apacheapsvarid']

df_static_fv = df_static.drop(columns=drop_features).rename(columns=rename_features)
df_static_fv.columns

In [None]:
def createTimeDataFrame(df_to_work, frecuencySample, featureToCreateTimeSteps):
    '''
    Expand a one-row-per-stay frame into one row per (stay, time step).

    Each input row is repeated ceil(row[featureToCreateTimeSteps] / frecuencySample)
    times and the copies are numbered 1, 2, ... in a new `timeStep` column.
    Rows whose value is zero or negative produce no output rows.

    Parameters
    ----------
    df_to_work : DataFrame to expand; it is not modified.
    frecuencySample : width of one time step, in the same unit as the values
        of `featureToCreateTimeSteps`.
    featureToCreateTimeSteps : name of the numeric column that determines
        how many steps each row gets.

    Returns
    -------
    DataFrame with the columns of `df_to_work` plus `timeStep`, a fresh
    0..n-1 index, rows in input order. Column dtypes of the input are kept
    and `timeStep` is an integer column.

    Raises
    ------
    ValueError if `featureToCreateTimeSteps` contains missing values.
    '''
    durations = df_to_work[featureToCreateTimeSteps]
    if durations.isna().any():
        raise ValueError(
            "Column '%s' contains missing values; the number of time steps "
            "cannot be computed." % featureToCreateTimeSteps
        )

    # Number of steps per input row; non-positive durations give zero rows.
    n_steps = np.ceil(durations.to_numpy(dtype=float) / frecuencySample).astype(int).clip(min=0)

    # Repeat each row positionally in a single operation instead of growing
    # the frame with pd.concat once per generated row (quadratic cost).
    row_positions = np.repeat(np.arange(len(df_to_work)), n_steps)
    expanded_df = df_to_work.iloc[row_positions].reset_index(drop=True)

    # Running counter that restarts at 1 for every source row: global output
    # position minus the output position where that row's copies begin.
    block_start = np.cumsum(n_steps) - n_steps
    expanded_df['timeStep'] = np.arange(len(row_positions)) - np.repeat(block_start, n_steps) + 1

    return expanded_df


def addTemporalData(df, df_temporal, offsetfeature, importantKeys, frecuencySample=60 * 24):
    '''
    Attach the rows of an event table to the matching (stay, time step) rows
    of the temporal frame.

    Parameters
    ----------
    df : event DataFrame with `patientunitstayid` and the column named by
        `offsetfeature` (minutes since unit admission).
    df_temporal : temporal DataFrame with `patientunitstayid`,
        `unitadmittime24` and `timeStep`.
    offsetfeature : name of the offset column of `df`, in minutes.
    importantKeys : columns of the event table to keep; must include
        `patientunitstayid` and `timeStep`. `date_event` (admission time +
        offset) is also available for selection.
    frecuencySample : width of one time step in minutes. The default of
        60 * 24 reproduces the previously hard-coded one-day steps.

    Returns
    -------
    `df_temporal` left-joined with the selected event columns on
    (`patientunitstayid`, `timeStep`). An event with offset `o` falls in
    step ceil(o / frecuencySample); events with a zero or negative offset
    get a step <= 0 and therefore match no row that is numbered from 1.
    '''
    # Attach the unit-admission time of each stay to every event row.
    print("# of patients:", len(df_temporal.patientunitstayid.unique()))
    data = df_temporal[['patientunitstayid', 'unitadmittime24']].drop_duplicates().reset_index(drop=True)
    df_merge = pd.merge(df, data, on='patientunitstayid', how='inner').drop_duplicates().reset_index(drop=True)
    print("# of patients:", len(df_merge.patientunitstayid.unique()))

    # Event timestamp = admission time + offset, computed column-wise rather
    # than with a Python-level loop over rows; a missing offset gives NaT.
    df_merge['unitadmittime24'] = pd.to_datetime(df_merge['unitadmittime24'])
    df_merge['date_event'] = df_merge['unitadmittime24'] + \
        pd.to_timedelta(df_merge[offsetfeature], unit='min')

    # date_event - unitadmittime24 is exactly the offset, so the step index
    # follows directly from the offset in minutes.
    df_merge['timeStep'] = np.ceil(df_merge[offsetfeature] / frecuencySample)

    # Select the most important features
    df_merge = df_merge[importantKeys]
    print("# of patients finally:", len(df_merge.patientunitstayid.unique()), "- Dimensiones of dataset to add:", df_merge.shape)

    # Merge the initial temporal dataset with the new features
    df_temporal_f1 = pd.merge(df_temporal, df_merge, on=['patientunitstayid','timeStep'], how='left')
    print("Old dimensiones of df_temporal:", df_temporal.shape)
    print("New dimensiones of df_temporal:", df_temporal_f1.shape)

    return df_temporal_f1




In [None]:
# Width of one time step in minutes (60*24 = one day).
frecuencySample = 60*24
# Print shapes and summary statistics while the tables are built.
debug = True
# Restrict the analysis to a subset of stays. The cells below read this flag
# but no visible cell assigned it, so a fresh-kernel run failed with a
# NameError.
# NOTE(review): True is assumed because the original cohort cell only runs
# with the filter enabled — confirm the intended default.
filterPat = True

### Step 1: Generate time dimension

In [None]:
# Timing columns of the patient table (all offsets are in minutes and are
# measured from the unit admit time):
# 'hospitaladmittime24': time in 24 hour format of the hospital admit
#     started_hosp
# 'hospitaladmitoffset': number of minutes from unit admit time that the patient was admitted to the hospital
#     timeToStarted_icu - time hospital admission to icu admission
# 'hospitaldischargetime24': time in 24 hour format of when the hospital discharge event occurred
# 'hospitaldischargeoffset': number of minutes from unit admit time that the patient was discharged from the hospital
# 'unitAdmitTime24': time in 24 hour format of when the unit admit event occurred
#     started_icu
# 'unitdischargetime24': time in 24 hour format of when the unit discharge event occurred
#     ended_icu
# 'unitdischargeoffset': number of minutes from unit admit time that the patient was discharged from the unit
#     timeToEnded_icu
# 'hospitaldischargeyear': year of the hospital discharge date
if debug:
    print("# of patients:", df_pat.shape[0])
    print("Some statistics (median-minutes):")
    print("\t Time in-H to in-ICU:", np.abs(df_pat.hospitaladmitoffset).median())
    print("\t Time in-ICU to out-ICU:", df_pat.unitdischargeoffset.median())
    # Because every offset is relative to unit admission, the time between
    # two events is the difference of their offsets.
    timeto_inH_outICU = df_pat.unitdischargeoffset - df_pat.hospitaladmitoffset
    timeto_outICU_outH = df_pat.hospitaldischargeoffset - df_pat.unitdischargeoffset
    timeto_inH_outH = df_pat.hospitaldischargeoffset - df_pat.hospitaladmitoffset
    print("\t Time out-ICU to out-H:", timeto_outICU_outH.median())
    print("\t Time in-H to out-H:", timeto_inH_outH.median())

df_to_work = df_pat[['patientunitstayid','hospitaladmittime24',
        'hospitaladmitoffset', 
        'hospitaldischargetime24', 
        'hospitaldischargeoffset',
        'unitadmittime24', 
        'unitdischargetime24',
        'unitdischargeoffset',
        'hospitaldischargeyear',
         'gender', 'age', 'sepsis_diag'
       ]]

# Filter the number of patients: keep stays longer than one time step and,
# of those, only the first 100 distinct stays.
if filterPat:
    df_to_work_filt = df_to_work[df_to_work.unitdischargeoffset > frecuencySample]
    df_to_work = df_to_work_filt[df_to_work_filt.patientunitstayid.isin(df_to_work_filt.patientunitstayid.unique()[0:100])]
# Report the frame that is used below; it exists whether or not the filter ran.
print("# of patients (after filter):", df_to_work.shape[0])

# Started ICU: parse the admission time. assign() builds a new frame from
# df_to_work itself instead of writing into a slice of another frame.
df_to_work = df_to_work.assign(unitadmittime24=pd.to_datetime(df_to_work['unitadmittime24']))
# synchronization: in-ICU to out-ICU
df_temporal = createTimeDataFrame(df_to_work, frecuencySample, 'unitdischargeoffset')
print("# of patients (after filter - temporal dataframe):", len(df_temporal.patientunitstayid.unique()))
print("Dimensions:", df_temporal.shape)

# Drop static features of temporal dataset
df_temporal_f0 = df_temporal.drop(['age', 'gender'], axis=1)

### Patient information

The `carePlanInfectiousDisease` table provides:

- Active infection at ICU discharge: `activeUponDischarge`
- Number of minutes from ICU admission to infection detection: `cplInfectDiseaseOffset`
- Site of infection: `infectDiseaseSite`
- Probability of infection: `infectDiseaseAssessment`

This information is important, but the table does not contain enough data to be used here.

In [None]:
# Load the care-plan infectious-disease table to inspect its size.
df_inf = pd.read_csv("../datasets/physionet-eicu-crb/carePlanInfectiousDisease.csv")
# (rows, columns) of the loaded table
df_inf.shape

### LAB FEATURES

In [None]:
# Load the lab results and, when the cohort filter is on, keep only the stays
# that are present in the temporal frame.
df_lab = pd.read_csv("../datasets/physionet-eicu-crb/lab.csv")
if filterPat:
    df_lab = df_lab.loc[
        df_lab['patientunitstayid'].isin(df_temporal_f0['patientunitstayid'].unique())
    ]

# Lab columns to attach to each (stay, time step) row.
keys = ['patientunitstayid','labname', 'labresult','labresulttext', 'labmeasurenamesystem', 'timeStep']
df_temporal_f1 = addTemporalData(df_lab, df_temporal_f0,
                                 offsetfeature='labresultoffset', importantKeys=keys)
df_temporal_f1.columns

In [None]:
# Display the temporal frame returned by the lab-feature merge.
df_temporal_f1

### MICRO LAB FEATURES

In [None]:
# It's important but we don't have enough data.
# Load the microbiology lab table to inspect its size.
df_mlab = pd.read_csv("../datasets/physionet-eicu-crb/microLab.csv")
# (rows, columns) of the loaded table
df_mlab.shape

### RESPIRATORY CARE

In [None]:
# # There are problems with ventendoffset
# df_vent = pd.read_csv("../datasets/physionet-eicu-crb/respiratoryCare.csv")
# df_vent.keys()

# # 'ventstartoffset': number of minutes from unit admit time that the vent was started
# # 'ventendoffset': number of minutes from unit admit time that the vent was ended
# # 'priorVentStartTime24': time in 24 hour format of when the prior vent start event occurred
# df_vent['ventendoffset'].value_counts()

### DIAGNOSIS

In [None]:
# Load the diagnosis events and, when the cohort filter is on, keep only the
# stays that are present in the temporal frame.
df_diag = pd.read_csv("../datasets/physionet-eicu-crb/diagnosis.csv")
if filterPat:
    df_diag = df_diag.loc[
        df_diag['patientunitstayid'].isin(df_temporal_f0['patientunitstayid'].unique())
    ]
print("# of patients:", df_diag['patientunitstayid'].unique().size)
print(df_diag.columns)

# Diagnosis columns to attach to each (stay, time step) row.
keys = ['patientunitstayid','diagnosisoffset', 'diagnosisstring', 'icd9code', 'diagnosispriority', 'timeStep']
df_temporal_f2 = addTemporalData(df_diag, df_temporal_f1,
                                 offsetfeature='diagnosisoffset', importantKeys=keys)
df_temporal_f2.columns

### INFUSION DRUG

In [None]:
# df_drug= pd.read_csv("../datasets/physionet-eicu-crb/infusionDrug.csv")
# if filterPat:
#     df_drug = df_drug[df_drug.patientunitstayid.isin(df_temporal_f0.patientunitstayid.unique())]
# print("# of patients:", len(df_drug.patientunitstayid.unique()))
# print(df_drug.keys())

### VITAL APERIODIC & VITAL PERIODIC

In [None]:
# Vital APERIODIC measurements. The original cell loaded vitalPeriodic.csv
# with the periodic column list, so the "_va" features were a second copy of
# the periodic ones; the aperiodic table and its columns are used instead.
df_va = pd.read_csv("../datasets/physionet-eicu-crb/vitalAperiodic.csv", low_memory=False)

if filterPat:
    df_va = df_va[df_va.patientunitstayid.isin(df_temporal_f0.patientunitstayid.unique())]

print("# of patients:", len(df_va.patientunitstayid.unique()))
print(df_va.keys())

# Measurement columns of the aperiodic table; they get a "_va" suffix so they
# stay distinguishable from the periodic ("_vp") features.
columnas = ['observationoffset',
       'noninvasivesystolic', 'noninvasivediastolic', 'noninvasivemean',
       'paop', 'cardiacoutput', 'cardiacinput', 'svr', 'svri', 'pvr', 'pvri']
df_va = df_va.rename(columns={columna: columna + '_va' for columna in columnas})

keys = ['patientunitstayid'] + [columna + '_va' for columna in columnas] + ['timeStep']

df_temporal_f3 = addTemporalData(df_va, df_temporal_f2, 'observationoffset_va', keys)

# Per-stay timing columns carried through the aggregation. Each name is
# listed once: repeated names produce duplicate-named columns in the result.
# NOTE(review): columns of df_temporal_f2 that are not listed here (lab,
# diagnosis, sepsis_diag) are not carried into df_temporal_f3, as in the
# original cell — confirm that this is intended.
keys_temp = ['patientunitstayid', 'hospitaladmittime24', 'hospitaladmitoffset',
       'hospitaldischargetime24', 'hospitaldischargeoffset', 'unitadmittime24',
       'unitdischargetime24', 'unitdischargeoffset', 'hospitaldischargeyear',
       'timeStep']

# One row per (stay, time step): the timing columns plus the median of every
# aperiodic measurement observed within that step.
temp_data = df_temporal_f3[keys_temp].drop_duplicates().reset_index(drop=True)
median_data = df_temporal_f3[keys].groupby(["patientunitstayid", "timeStep"]).median().reset_index()

merged_df = pd.merge(temp_data, median_data, on=['patientunitstayid', 'timeStep'])
df_temporal_f3 = merged_df.copy()

In [None]:
# Vital PERIODIC measurements.
df_vp = pd.read_csv("../datasets/physionet-eicu-crb/vitalPeriodic.csv")
if filterPat:
    df_vp = df_vp[df_vp.patientunitstayid.isin(df_temporal_f0.patientunitstayid.unique())]
print("# of patients:", len(df_vp.patientunitstayid.unique()))
print(df_vp.keys())

# Measurement columns of the periodic table; they get a "_vp" suffix.
columnas = ['observationoffset',
       'temperature', 'sao2', 'heartrate', 'respiration', 'cvp', 'etco2',
       'systemicsystolic', 'systemicdiastolic', 'systemicmean', 'pasystolic',
       'padiastolic', 'pamean', 'st1', 'st2', 'st3', 'icp']
df_vp = df_vp.rename(columns={columna: columna + '_vp' for columna in columnas})

keys = ['patientunitstayid'] + [columna + '_vp' for columna in columnas] + ['timeStep']

df_temporal_f4 = addTemporalData(df_vp, df_temporal_f3, 'observationoffset_vp', keys)

# Keep a single column per name, in case the incoming temporal frame carries
# repeated column names.
df_temporal_f4 = df_temporal_f4.loc[:, ~df_temporal_f4.columns.duplicated()]

# Per-stay timing columns carried through the aggregation (each listed once).
keys_temp = ['patientunitstayid', 'hospitaladmittime24', 'hospitaladmitoffset',
       'hospitaldischargetime24', 'hospitaldischargeoffset', 'unitadmittime24',
       'unitdischargetime24', 'unitdischargeoffset', 'hospitaldischargeyear',
       'timeStep']

temp_data = df_temporal_f4[keys_temp].drop_duplicates().reset_index(drop=True)

# Every column that is not a timing column is a measurement to aggregate.
# Selecting by name replaces the positional slice `[15:]`, which skipped the
# first measurement columns whenever the column layout differed from the one
# it was written for.
keys = ['timeStep', 'patientunitstayid'] + \
    [columna for columna in df_temporal_f4.columns if columna not in keys_temp]
median_data = df_temporal_f4[keys].groupby(["patientunitstayid", "timeStep"]).median().reset_index()

merged_df = pd.merge(temp_data, median_data, on=['patientunitstayid', 'timeStep'])
df_temporal_f4 = merged_df.copy()

In [None]:
# Column names of the temporal frame after the vital-sign merges.
df_temporal_f4.keys()

### Missing values

In [None]:
# Number of missing (NaN) entries in each column of the temporal frame.
total_missing = df_temporal_f4.isna().sum()
print(total_missing)

### Step 2: Static features
#### Apache score

In [None]:
# APACHE APS variables: feature values recorded at the time the APACHE value
# was registered.
df_apachePatientResult = pd.read_csv("../datasets/physionet-eicu-crb/apacheApsVar.csv")

# One row per stay of the temporal data, with its demographic attributes.
df_aux = df_temporal[['patientunitstayid', 'gender', 'age']].drop_duplicates().reset_index(drop=True)
print("# of patients temporal data:", df_aux['patientunitstayid'].unique().size)

# Keep only the stays that are present in the temporal data.
df_apachePatientResult = df_apachePatientResult.loc[
    df_apachePatientResult['patientunitstayid'].isin(df_aux['patientunitstayid'].unique())
]
print("# of patients df_apachePatientResult:", df_apachePatientResult['patientunitstayid'].unique().size)

# Join the APACHE variables with the demographics.
df_static = (
    df_apachePatientResult
    .merge(df_aux, on='patientunitstayid', how='inner')
    .drop_duplicates()
    .reset_index(drop=True)
)
print("# of patients after merge:", df_static['patientunitstayid'].unique().size)
df_static.columns

## Rename feature

### Temporal dataset

In [None]:
# drop_features = ['hospitaladmittime24', 'hospitaladmitoffset',
#        'hospitaldischargetime24', 'hospitaldischargeoffset', 'unitadmittime24',
#        'unitdischargetime24', 'unitdischargeoffset', 'hospitaldischargetime24',
#        'hospitaldischargeoffset', 'hospitaldischargeyear']

# rename_features_fv = rename_features.drop(drop_features, axis=1) 

# rename_features = {'patientunitstayid':'id', ' 'labresult', 'labresulttext', 'labmeasurenamesystem',
#        'diagnosisoffset', 'diagnosisstring', 'icd9code', 'diagnosispriority'}

# df_temporal_f2.rename(columns=rename_features, inplace=True)

### Static dataset

In [None]:
# Shorter names for the static features.
rename_features = {'patientunitstayid': 'id',
       'temperature':'temp',
       'respiratoryrate':'resp',
       'sodium':'na', 'heartrate':'hr',
       'creatinine':'crea', 'albumin':'alb', 
       'glucose':'glu', 'bilirubin':'bili',
       'gender':'sex'}
# The table's own row identifier is not a feature.
drop_features = ['apacheapsvarid']

# Build the final static frame in one step; df_static itself is left untouched.
df_static_fv = df_static.drop(columns=drop_features).rename(columns=rename_features)
df_static_fv.columns