# NB 2 - Add additional concepts from raw eICU data to parquet files (static data)

In [None]:
import os
import shutil

import pandas as pd
from pyarrow import parquet as pq

# Initial configuration

This notebook assumes that the raw eICU `.csv` files are available in the `data/raw/eICU` directory and that the cohort `.parquet` files are available in `data/cohorts/sepsis_eicu_robin`.

In [None]:
# Input locations (relative to this notebook): the existing cohort parquet
# files and the raw eICU csv export.
path_to_cohorts = "../data/cohorts/sepsis_eicu_robin"
path_to_raw_eicu_data = "../data/raw/eICU"

# Output location: the extended cohort is written to its own folder, separate
# from the input cohort folder.
path_to_extended_cohorts = "../data/cohorts/sepsis_eicu_extended"

# Merge `stay_id` of the cohort with `patientunitstayid` of the raw eICU patient table and add the columns 'ethnicity', 'hospitalid', 'unittype', 'hospitaladmitoffset', 'uniquepid'

In [None]:
# Build both input paths first, then load the tables.
patient_csv_path = os.path.join(path_to_raw_eicu_data, "patient.csv")
cohort_sta_path = os.path.join(path_to_cohorts, "sta.parquet")

# Raw eICU patient table, read in full from csv.
eICU_raw_patient_data = pd.read_csv(patient_csv_path)

# Cohort table from sta.parquet, read with pyarrow and converted to a DataFrame.
cohort_sta_table = pq.read_table(cohort_sta_path)
eICU_cohort_concept_data = cohort_sta_table.to_pandas()

In [None]:
# Columns taken from the raw patient table: the join key plus the attributes
# that are added to the cohort.
patient_columns = [
    'patientunitstayid',
    'ethnicity',
    'hospitalid',
    'unittype',
    'hospitaladmitoffset',
    'uniquepid',
]

# Inner join of the cohort's stay_id against patientunitstayid; the raw key
# column is dropped afterwards because it duplicates stay_id.
merged_data = eICU_cohort_concept_data.merge(
    eICU_raw_patient_data[patient_columns],
    left_on="stay_id",
    right_on="patientunitstayid",
    how="inner",
).drop(columns=['patientunitstayid'])

In [None]:
# Report the shapes of the two inputs and of the merge result, one per line,
# so the effect of the inner join on the row count can be checked.
print(
    f"The shape of the raw eICU patient table is: {eICU_raw_patient_data.shape}",
    f"The shape of the cohort concept data is: {eICU_cohort_concept_data.shape}",
    f"The shape of the merged data is {merged_data.shape}",
    sep="\n",
)
# Trailing expression so the notebook renders the merged table.
merged_data

# Save merged data with extended columns to parquet file

In [None]:

# Create the output directory (including missing parents) if it is not there
# yet. exist_ok=True replaces the separate os.path.exists() check, which was a
# check-then-create race, and makes the call a no-op when the directory
# already exists.
os.makedirs(path_to_extended_cohorts, exist_ok=True)

# Write the merged table (cohort columns plus the added patient columns) as
# the extended cohort's sta.parquet.
merged_data.to_parquet(os.path.join(path_to_extended_cohorts, "sta.parquet"))

# Copy unmodified cohort files to the extended cohorts folder

In [None]:
# dyn.parquet is not modified by this notebook; copy it as-is so the extended
# cohort folder holds it under the same file name.
dyn_filename = "dyn.parquet"
source_path_dyn = os.path.join(path_to_cohorts, dyn_filename)
destination_path_dyn = os.path.join(path_to_extended_cohorts, dyn_filename)
shutil.copyfile(source_path_dyn, destination_path_dyn)


In [None]:
# out.parquet is not modified by this notebook; copy it as-is so the extended
# cohort folder holds it under the same file name.
out_filename = "out.parquet"
source_path_out = os.path.join(path_to_cohorts, out_filename)
destination_path_out = os.path.join(path_to_extended_cohorts, out_filename)
shutil.copyfile(source_path_out, destination_path_out)