In [1]:
import pandas as pd
from loguru import logger
from pathlib import Path

# Set pandas columns width to 100
pd.set_option('display.max_colwidth', 100)
pd.set_option('display.max_columns', 100)

In [3]:
# Path to all processed data
PROCESS_DIR = "../data/processed/hosp"

all_processed_csv = list(Path(PROCESS_DIR).glob("*.csv"))
all_processed_csv

[PosixPath('../data/processed/hosp/diagnosis_feat_df.csv'),
 PosixPath('../data/processed/hosp/labs_feature_df.csv'),
 PosixPath('../data/processed/hosp/prescriptions_feat_df.csv'),
 PosixPath('../data/processed/hosp/procedures_feat_df.csv'),
 PosixPath('../data/processed/hosp/temporal_labs_feature_df.csv'),
 PosixPath('../data/processed/hosp/time_to_death_df.csv')]

In [4]:
# Read all processed data
all_processed_data = {}
for file in all_processed_csv:
    # Read the CSV file
    df = pd.read_csv(file)
    
    # Get the name of the file without the extension
    file_name = file.stem
    
    # Store the DataFrame in the dictionary
    all_processed_data[file_name] = df
logger.info(f"Read {len(all_processed_data)} processed data files.")

[32m2025-04-21 19:15:32.638[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m12[0m - [1mRead 6 processed data files.[0m


In [5]:
# Observe all processed data for a given subject id
subject_id = 10001884

# Filter the data for the given subject id
filtered_data = {}
for file_name, df in all_processed_data.items():
    # Filter the DataFrame for the given subject id
    filtered_df = df[df['subject_id'] == subject_id]
    
    # Store the filtered DataFrame in the dictionary
    filtered_data[file_name] = filtered_df
logger.info(f"Filtered data for subject id {subject_id}.")

[32m2025-04-21 19:15:37.093[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m12[0m - [1mFiltered data for subject id 10001884.[0m


In [8]:
# View all filtered data
for file_name, df in filtered_data.items():
    print(f"\nData from {file_name}:")
    # display(df.head())
    display(df.columns.to_list()) 


Data from diagnosis_feat_df:


['subject_id',
 'hadm_id',
 'count_prior_admissions',
 'count_unique_diagnoses_prior',
 'avg_diagnoses_per_prior_admission',
 'time_since_last_admission_days',
 'admission_frequency_last_year',
 'flag_history_CHF_ICD_CODES',
 'flag_history_DIABETES_ICD_CODES',
 'flag_history_CKD_ICD_CODES',
 'flag_history_CANCER_ICD_CODES',
 'flag_history_COPD_ICD_CODES',
 'flag_history_LIVER_DISEASE_ICD_CODES',
 'flag_history_MI_ICD_CODES',
 'flag_history_STROKE_ICD_CODES',
 'flag_history_SEPSIS_ICD_CODES',
 'flag_history_AKI_ICD_CODES',
 'count_prior_admissions_with_CHF_ICD_CODES',
 'time_since_first_diagnosis_CHF_ICD_CODES_years',
 'count_prior_admissions_with_DIABETES_ICD_CODES',
 'time_since_first_diagnosis_DIABETES_ICD_CODES_years',
 'count_prior_admissions_with_CKD_ICD_CODES',
 'time_since_first_diagnosis_CKD_ICD_CODES_years',
 'count_prior_admissions_with_CANCER_ICD_CODES',
 'time_since_first_diagnosis_CANCER_ICD_CODES_years',
 'count_prior_admissions_with_COPD_ICD_CODES',
 'time_since_first_di


Data from labs_feature_df:


['subject_id',
 'count_prior_labevents',
 'count_unique_labs_tested_prior',
 'albumin_prior_avg',
 'albumin_prior_min',
 'albumin_prior_max',
 'albumin_prior_std',
 'bicarbonate_prior_avg',
 'bicarbonate_prior_min',
 'bicarbonate_prior_max',
 'bicarbonate_prior_std',
 'creatinine_prior_avg',
 'creatinine_prior_min',
 'creatinine_prior_max',
 'creatinine_prior_std',
 'potassium_prior_avg',
 'potassium_prior_min',
 'potassium_prior_max',
 'potassium_prior_std',
 'sodium_prior_avg',
 'sodium_prior_min',
 'sodium_prior_max',
 'sodium_prior_std',
 'urea nitrogen_prior_avg',
 'urea nitrogen_prior_min',
 'urea nitrogen_prior_max',
 'urea nitrogen_prior_std',
 'hematocrit_prior_avg',
 'hematocrit_prior_min',
 'hematocrit_prior_max',
 'hematocrit_prior_std',
 'hemoglobin_prior_avg',
 'hemoglobin_prior_min',
 'hemoglobin_prior_max',
 'hemoglobin_prior_std',
 'glucose_prior_avg',
 'glucose_prior_min',
 'glucose_prior_max',
 'glucose_prior_std',
 'wbc_prior_avg',
 'wbc_prior_min',
 'wbc_prior_max'


Data from prescriptions_feat_df:


['subject_id',
 'count_prior_prescriptions',
 'count_unique_drugs_prior',
 'avg_drugs_per_prior_admission',
 'flag_history_on_insulin',
 'count_prior_admissions_on_insulin',
 'flag_history_on_diuretics',
 'count_prior_admissions_on_diuretics',
 'flag_history_on_anticoagulants',
 'count_prior_admissions_on_anticoagulants',
 'flag_history_on_steroids',
 'count_prior_admissions_on_steroids',
 'flag_history_on_chemotherapy',
 'count_prior_admissions_on_chemotherapy',
 'flag_on_steroids_last_prior_admission']


Data from procedures_feat_df:


['subject_id',
 'hadm_id',
 'count_prior_procedures',
 'count_unique_procedures_prior',
 'count_prior_admissions_with_procedure',
 'time_since_last_major_surgery_years',
 'flag_procedure_in_last_prior_admission',
 'flag_history_major_surgery',
 'flag_history_mech_vent',
 'flag_history_dialysis',
 'flag_history_biopsy']


Data from temporal_labs_feature_df:


['subject_id',
 'window_365d_count_labevents',
 'window_365d_count_unique_labs',
 'window_365d_hematocrit_avg',
 'window_365d_hematocrit_min',
 'window_365d_hematocrit_max',
 'window_365d_hematocrit_std',
 'window_365d_hemoglobin_avg',
 'window_365d_hemoglobin_min',
 'window_365d_hemoglobin_max',
 'window_365d_hemoglobin_std',
 'window_365d_albumin_avg',
 'window_365d_albumin_min',
 'window_365d_albumin_max',
 'window_365d_albumin_std',
 'window_365d_bicarbonate_avg',
 'window_365d_bicarbonate_min',
 'window_365d_bicarbonate_max',
 'window_365d_bicarbonate_std',
 'window_365d_creatinine_avg',
 'window_365d_creatinine_min',
 'window_365d_creatinine_max',
 'window_365d_creatinine_std',
 'window_365d_potassium_avg',
 'window_365d_potassium_min',
 'window_365d_potassium_max',
 'window_365d_potassium_std',
 'window_365d_sodium_avg',
 'window_365d_sodium_min',
 'window_365d_sodium_max',
 'window_365d_sodium_std',
 'window_365d_urea nitrogen_avg',
 'window_365d_urea nitrogen_min',
 'window_36


Data from time_to_death_df:


['subject_id',
 'hadm_id',
 'admittime',
 'dischtime',
 'age',
 'gender',
 'race',
 'insurance',
 'label',
 'dod',
 'time_to_death',
 'los',
 'admission_type',
 'admission_location',
 'discharge_location']

In [20]:
# Merge all filtered data into a single DataFrame using the colunm 'subject_id'
merged_data = pd.DataFrame()
for file_name, df in filtered_data.items():
    # Merge the DataFrame with the merged_data DataFrame
    if merged_data.empty:
        merged_data = df
    else:
        merged_data = pd.merge(merged_data, df, on='subject_id', how='outer')
logger.info(f"Merged all filtered data for subject id {subject_id}.")
# Display the merged data
print("\nMerged data:")
display(merged_data.head())

[32m2025-04-21 15:34:27.408[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [1mMerged all filtered data for subject id 10001884.[0m



Merged data:


Unnamed: 0,subject_id,hadm_id_x,count_prior_admissions,count_unique_diagnoses_prior,avg_diagnoses_per_prior_admission,time_since_last_admission_days,admission_frequency_last_year,flag_history_CHF_ICD_CODES,flag_history_DIABETES_ICD_CODES,flag_history_CKD_ICD_CODES,flag_history_CANCER_ICD_CODES,flag_history_COPD_ICD_CODES,flag_history_LIVER_DISEASE_ICD_CODES,flag_history_MI_ICD_CODES,flag_history_STROKE_ICD_CODES,flag_history_SEPSIS_ICD_CODES,flag_history_AKI_ICD_CODES,count_prior_admissions_with_CHF_ICD_CODES,time_since_first_diagnosis_CHF_ICD_CODES_years,count_prior_admissions_with_DIABETES_ICD_CODES,time_since_first_diagnosis_DIABETES_ICD_CODES_years,count_prior_admissions_with_CKD_ICD_CODES,time_since_first_diagnosis_CKD_ICD_CODES_years,count_prior_admissions_with_CANCER_ICD_CODES,time_since_first_diagnosis_CANCER_ICD_CODES_years,count_prior_admissions_with_COPD_ICD_CODES,time_since_first_diagnosis_COPD_ICD_CODES_years,count_prior_admissions_with_LIVER_DISEASE_ICD_CODES,time_since_first_diagnosis_LIVER_DISEASE_ICD_CODES_years,count_prior_admissions_with_MI_ICD_CODES,time_since_first_diagnosis_MI_ICD_CODES_years,count_prior_admissions_with_STROKE_ICD_CODES,time_since_first_diagnosis_STROKE_ICD_CODES_years,count_prior_admissions_with_SEPSIS_ICD_CODES,time_since_first_diagnosis_SEPSIS_ICD_CODES_years,count_prior_admissions_with_AKI_ICD_CODES,time_since_first_diagnosis_AKI_ICD_CODES_years,count_prior_labevents,count_unique_labs_tested_prior,albumin_prior_avg,albumin_prior_min,albumin_prior_max,albumin_prior_std,bicarbonate_prior_avg,bicarbonate_prior_min,bicarbonate_prior_max,bicarbonate_prior_std,creatinine_prior_avg,creatinine_prior_min,creatinine_prior_max,...,window_7d_sodium_std,window_7d_urea nitrogen_avg,window_7d_urea nitrogen_min,window_7d_urea nitrogen_max,window_7d_urea nitrogen_std,window_7d_hematocrit_avg,window_7d_hematocrit_min,window_7d_hematocrit_max,window_7d_hematocrit_std,window_7d_hemoglobin_avg,window_7d_hemoglobin_min,window_7d_hemoglobin_max,window_7d_hemoglobin_std,window_7d_wbc_avg,window_7d_wbc_min,window_7d_wbc_max,window_7d_wbc_std,window_7d_lactate_avg,window_7d_lactate_min,window_7d_lactate_max,window_7d_lactate_std,window_7d_last_creatinine,window_7d_last_wbc,window_7d_last_hematocrit,window_7d_last_hemoglobin,window_7d_last_sodium,window_7d_last_bilirubin,window_7d_last_potassium,window_7d_last_glucose,window_7d_last_urea nitrogen,window_7d_last_albumin,window_7d_last_lactate,window_7d_last_bun,window_7d_last_bicarbonate,window_7d_count_severe_hyponatremia,window_7d_flag_chronic_anemia,hadm_id,admittime,dischtime,age,gender,race,insurance,label,dod,time_to_death,los,admission_type,admission_location,discharge_location
0,10001884,26184834,21,80,10.428571,8,13,0,0,0,0,1,0,0,0,0,1,0,,0,,0,,0,,13,5.11,0,,0,,0,,0,,1,0.03,649,11,4.233333,4.1,4.4,0.094281,30.155844,24.0,37.0,2.296609,0.925641,0.7,1.2,...,2.796912,25.315789,14.0,38.0,7.1455,29.895,22.9,39.7,5.739902,9.463636,7.0,13.0,1.950885,8.0,0.0,21.0,9.273618,1.95,1.1,4.0,0.973396,0.6,21.0,22.9,7.0,138.0,,4.2,94.0,15.0,2.6,1.2,,37.0,0,1,26184834,2131-01-07 20:39:00,2131-01-20 05:15:00,68,F,BLACK/AFRICAN AMERICAN,Medicare,1,2131-01-20 00:00:00,12.139583,12,OBSERVATION ADMIT,EMERGENCY ROOM,DIED


In [None]:
# See columns of the merged data
merged_data.columns.to_list()

['subject_id',
 'hadm_id_x',
 'count_prior_admissions',
 'count_unique_diagnoses_prior',
 'avg_diagnoses_per_prior_admission',
 'time_since_last_admission_days',
 'admission_frequency_last_year',
 'flag_history_CHF_ICD_CODES',
 'flag_history_DIABETES_ICD_CODES',
 'flag_history_CKD_ICD_CODES',
 'flag_history_CANCER_ICD_CODES',
 'flag_history_COPD_ICD_CODES',
 'flag_history_LIVER_DISEASE_ICD_CODES',
 'flag_history_MI_ICD_CODES',
 'flag_history_STROKE_ICD_CODES',
 'flag_history_SEPSIS_ICD_CODES',
 'flag_history_AKI_ICD_CODES',
 'count_prior_admissions_with_CHF_ICD_CODES',
 'time_since_first_diagnosis_CHF_ICD_CODES_years',
 'count_prior_admissions_with_DIABETES_ICD_CODES',
 'time_since_first_diagnosis_DIABETES_ICD_CODES_years',
 'count_prior_admissions_with_CKD_ICD_CODES',
 'time_since_first_diagnosis_CKD_ICD_CODES_years',
 'count_prior_admissions_with_CANCER_ICD_CODES',
 'time_since_first_diagnosis_CANCER_ICD_CODES_years',
 'count_prior_admissions_with_COPD_ICD_CODES',
 'time_since_first_

In [26]:
# Read /mnt/d/Projects/Hurdle/Time_to_death_modelling/data/processed/hosp_ttl.csv
hosp_ttl = pd.read_csv("../data/processed/hosp_ttl.csv")
hosp_ttl.head()


Unnamed: 0,subject_id,count_prior_admissions,count_unique_diagnoses_prior,avg_diagnoses_per_prior_admission,time_since_last_admission_days,admission_frequency_last_year,flag_history_CHF_ICD_CODES,flag_history_DIABETES_ICD_CODES,flag_history_CKD_ICD_CODES,flag_history_CANCER_ICD_CODES,flag_history_COPD_ICD_CODES,flag_history_LIVER_DISEASE_ICD_CODES,flag_history_MI_ICD_CODES,flag_history_STROKE_ICD_CODES,flag_history_SEPSIS_ICD_CODES,flag_history_AKI_ICD_CODES,count_prior_admissions_with_CHF_ICD_CODES,time_since_first_diagnosis_CHF_ICD_CODES_years,count_prior_admissions_with_DIABETES_ICD_CODES,time_since_first_diagnosis_DIABETES_ICD_CODES_years,count_prior_admissions_with_CKD_ICD_CODES,time_since_first_diagnosis_CKD_ICD_CODES_years,count_prior_admissions_with_CANCER_ICD_CODES,time_since_first_diagnosis_CANCER_ICD_CODES_years,count_prior_admissions_with_COPD_ICD_CODES,time_since_first_diagnosis_COPD_ICD_CODES_years,count_prior_admissions_with_LIVER_DISEASE_ICD_CODES,time_since_first_diagnosis_LIVER_DISEASE_ICD_CODES_years,count_prior_admissions_with_MI_ICD_CODES,time_since_first_diagnosis_MI_ICD_CODES_years,count_prior_admissions_with_STROKE_ICD_CODES,time_since_first_diagnosis_STROKE_ICD_CODES_years,count_prior_admissions_with_SEPSIS_ICD_CODES,time_since_first_diagnosis_SEPSIS_ICD_CODES_years,count_prior_admissions_with_AKI_ICD_CODES,time_since_first_diagnosis_AKI_ICD_CODES_years,count_prior_labevents,count_unique_labs_tested_prior,albumin_prior_avg,albumin_prior_min,albumin_prior_max,albumin_prior_std,bicarbonate_prior_avg,bicarbonate_prior_min,bicarbonate_prior_max,bicarbonate_prior_std,creatinine_prior_avg,creatinine_prior_min,creatinine_prior_max,creatinine_prior_std,...,window_7d_sodium_std,window_7d_urea nitrogen_avg,window_7d_urea nitrogen_min,window_7d_urea nitrogen_max,window_7d_urea nitrogen_std,window_7d_hematocrit_avg,window_7d_hematocrit_min,window_7d_hematocrit_max,window_7d_hematocrit_std,window_7d_hemoglobin_avg,window_7d_hemoglobin_min,window_7d_hemoglobin_max,window_7d_hemoglobin_std,window_7d_wbc_avg,window_7d_wbc_min,window_7d_wbc_max,window_7d_wbc_std,window_7d_lactate_avg,window_7d_lactate_min,window_7d_lactate_max,window_7d_lactate_std,window_7d_last_creatinine,window_7d_last_wbc,window_7d_last_hematocrit,window_7d_last_hemoglobin,window_7d_last_sodium,window_7d_last_bilirubin,window_7d_last_potassium,window_7d_last_glucose,window_7d_last_urea nitrogen,window_7d_last_albumin,window_7d_last_lactate,window_7d_last_bun,window_7d_last_bicarbonate,window_7d_count_severe_hyponatremia,window_7d_flag_chronic_anemia,hadm_id,admittime,dischtime,age,gender,race,insurance,label,dod,time_to_death,los,admission_type,admission_location,discharge_location
0,10001884,21,80,10.428571,8,13,0,0,0,0,1,0,0,0,0,1,0,,0,,0,,0,,13,5.11,0,,0,,0,,0,,1,0.03,649,11,4.233333,4.1,4.4,0.094281,30.155844,24.0,37.0,2.296609,0.925641,0.7,1.2,0.110286,...,2.796912,25.315789,14.0,38.0,7.1455,29.895,22.9,39.7,5.739902,9.463636,7.0,13.0,1.950885,8.0,0.0,21.0,9.273618,1.95,1.1,4.0,0.973396,0.6,21.0,22.9,7.0,138.0,,4.2,94.0,15.0,2.6,1.2,,37.0,0,1,26184834,2131-01-07 20:39:00,2131-01-20 05:15:00,68,F,BLACK/AFRICAN AMERICAN,Medicare,1,2131-01-20 00:00:00,12.139583,12,OBSERVATION ADMIT,EMERGENCY ROOM,DIED
1,10002155,4,41,13.5,161,1,1,0,1,0,0,0,1,1,0,1,2,1.59,0,,2,1.59,0,,0,,0,,1,1.59,1,2.61,0,,1,1.21,425,10,3.786667,3.4,4.5,0.29409,25.72,19.0,29.0,2.172924,1.750909,0.9,3.0,0.626933,...,2.061553,47.0,45.0,48.0,1.224745,22.166667,18.7,25.4,2.740235,8.05,7.5,8.6,0.55,,,,,5.05,2.4,7.7,1.967867,1.5,,25.4,8.6,125.0,,5.7,192.0,48.0,3.5,4.2,,25.0,3,1,20345487,2131-03-09 20:33:00,2131-03-10 01:55:00,80,F,WHITE,Other,1,2131-03-10 00:00:00,0.14375,0,EW EMER.,EMERGENCY ROOM,DIED
2,10003400,6,75,19.166667,137,5,1,0,1,0,0,0,0,0,0,1,2,0.65,0,,5,0.75,0,,0,,0,,0,,0,,0,,2,0.65,587,11,2.74,2.3,3.0,0.185472,25.373134,10.0,33.0,5.341574,0.757353,0.4,1.4,0.229652,...,3.330768,35.513514,27.0,41.0,3.613949,23.759574,16.0,27.2,2.232322,7.829787,5.2,9.3,0.813428,62.666667,14.0,102.0,36.527007,1.146154,0.8,1.9,0.324903,1.2,14.0,24.4,7.9,142.0,,3.9,78.0,39.0,2.3,0.8,,20.0,0,1,23559586,2137-08-04 00:07:00,2137-09-02 17:05:00,72,F,BLACK/AFRICAN AMERICAN,Medicare,1,2137-09-02 00:00:00,28.995139,29,URGENT,TRANSFER FROM HOSPITAL,DIED
3,10004401,11,99,21.090909,13,7,1,0,1,0,0,0,0,0,1,1,11,3.0,0,,3,1.77,0,,0,,0,,0,,0,,3,0.37,6,1.33,1232,11,3.333333,2.6,4.4,0.612826,25.462585,11.0,34.0,3.580507,1.498013,0.8,3.5,0.558317,...,4.26947,93.142857,74.0,119.0,13.881422,27.095238,19.8,32.8,3.024578,8.6625,6.2,10.0,1.039155,,,,,2.1,2.1,2.1,0.0,2.4,,32.3,9.8,142.0,,4.6,100.0,85.0,,2.1,,21.0,0,1,25777141,2144-06-05 19:45:00,2144-06-18 21:30:00,82,M,WHITE,Medicare,1,2144-06-18 00:00:00,12.177083,13,EW EMER.,EMERGENCY ROOM,DIED
4,10005817,1,26,26.0,744,0,1,1,0,0,1,0,1,0,0,0,1,2.06,1,2.06,0,,0,,1,2.06,0,,1,2.06,0,,0,,0,,115,11,3.2,3.2,3.2,0.0,27.0,22.0,32.0,2.828427,0.981818,0.8,1.1,0.093597,...,2.895207,88.666667,48.0,127.0,24.444949,30.65,24.0,35.0,3.373661,10.118182,8.1,11.5,0.964194,9.0,9.0,9.0,0.0,2.031818,1.4,2.9,0.412736,2.7,9.0,26.2,8.9,131.0,,3.9,161.0,102.0,2.3,1.4,,28.0,0,1,28661809,2135-01-03 21:54:00,2135-01-19 18:36:00,66,M,WHITE,Medicare,1,2135-01-19 00:00:00,15.0875,15,OBSERVATION ADMIT,TRANSFER FROM HOSPITAL,DIED
