In [None]:
import pandas as pd

# Read the gzip-compressed patient table and reduce it to one row per
# patienthealthsystemstayid.
patient_file = r"E:\EICU\eicu-collaborative-research-database-2.0\patient.csv.gz"
patient_df = (
    pd.read_csv(patient_file, compression="gzip")
    # Within each patienthealthsystemstayid the row with the smallest
    # hospitaladmitoffset is the one that survives the de-duplication below.
    # NOTE(review): confirm that the smallest offset is the unit stay you intend to keep.
    .sort_values(by=['patienthealthsystemstayid', 'hospitaladmitoffset'])
    .drop_duplicates(subset=['patienthealthsystemstayid'], keep='first')
)

# Read the gzip-compressed diagnosis table; it is filtered in the following cells.
diagnosis_file = r"E:\EICU\eicu-collaborative-research-database-2.0\diagnosis.csv.gz"
diagnosis_df = pd.read_csv(diagnosis_file, compression="gzip")


In [None]:
# Diagnosis codes treated as heart failure. They are compared against the
# comma-separated codes stored in diagnosis_df['icd9code'].
# Every literal must be followed by a comma: two adjacent string literals are
# silently concatenated by Python into a single (wrong) code.
# NOTE(review): '428.0' is not listed (only the bare '428'); confirm whether it
# should be part of the definition.
HF_ICD_CODES = {
    # ICD-9 codes for heart failure
    '402.01', '402.91', '404.91', '404.93',
    '428', '428.1', '428.2', '428.21', '428.22', '428.23',
    '428.3', '428.31', '428.32', '428.33', '428.4', '428.41', '428.42', '428.43', '428.9',

    # ICD-10 codes for heart failure (written without the dot)
    'I509', 'I110', 'I132', 'I5020', 'I5021', 'I5022', 'I5023',
    'I5030', 'I5031', 'I5032', 'I5033', 'I5040', 'I5041', 'I5042', 'I5043',
}
# Discard diagnosis rows without a code, then store the remaining codes as
# strings with surrounding whitespace removed.
diagnosis_df = (
    diagnosis_df
    .dropna(subset=['icd9code'])
    .assign(icd9code=lambda frame: frame['icd9code'].astype(str).str.strip())
)

def contains_hf(icd_list):
    """Return True if a comma-separated code string contains a heart-failure code.

    Parameters
    ----------
    icd_list : str
        Codes separated by commas, e.g. ``"428.1, I50.9"``. Whitespace around
        each code is ignored. Any non-string value (``None``, ``NaN``, ...) is
        treated as "no codes" and yields ``False``.

    Returns
    -------
    bool
        True when at least one code is a member of ``HF_ICD_CODES``.

    Notes
    -----
    ``HF_ICD_CODES`` lists its ICD-10 entries without a dot (``'I509'``) but
    its ICD-9 entries with one (``'428.1'``). To recognise an ICD-10 code
    regardless of whether the data spells it ``I509`` or ``I50.9``, a code that
    starts with a letter is also looked up with its dots removed. Numeric
    (ICD-9 style) codes are only matched exactly.
    """
    if not isinstance(icd_list, str):
        return False

    for raw_code in icd_list.split(','):
        code = raw_code.strip()
        if code in HF_ICD_CODES:
            return True
        # Letter-prefixed codes: retry with the dots stripped so that a dotted
        # spelling still matches the undotted entries in HF_ICD_CODES.
        if code[:1].isalpha() and code.replace('.', '') in HF_ICD_CODES:
            return True
    return False

# Keep the diagnosis rows for which contains_hf reports a heart-failure code.
hf_row_mask = diagnosis_df['icd9code'].apply(contains_hf)
hf_diag = diagnosis_df[hf_row_mask]

In [None]:
# Number of heart-failure diagnosis rows per diagnosispriority value.
hf_diag.loc[:, 'diagnosispriority'].value_counts()

In [None]:
# Display the heart-failure diagnosis rows selected above for a visual check.
hf_diag

In [None]:
# Number of distinct patientunitstayid values with a heart-failure diagnosis row.
hf_diag.loc[:, 'patientunitstayid'].nunique()

In [None]:
# Restrict the heart-failure diagnoses to rows flagged as 'Primary'.
hf_diag = hf_diag[hf_diag['diagnosispriority'] == 'Primary']

# hf_diag may hold several rows for the same patientunitstayid; reduce the join
# key to one row per stay so the inner merge cannot multiply patient rows.
hf_stay_ids = hf_diag[['patientunitstayid']].drop_duplicates()

# Patients (one row per hospital stay, see patient_df) whose kept unit stay has
# a primary heart-failure diagnosis.
hf_df = patient_df.merge(hf_stay_ids, on='patientunitstayid', how='inner')

In [None]:
# Keep stays whose unitdischargeoffset exceeds 24 * 60
# (i.e. longer than one day, assuming the offset is in minutes — TODO confirm).
hf_df = hf_df.loc[hf_df['unitdischargeoffset'].gt(24 * 60)]

In [None]:
def clean_age(age):
    """Convert a raw age value into an integer number of years.

    Parameters
    ----------
    age : str or object
        Raw age, normally a string such as ``"67"`` or the top-coded marker
        ``">89"`` / ``"> 89"``. Non-string values are converted with ``str()``
        first; ``None`` is treated as missing.

    Returns
    -------
    int or None
        ``90`` for the top-coded marker (with or without inner whitespace),
        the integer value for a string of decimal digits, ``None`` for
        anything else (empty string, ``"nan"``, free text, ...).
    """
    if age is None:
        return None

    # Remove every whitespace character so that "> 89" and ">89" are equivalent.
    text = "".join(str(age).split())

    if text == ">89":
        return 90
    # isdecimal() only accepts characters that int() can parse.
    if text.isdecimal():
        return int(text)
    return None

# Replace the raw age strings with numeric ages (unparseable values become missing).
# assign() returns a new frame, so the filtered hf_df from the previous cell is
# not modified in place.
hf_df = hf_df.assign(age=hf_df['age'].astype(str).str.strip().apply(clean_age))

# only keep patients aged 18 and above
# The result must be stored back into hf_df, because every later cell reads
# hf_df; rows with a missing age compare as False and are removed as well.
hf_df = hf_df[hf_df['age'] >= 18]

In [None]:
# BMI = admissionweight / (admissionheight / 100) ** 2
# (the division by 100 suggests height in cm and weight in kg — TODO confirm units).
# NOTE(review): a height of 0 produces inf and a missing height produces NaN.
height_scaled = hf_df['admissionheight'].div(100)
hf_df = hf_df.assign(BMI=hf_df['admissionweight'].div(height_scaled.pow(2)))

In [None]:
# Keep only the identifier, demographic and outcome columns used downstream.
hf_df = hf_df.loc[
    :,
    [
        'patientunitstayid',
        'gender',
        'age',
        'hospitalid',
        'unitdischargeoffset',
        'unitdischargestatus',
        'hospitaldischargestatus',
        'BMI',
    ],
]

In [None]:
# Remove rows that are exact duplicates across all remaining columns.
hf_df = hf_df.drop_duplicates()

In [None]:
# Number of distinct patientunitstayid values left in the cohort.
hf_df.loc[:, 'patientunitstayid'].nunique()

In [None]:
# Load the three feature tables (read from the current working directory),
# in the same order as before: labs, vitals, vasopressor treatment.
lab_variables, vital_variables, treatment_variables = (
    pd.read_csv(csv_name)
    for csv_name in ('lab_variable.csv', 'vital_summary.csv', 'vasopressor.csv')
)

In [None]:
# Merge datasets one by one on patientunitstayid
# Merge datasets one by one on patientunitstayid
# Left joins keep every cohort row even when a feature table has no match.
merged_df = hf_df
for feature_table in (lab_variables, vital_variables, treatment_variables):
    merged_df = merged_df.merge(feature_table, on="patientunitstayid", how="left")

In [None]:
# Preview the first five merged rows.
merged_df.head(n=5)

In [None]:
# Fraction of missing cells in each row; keep rows with less than 30% missing.
missing_ratio = merged_df.isna().mean(axis="columns")
merged_df = merged_df.loc[missing_ratio.lt(0.3)]

In [None]:
# Report the size of the merged table (row count of merged_df).
print(f"Number of patients: {len(merged_df)}")

# Persist the merged table — the same frame whose size is reported above —
# rather than the pre-merge cohort frame hf_df.
output_path = "hf_merged.csv"
merged_df.to_csv(output_path, index=False)
print(f"The data has been saved to {output_path}")

In [None]:
# calculate the number and percentage of missing values
# Per-column count of missing values and the same count as a percentage of rows.
na_counts = merged_df.isna().sum()
na_percentage = na_counts.div(len(merged_df)).mul(100)

# Summary table: one row per column of merged_df.
na_summary = na_counts.to_frame(name="Missing Values")
na_summary["Missing Percentage (%)"] = na_percentage

In [None]:
# Display the per-column missing-value summary built in the previous cell.
na_summary

In [None]:
# Remove the six nipd_* summary columns (avg / min / max of systolic and diastolic).
nipd_columns = [
    'nipd_systolic_avg', 'nipd_diastolic_avg',
    'nipd_systolic_min', 'nipd_diastolic_min',
    'nipd_systolic_max', 'nipd_diastolic_max',
]
merged_df = merged_df.drop(columns=nipd_columns)

In [None]:
# Keep only rows with no missing value in any remaining column.
# NOTE(review): this runs while 'treatmentstring' is still present; because the
# treatment table was left-joined, stays without a match there have NaN in that
# column and are removed here — confirm this is intended.
merged_df = merged_df.dropna()

In [None]:
# The treatmentstring column is not kept in the final table.
merged_df = merged_df.drop(columns=['treatmentstring'])

In [None]:
# Remove rows that are exact duplicates across all columns.
merged_df = merged_df.drop_duplicates()

In [None]:
# Write the final, fully cleaned table without the index column.
final_output_path = 'hf_merged2.csv'
merged_df.to_csv(final_output_path, index=False)