In [14]:
import pandas as pd

In [15]:
# Read the five CSV inputs (paths are relative to the working directory),
# in the same order as the names on the left-hand side.
patient_df, apache_df, diagnosis_df, lab_df, treatment_df = map(
    pd.read_csv,
    (
        "patient_cleaned.csv",
        "processed_apacheApsVar.csv",
        "processed_diagnosis.csv",
        "processed_lab.csv",
        "processed_treatment.csv",
    ),
)


In [16]:
# Keep a single row per patientunitstayid in every table.
# drop_duplicates retains the first occurrence of each key and discards the rest.
patient_df, apache_df, diagnosis_df, lab_df, treatment_df = (
    table.drop_duplicates(subset="patientunitstayid")
    for table in (patient_df, apache_df, diagnosis_df, lab_df, treatment_df)
)

In [17]:
# Merge datasets one by one on patientunitstayid.
# Each join is a left join, so every row of patient_df is kept and stays that
# have no match in a feature table receive NaN for that table's columns.
# validate="one_to_one" makes pandas raise a MergeError if the key is not unique
# on either side, instead of silently multiplying patient rows.
merged_df = (
    patient_df
    .merge(apache_df, on="patientunitstayid", how="left", validate="one_to_one")
    .merge(diagnosis_df, on="patientunitstayid", how="left", validate="one_to_one")
    .merge(lab_df, on="patientunitstayid", how="left", validate="one_to_one")
    .merge(treatment_df, on="patientunitstayid", how="left", validate="one_to_one")
)

In [18]:
# Preview the first five rows of the merged table.
merged_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,patientunitstayid,gender,age,ethnicity,apacheadmissiondx,unittype,unitadmittime24,unitadmitsource,unitstaytype,...,Hgb,WBC x 1000,albumin,anion gap,creatinine,pH,platelets x 1000,total bilirubin,treatmentstring,has_Vasopressor
0,0,141168,Female,70,Caucasian,"Rhythm disturbance (atrial, supraventricular)",Med-Surg ICU,15:54:00,Direct Admit,admit,...,11.4,19.8,3.0,25.0,260.78,7.2,213.0,88.92,,
1,3,141194,Male,68,Caucasian,"Sepsis, renal/UTI (including bladder)",CTICU,07:18:00,Floor,admit,...,8.4,4.4,2.3,11.0,154.7,7.31,139.0,6.84,,
2,5,141197,Male,71,Caucasian,"Sepsis, pulmonary",Med-Surg ICU,20:46:00,Emergency Department,admit,...,11.5,29.5,2.9,15.0,82.212,,589.0,5.13,,
3,6,141203,Female,77,Caucasian,"Arrest, respiratory (without cardiac arrest)",Med-Surg ICU,20:39:00,Floor,admit,...,11.3,12.7,1.4,15.0,49.504,7.45,557.0,6.84,,
4,7,141208,Female,25,Caucasian,"Overdose, sedatives, hypnotics, antipsychotics...",Med-Surg ICU,11:24:00,Emergency Department,admit,...,14.3,11.1,4.7,16.0,70.72,,440.0,11.97,,


In [21]:
# Columns kept for the final dataset, in output order.
columns = [
    # identifiers and demographics
    "patientunitstayid",
    "gender",
    "age",
    "ethnicity",
    # admission and discharge
    "admission_type",
    "unitdischargestatus",
    "unitdischargeoffset",
    # diagnosis flags
    "has_COPD",
    "has_Diabetes",
    "has_Metastasis",
    # APACHE variables and treatment flag
    "GCS",
    "vent",
    "has_Vasopressor",
    # laboratory values
    "WBC x 1000",
    "Hgb",
    "platelets x 1000",
    "total bilirubin",
    "creatinine",
    "albumin",
    "pH",
    "anion gap",
]

In [22]:
# Restrict the merged table to the selected columns, in the listed order.
# A name in `columns` that is absent from the frame raises KeyError.
merged_df = merged_df.loc[:, columns]

In [23]:
# Count the missing (NaN) entries in each column.
is_missing = merged_df.isna()
missing_values = is_missing.sum()

In [24]:
# Display the per-column missing-value counts held in missing_values.
missing_values

patientunitstayid          0
gender                     0
age                        0
ethnicity                  0
admission_type             0
unitdischargestatus        0
unitdischargeoffset        0
has_COPD                6760
has_Diabetes            6760
has_Metastasis          6760
GCS                    10174
vent                   10174
has_Vasopressor        26643
WBC x 1000              5529
Hgb                     5769
platelets x 1000        8001
total bilirubin        51276
creatinine              4729
albumin                43824
pH                     95183
anion gap              39456
dtype: int64

In [25]:
# Drop columns with more than 50% missing values.
# The missing fraction is recomputed from the current frame so this cell does
# not depend on a `missing_values` result left over from an earlier cell.
MAX_MISSING_FRACTION = 0.5
missing_fraction = merged_df.isna().mean()  # share of NaN entries per column
columns_to_drop = missing_fraction[missing_fraction > MAX_MISSING_FRACTION].index
merged_df = merged_df.drop(columns=columns_to_drop)


In [26]:
# Recount the missing entries per column now that sparse columns are gone,
# and display the result.
missing_values = merged_df.isna().sum()
missing_values

patientunitstayid          0
gender                     0
age                        0
ethnicity                  0
admission_type             0
unitdischargestatus        0
unitdischargeoffset        0
has_COPD                6760
has_Diabetes            6760
has_Metastasis          6760
GCS                    10174
vent                   10174
has_Vasopressor        26643
WBC x 1000              5529
Hgb                     5769
platelets x 1000        8001
total bilirubin        51276
creatinine              4729
albumin                43824
anion gap              39456
dtype: int64

In [27]:
# Remove every row that has a missing value in any remaining column.
# This includes rows whose NaN came from an unmatched left join.
merged_df = merged_df.dropna(axis=0, how="any")

In [28]:
# Recount the missing entries per column after the row drop and display them.
missing_values = merged_df.isna().sum()
missing_values

patientunitstayid      0
gender                 0
age                    0
ethnicity              0
admission_type         0
unitdischargestatus    0
unitdischargeoffset    0
has_COPD               0
has_Diabetes           0
has_Metastasis         0
GCS                    0
vent                   0
has_Vasopressor        0
WBC x 1000             0
Hgb                    0
platelets x 1000       0
total bilirubin        0
creatinine             0
albumin                0
anion gap              0
dtype: int64

In [29]:
# Number of rows left in the cleaned table.
merged_df.shape[0]

79461

In [30]:
# Save the final merged dataset
merged_df.to_csv("final_merged_dataset.csv", index=False)


In [31]:
# Number of distinct patientunitstayid values in the cleaned table.
# dropna=False counts a NaN key as one value, matching len(Series.unique()).
merged_df["patientunitstayid"].nunique(dropna=False)

79461