Merge multiple datasets on patientunitstayid to create a cohort dataset.

In [2]:
import pandas as pd

In [48]:
patient_df = pd.read_csv("patient_cleaned.csv")
apache_df = pd.read_csv("processed_apacheApsVar.csv")
diagnosis_df = pd.read_csv("processed_diagnosis.csv")
lab_df = pd.read_csv("processed_lab.csv")
treatment_df = pd.read_csv("processed_treatment.csv")


In [49]:
# Drop duplicate rows based on patientunitstayid
patient_df = patient_df.drop_duplicates(subset="patientunitstayid")
apache_df = apache_df.drop_duplicates(subset="patientunitstayid")
diagnosis_df = diagnosis_df.drop_duplicates(subset="patientunitstayid")
lab_df = lab_df.drop_duplicates(subset="patientunitstayid")
treatment_df = treatment_df.drop_duplicates(subset="patientunitstayid")

In [50]:
# Merge datasets one by one on patientunitstayid
merged_df = patient_df.merge(apache_df, on="patientunitstayid", how="left") \
                      .merge(diagnosis_df, on="patientunitstayid", how="left") \
                      .merge(lab_df, on="patientunitstayid", how="left") \
                      .merge(treatment_df, on="patientunitstayid", how="left")

In [51]:
merged_df.head()

Unnamed: 0.1,Unnamed: 0,patientunitstayid,gender,age,ethnicity,hospitalid,apacheadmissiondx,unittype,unitadmittime24,unitadmitsource,...,Hgb,WBC x 1000,albumin,anion gap,creatinine,pH,platelets x 1000,total bilirubin,treatmentstring,has_Vasopressor
0,0,141168,Female,70,Caucasian,59,"Rhythm disturbance (atrial, supraventricular)",Med-Surg ICU,15:54:00,Direct Admit,...,11.4,19.8,3.0,25.0,260.78,7.2,213.0,88.92,,
1,3,141194,Male,68,Caucasian,73,"Sepsis, renal/UTI (including bladder)",CTICU,07:18:00,Floor,...,8.4,4.4,2.3,11.0,154.7,7.31,139.0,6.84,,
2,5,141197,Male,71,Caucasian,67,"Sepsis, pulmonary",Med-Surg ICU,20:46:00,Emergency Department,...,11.5,29.5,2.9,15.0,82.212,,589.0,5.13,,
3,6,141203,Female,77,Caucasian,66,"Arrest, respiratory (without cardiac arrest)",Med-Surg ICU,20:39:00,Floor,...,11.3,12.7,1.4,15.0,49.504,7.45,557.0,6.84,,
4,7,141208,Female,25,Caucasian,73,"Overdose, sedatives, hypnotics, antipsychotics...",Med-Surg ICU,11:24:00,Emergency Department,...,14.3,11.1,4.7,16.0,70.72,,440.0,11.97,,


In [52]:
columns = ["patientunitstayid", "gender", "age","ethnicity","hospitalid","admission_type","unitdischargestatus","unitdischargeoffset","has_COPD","has_Diabetes","has_Metastasis","GCS","vent","has_Vasopressor","WBC x 1000","Hgb","platelets x 1000","total bilirubin","creatinine","albumin","pH","anion gap"]

In [53]:
merged_df = merged_df[columns]

In [54]:
# Check for missing values
missing_values = merged_df.isnull().sum()

In [55]:
missing_values

patientunitstayid          0
gender                     0
age                        0
ethnicity                  0
hospitalid                 0
admission_type             0
unitdischargestatus        0
unitdischargeoffset        0
has_COPD                6760
has_Diabetes            6760
has_Metastasis          6760
GCS                    10174
vent                   10174
has_Vasopressor        26643
WBC x 1000              5529
Hgb                     5769
platelets x 1000        8001
total bilirubin        51276
creatinine              4729
albumin                43824
pH                     95183
anion gap              39456
dtype: int64

In [56]:
# Drop columns with more than 50% missing values
columns_to_drop = missing_values[missing_values > 0.5 * len(merged_df)].index
merged_df = merged_df.drop(columns=columns_to_drop)


In [57]:
# Check for missing values again
missing_values = merged_df.isnull().sum()
missing_values

patientunitstayid          0
gender                     0
age                        0
ethnicity                  0
hospitalid                 0
admission_type             0
unitdischargestatus        0
unitdischargeoffset        0
has_COPD                6760
has_Diabetes            6760
has_Metastasis          6760
GCS                    10174
vent                   10174
has_Vasopressor        26643
WBC x 1000              5529
Hgb                     5769
platelets x 1000        8001
total bilirubin        51276
creatinine              4729
albumin                43824
anion gap              39456
dtype: int64

In [58]:
# drop rows with missing values
merged_df = merged_df.dropna()

In [59]:
# Check for missing values again
missing_values = merged_df.isnull().sum()
missing_values

patientunitstayid      0
gender                 0
age                    0
ethnicity              0
hospitalid             0
admission_type         0
unitdischargestatus    0
unitdischargeoffset    0
has_COPD               0
has_Diabetes           0
has_Metastasis         0
GCS                    0
vent                   0
has_Vasopressor        0
WBC x 1000             0
Hgb                    0
platelets x 1000       0
total bilirubin        0
creatinine             0
albumin                0
anion gap              0
dtype: int64

In [60]:
len(merged_df)

79461

In [61]:
# Save the final merged dataset
merged_df.to_csv("final_merged_dataset.csv", index=False)


In [62]:
len(merged_df.patientunitstayid.unique())

79461

### encoding

In [63]:
df = pd.read_csv("final_merged_dataset.csv")

category_mapping = {
    'platelets_category': {
        'bins': [-float('inf'), 50, 100, float('inf')],
        'labels': [0, 1, 2]
    },
    'GCS_category': {
        'bins': [-float('inf'), 9, 15],
        'labels': [0, 1]
    },
    'total_bilirubin_category': {
        'bins': [-float('inf'), 101, float('inf')],
        'labels': [0, 1]
    },
    'creatinine_category': {
        'bins': [-float('inf'), 110, 300, float('inf')],
        'labels': [0, 1, 2]
    }
}

df['platelets_category'] = pd.cut(df['platelets x 1000'], bins=category_mapping['platelets_category']['bins'], labels=category_mapping['platelets_category']['labels'])
df['GCS_category'] = pd.cut(df['GCS'], bins=category_mapping['GCS_category']['bins'], labels=category_mapping['GCS_category']['labels'])
df['total_bilirubin_category'] = pd.cut(df['total bilirubin'], bins=category_mapping['total_bilirubin_category']['bins'], labels=category_mapping['total_bilirubin_category']['labels'])
df['creatinine_category'] = pd.cut(df['creatinine'], bins=category_mapping['creatinine_category']['bins'], labels=category_mapping['creatinine_category']['labels'])

df = df.drop(columns=['platelets x 1000', 'GCS', 'total bilirubin', 'creatinine'])
df.to_csv("final_merged_dataset_categorized.csv", index=False,encoding='utf-8')

In [65]:
df.head()

Unnamed: 0,patientunitstayid,gender,age,ethnicity,hospitalid,admission_type,unitdischargestatus,unitdischargeoffset,has_COPD,has_Diabetes,...,vent,has_Vasopressor,WBC x 1000,Hgb,albumin,anion gap,platelets_category,GCS_category,total_bilirubin_category,creatinine_category
0,242070,Female,68,Caucasian,79,Elective,Alive,1481,0.0,0.0,...,0.0,0.0,8.0,14.5,3.2,12.0,2,1,0,0
1,242082,Male,75,Caucasian,108,Emergency,Alive,2752,0.0,0.0,...,0.0,0.0,7.18,13.1,4.1,11.0,2,1,0,0
2,242083,Female,90,Caucasian,92,Emergency,Alive,1546,0.0,0.0,...,0.0,0.0,10.0,14.7,4.2,18.0,2,1,0,0
3,242154,Female,46,Caucasian,79,Elective,Alive,8792,0.0,0.0,...,1.0,0.0,21.3,10.5,2.8,8.0,2,0,0,0
4,242290,Female,75,Other/Unknown,79,Elective,Alive,1656,0.0,0.0,...,0.0,0.0,12.4,11.1,2.6,9.0,2,1,0,1


In [66]:
df = pd.read_csv("final_merged_dataset_categorized.csv")

In [67]:
gender_dummies = pd.get_dummies(df["gender"], prefix="gender")
df_encoded = pd.concat([df, gender_dummies], axis=1)

In [68]:
df_encoded.head()

In [69]:
admission_mapping = {"Emergency": 1, "Elective": 0}

df_encoded["admission_type_encoded"] = df_encoded["admission_type"].map(admission_mapping)

In [70]:
df_encoded.head()

In [71]:
df_encoded = pd.get_dummies(df_encoded, columns=["ethnicity"], prefix="ethnicity")

In [72]:
df_encoded.head()

In [73]:
df["status_of_survive"] = df["unitdischargestatus"].map({"Alive": 0, "Expired": 1})

In [74]:
df

Unnamed: 0,patientunitstayid,gender,age,ethnicity,hospitalid,admission_type,unitdischargestatus,unitdischargeoffset,has_COPD,has_Diabetes,...,has_Vasopressor,WBC x 1000,Hgb,albumin,anion gap,platelets_category,GCS_category,total_bilirubin_category,creatinine_category,status_of_survive
0,242070,Female,68,Caucasian,79,Elective,Alive,1481,0.0,0.0,...,0.0,8.00,14.5,3.2,12.0,2,1,0,0,0
1,242082,Male,75,Caucasian,108,Emergency,Alive,2752,0.0,0.0,...,0.0,7.18,13.1,4.1,11.0,2,1,0,0,0
2,242083,Female,90,Caucasian,92,Emergency,Alive,1546,0.0,0.0,...,0.0,10.00,14.7,4.2,18.0,2,1,0,0,0
3,242154,Female,46,Caucasian,79,Elective,Alive,8792,0.0,0.0,...,0.0,21.30,10.5,2.8,8.0,2,0,0,0,0
4,242290,Female,75,Other/Unknown,79,Elective,Alive,1656,0.0,0.0,...,0.0,12.40,11.1,2.6,9.0,2,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79456,3353216,Female,50,African American,458,Elective,Alive,3090,0.0,0.0,...,0.0,7.50,8.3,2.8,4.0,2,0,0,0,0
79457,3353226,Female,79,African American,458,Emergency,Expired,11513,0.0,0.0,...,0.0,12.50,10.0,3.0,6.0,2,0,0,2,1
79458,3353237,Female,79,Caucasian,458,Elective,Alive,1269,0.0,0.0,...,0.0,8.60,12.7,3.5,12.0,2,1,0,0,0
79459,3353251,Male,73,African American,458,Emergency,Alive,16259,0.0,0.0,...,0.0,21.50,10.1,3.0,10.0,2,0,0,2,0


In [75]:
df_encoded = df.drop(columns=["unitdischargestatus"])

In [76]:
df_encoded

Unnamed: 0,patientunitstayid,gender,age,ethnicity,hospitalid,admission_type,unitdischargeoffset,has_COPD,has_Diabetes,has_Metastasis,...,has_Vasopressor,WBC x 1000,Hgb,albumin,anion gap,platelets_category,GCS_category,total_bilirubin_category,creatinine_category,status_of_survive
0,242070,Female,68,Caucasian,79,Elective,1481,0.0,0.0,0.0,...,0.0,8.00,14.5,3.2,12.0,2,1,0,0,0
1,242082,Male,75,Caucasian,108,Emergency,2752,0.0,0.0,0.0,...,0.0,7.18,13.1,4.1,11.0,2,1,0,0,0
2,242083,Female,90,Caucasian,92,Emergency,1546,0.0,0.0,0.0,...,0.0,10.00,14.7,4.2,18.0,2,1,0,0,0
3,242154,Female,46,Caucasian,79,Elective,8792,0.0,0.0,0.0,...,0.0,21.30,10.5,2.8,8.0,2,0,0,0,0
4,242290,Female,75,Other/Unknown,79,Elective,1656,0.0,0.0,0.0,...,0.0,12.40,11.1,2.6,9.0,2,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79456,3353216,Female,50,African American,458,Elective,3090,0.0,0.0,0.0,...,0.0,7.50,8.3,2.8,4.0,2,0,0,0,0
79457,3353226,Female,79,African American,458,Emergency,11513,0.0,0.0,0.0,...,0.0,12.50,10.0,3.0,6.0,2,0,0,2,1
79458,3353237,Female,79,Caucasian,458,Elective,1269,0.0,0.0,0.0,...,0.0,8.60,12.7,3.5,12.0,2,1,0,0,0
79459,3353251,Male,73,African American,458,Emergency,16259,0.0,0.0,0.0,...,0.0,21.50,10.1,3.0,10.0,2,0,0,2,0


In [77]:
df_encoded = df_encoded[df_encoded['age']>18]

In [78]:
df_encoded

Unnamed: 0,patientunitstayid,gender,age,ethnicity,hospitalid,admission_type,unitdischargeoffset,has_COPD,has_Diabetes,has_Metastasis,...,has_Vasopressor,WBC x 1000,Hgb,albumin,anion gap,platelets_category,GCS_category,total_bilirubin_category,creatinine_category,status_of_survive
0,242070,Female,68,Caucasian,79,Elective,1481,0.0,0.0,0.0,...,0.0,8.00,14.5,3.2,12.0,2,1,0,0,0
1,242082,Male,75,Caucasian,108,Emergency,2752,0.0,0.0,0.0,...,0.0,7.18,13.1,4.1,11.0,2,1,0,0,0
2,242083,Female,90,Caucasian,92,Emergency,1546,0.0,0.0,0.0,...,0.0,10.00,14.7,4.2,18.0,2,1,0,0,0
3,242154,Female,46,Caucasian,79,Elective,8792,0.0,0.0,0.0,...,0.0,21.30,10.5,2.8,8.0,2,0,0,0,0
4,242290,Female,75,Other/Unknown,79,Elective,1656,0.0,0.0,0.0,...,0.0,12.40,11.1,2.6,9.0,2,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79456,3353216,Female,50,African American,458,Elective,3090,0.0,0.0,0.0,...,0.0,7.50,8.3,2.8,4.0,2,0,0,0,0
79457,3353226,Female,79,African American,458,Emergency,11513,0.0,0.0,0.0,...,0.0,12.50,10.0,3.0,6.0,2,0,0,2,1
79458,3353237,Female,79,Caucasian,458,Elective,1269,0.0,0.0,0.0,...,0.0,8.60,12.7,3.5,12.0,2,1,0,0,0
79459,3353251,Male,73,African American,458,Emergency,16259,0.0,0.0,0.0,...,0.0,21.50,10.1,3.0,10.0,2,0,0,2,0


In [79]:
df_encoded = df_encoded[df_encoded['gender'] != 'Unknown']
df_encoded

Unnamed: 0,patientunitstayid,gender,age,ethnicity,hospitalid,admission_type,unitdischargeoffset,has_COPD,has_Diabetes,has_Metastasis,...,has_Vasopressor,WBC x 1000,Hgb,albumin,anion gap,platelets_category,GCS_category,total_bilirubin_category,creatinine_category,status_of_survive
0,242070,Female,68,Caucasian,79,Elective,1481,0.0,0.0,0.0,...,0.0,8.00,14.5,3.2,12.0,2,1,0,0,0
1,242082,Male,75,Caucasian,108,Emergency,2752,0.0,0.0,0.0,...,0.0,7.18,13.1,4.1,11.0,2,1,0,0,0
2,242083,Female,90,Caucasian,92,Emergency,1546,0.0,0.0,0.0,...,0.0,10.00,14.7,4.2,18.0,2,1,0,0,0
3,242154,Female,46,Caucasian,79,Elective,8792,0.0,0.0,0.0,...,0.0,21.30,10.5,2.8,8.0,2,0,0,0,0
4,242290,Female,75,Other/Unknown,79,Elective,1656,0.0,0.0,0.0,...,0.0,12.40,11.1,2.6,9.0,2,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79456,3353216,Female,50,African American,458,Elective,3090,0.0,0.0,0.0,...,0.0,7.50,8.3,2.8,4.0,2,0,0,0,0
79457,3353226,Female,79,African American,458,Emergency,11513,0.0,0.0,0.0,...,0.0,12.50,10.0,3.0,6.0,2,0,0,2,1
79458,3353237,Female,79,Caucasian,458,Elective,1269,0.0,0.0,0.0,...,0.0,8.60,12.7,3.5,12.0,2,1,0,0,0
79459,3353251,Male,73,African American,458,Emergency,16259,0.0,0.0,0.0,...,0.0,21.50,10.1,3.0,10.0,2,0,0,2,0


In [80]:
df_encoded.to_csv("final_merged_dataset_encoded.csv", index=False)