# Patient

Use `unitdischargestatus` (the eICU `unitDischargeStatus` field, lower-cased in the CSV header) as the target variable.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import random
# Seed NumPy's and the standard library's global random number generators
# so that any step drawing random numbers is repeatable across runs.
np.random.seed(1)
random.seed(1)

### Load Data

In [None]:
# Location of the gzip-compressed `patient` table on the local machine.
file_path = r"E:\EICU\eicu-collaborative-research-database-2.0\patient.csv.gz"
# low_memory=False parses the file in one pass instead of in chunks, which
# avoids mixed-type inference within a column (per the read_csv docs).
patient_df = pd.read_csv(
    file_path,
    low_memory=False,
    compression="gzip",
)

In [None]:
# Preview the first rows of the raw patient table.
patient_df.head()

In [None]:
# Column dtypes as inferred by read_csv.
patient_df.dtypes

In [None]:
# (rows, columns) of the raw table.
patient_df.shape

In [None]:
# Summary statistics from describe(), transposed so each described column is one row.
patient_df.describe().transpose()

In [None]:
# Tabulate, per column, how many values are missing and what share of all
# rows that represents.
missing_values = patient_df.isna().sum()
missing_percentage = missing_values / len(patient_df) * 100
missing_df = pd.DataFrame(
    {
        "Missing Count": missing_values,
        "Missing Percentage (%)": missing_percentage,
    }
)
# Keep only the columns that actually have gaps, worst first.
missing_df = (
    missing_df.loc[lambda frame: frame["Missing Count"] > 0]
    .sort_values("Missing Percentage (%)", ascending=False)
)
missing_df

### Drop missing values

In [None]:
# Keep only rows that have no missing value in any column (dropna defaults).
# The explicit .copy() makes the result an independent frame, so the column
# assignments performed on it further down do not raise
# SettingWithCopyWarning.
patient_df_cleaned = patient_df.dropna().copy()
# Sanity check: every per-column null count should now be zero.
patient_df_cleaned.isnull().sum()

In [None]:
# (rows, columns) left after dropping incomplete rows.
patient_df_cleaned.shape

In [None]:
# Frequency of each value of the target column in the cleaned data.
patient_df_cleaned["unitdischargestatus"].value_counts()

### Convert age to categorical

In [None]:
# The code treats the string "> 89" as the marker for ages above 89; swap it
# for the number 90 so the column can be parsed as numeric afterwards.
patient_df_cleaned.loc[:, "age"] = patient_df_cleaned["age"].replace({"> 89": 90})

In [None]:
# Parse `age` as numbers; anything unparseable becomes NaN (errors="coerce").
# Assign with `df[col] = ...` rather than `df.loc[:, col] = ...`: since
# pandas 2.0 the `.loc[:, col]` form writes into the existing object-dtype
# column in place, so `age` would keep dtype `object` and the numeric steps
# that follow (describe, cut) would not see a numeric column.
patient_df_cleaned["age"] = pd.to_numeric(patient_df_cleaned["age"], errors="coerce")

In [None]:
# Right-closed age bands: (0, 44], (44, 54], (54, 64], (64, 74], (74, inf].
# The first band is open at 0, so an age of exactly 0 gets no band (NaN).
age_bins = [0, 44, 54, 64, 74, np.inf]
age_labels = ["≤44", "45-54", "55-64", "65-74", "≥75"]
patient_df_cleaned.loc[:, "age_group"] = pd.cut(
    patient_df_cleaned["age"],
    age_bins,
    labels=age_labels,
    right=True,
)

In [None]:
# Number of rows in each age band (value_counts skips missing values by default).
patient_df_cleaned["age_group"].value_counts()

In [None]:
# Per-column null counts; shows whether age parsing or binning introduced missing values.
patient_df_cleaned.isnull().sum()

In [None]:
# Range check on the ages: print the smallest and the largest value.
age_column = patient_df_cleaned["age"]
print(age_column.min(), age_column.max())

In [None]:
# Collect the rows whose age_group is missing so they can be inspected.
age_group_missing = patient_df_cleaned["age_group"].isna()
missing_age_group_rows = patient_df_cleaned.loc[age_group_missing]

In [None]:
# Display the rows without an age group for inspection.
missing_age_group_rows

In [None]:
# Summary of the age column.
patient_df_cleaned['age'].describe()

In [None]:
# Rows with age exactly 0 fall outside every band defined for pd.cut; they
# are assigned to the "55-64" group here.
# NOTE(review): the reason for choosing "55-64" is not stated in this
# notebook — presumably an imputation with a typical group; confirm.
zero_age_rows = patient_df_cleaned["age"].eq(0)
patient_df_cleaned.loc[zero_age_rows, "age_group"] = "55-64"

In [None]:
# Re-check per-column null counts after filling the age group for age-0 rows.
patient_df_cleaned.isnull().sum()

In [None]:
# Column names, non-null counts and dtypes of the cleaned frame.
patient_df_cleaned.info()

In [None]:
# Frequency of each `unitvisitnumber` value (this column is dropped further down).
patient_df_cleaned['unitvisitnumber'].value_counts()

### Convert 'unitadmittime24' to categorical

In [None]:
# Hour of unit admission: the first two characters of the `unitadmittime24`
# string, converted to an integer.
admit_time_text = patient_df_cleaned["unitadmittime24"]
patient_df_cleaned["unit_admit_hour"] = admit_time_text.str[:2].astype(int)

def classify_time(hour):
    """Return the 8-hour period label for an hour of the day.

    Hours in [16, 24) map to "16:00-24:00" and hours in [8, 16) map to
    "08:00-16:00". Every other value, including hours 0-7 and anything
    outside 0-23, maps to "00:00-08:00".
    """
    if 16 <= hour < 24:
        return "16:00-24:00"
    if 8 <= hour < 16:
        return "08:00-16:00"
    # Fallback covers the night hours and any out-of-range value.
    return "00:00-08:00"

# Label every admission hour with its 8-hour period of the day.
admit_hours = patient_df_cleaned["unit_admit_hour"]
patient_df_cleaned["unit_admit_period"] = admit_hours.map(classify_time)


In [None]:
# Number of admissions in each time-of-day period.
patient_df_cleaned["unit_admit_period"].value_counts()

In [None]:
# Per-column null counts after adding the admission-time columns.
patient_df_cleaned.isnull().sum()

In [None]:
# Display the cleaned frame.
patient_df_cleaned

In [None]:
# Frequency of each value in `apacheadmissiondx`.
patient_df_cleaned['apacheadmissiondx'].value_counts()

### Drop unnecessary columns

In [None]:
# Remove the helper hour column and the visit counter from the cleaned table.
patient_df_cleaned = patient_df_cleaned.drop(
    ["unit_admit_hour", "unitvisitnumber"],
    axis="columns",
)

In [None]:
# Preview the frame after the column drop.
patient_df_cleaned.head()

In [None]:
# admission

def classify_admission(source):
    """Map a unit admit source string to a coarse admission type.

    Matching is by substring and the emergency keywords are checked first,
    so a source such as "Other ICU" is classed as "Emergency" even though
    it also contains the elective keyword "Other".

    Returns "Emergency", "Elective", or "Unknown" for strings that match
    no keyword. Non-string input (e.g. None or NaN) yields None.
    """
    if not isinstance(source, str):
        return None

    emergency_keywords = ("Emergency", "ICU")
    elective_keywords = (
        "Acute Care",
        "Chest Pain Center",
        "Direct Admit",
        "Floor",
        "Observation",
        "Operating Room",
        "PACU",
        "Recovery",
        "Step-Down",
        "Other Hospital",
        "Other",
    )

    # Order matters: the first group with a matching keyword wins.
    for label, keywords in (
        ("Emergency", emergency_keywords),
        ("Elective", elective_keywords),
    ):
        if any(keyword in source for keyword in keywords):
            return label
    return "Unknown"
    
# Derive the admission type from the cleaned frame's own `unitadmitsource`
# column, like the other derived columns, instead of going back to the raw
# `patient_df` and relying on implicit index alignment to discard the rows
# that were dropped.
patient_df_cleaned["admission_type"] = patient_df_cleaned["unitadmitsource"].apply(classify_admission)


In [None]:
# Number of rows per derived admission type.
patient_df_cleaned["admission_type"].value_counts()

In [None]:
# Persist the cleaned table to the working directory. The DataFrame index is
# written too, as the first (unnamed) column of the CSV.
patient_df_cleaned.to_csv("patient_cleaned.csv", index=True)

In [None]:
# Display the final cleaned frame.
patient_df_cleaned

In [None]:
# Number of distinct `patientunitstayid` values in the cleaned data.
patient_df_cleaned["patientunitstayid"].nunique()

In [None]:
# Frequency of each ethnicity value.
patient_df_cleaned["ethnicity"].value_counts()