# Patient

Choose `unitDischargeStatus` as the target variable.

In [63]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import random
random.seed(1) 
np.random.seed(1)

### Load Data

In [64]:
file_path = r"E:\EICU\eicu-collaborative-research-database-2.0\patient.csv.gz"
patient_df = pd.read_csv(file_path, compression="gzip", low_memory=False,usecols=["patientunitstayid","unittype", "unitadmitsource", "unitadmittime24", 
                    "unitvisitnumber", "unitstaytype", "age", 
                    "gender", "ethnicity", "apacheadmissiondx","unitdischargestatus"])

In [65]:
patient_df.head()

Unnamed: 0,patientunitstayid,gender,age,ethnicity,apacheadmissiondx,unittype,unitadmittime24,unitadmitsource,unitvisitnumber,unitstaytype,unitdischargestatus
0,141168,Female,70,Caucasian,"Rhythm disturbance (atrial, supraventricular)",Med-Surg ICU,15:54:00,Direct Admit,1,admit,Expired
1,141178,Female,52,Caucasian,,Med-Surg ICU,09:10:00,Emergency Department,1,admit,Alive
2,141179,Female,52,Caucasian,,Med-Surg ICU,09:18:00,ICU to SDU,2,stepdown/other,Alive
3,141194,Male,68,Caucasian,"Sepsis, renal/UTI (including bladder)",CTICU,07:18:00,Floor,1,admit,Alive
4,141196,Male,71,Caucasian,,Med-Surg ICU,22:00:00,ICU to SDU,2,stepdown/other,Alive


In [66]:
patient_df.dtypes

patientunitstayid       int64
gender                 object
age                    object
ethnicity              object
apacheadmissiondx      object
unittype               object
unitadmittime24        object
unitadmitsource        object
unitvisitnumber         int64
unitstaytype           object
unitdischargestatus    object
dtype: object

In [67]:
patient_df.shape

(200859, 11)

In [68]:
patient_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
patientunitstayid,200859.0,1674016.0,999589.819762,141168.0,831173.0,1579307.0,2695827.0,3353263.0
unitvisitnumber,200859.0,1.279614,0.655562,1.0,1.0,1.0,1.0,18.0


In [69]:
missing_values = patient_df.isnull().sum()
missing_percentage = (missing_values / len(patient_df)) * 100
missing_df = pd.DataFrame({"Missing Count": missing_values, "Missing Percentage (%)": missing_percentage})
missing_df = missing_df[missing_df["Missing Count"] > 0].sort_values(by="Missing Percentage (%)", ascending=False)
missing_df

Unnamed: 0,Missing Count,Missing Percentage (%)
apacheadmissiondx,22996,11.448827
ethnicity,2290,1.140103
unitadmitsource,1090,0.542669
gender,134,0.066713
age,95,0.047297
unitdischargestatus,34,0.016927


### drop missing values

In [70]:
patient_df_cleaned = patient_df.dropna()
patient_df_cleaned.isnull().sum()

patientunitstayid      0
gender                 0
age                    0
ethnicity              0
apacheadmissiondx      0
unittype               0
unitadmittime24        0
unitadmitsource        0
unitvisitnumber        0
unitstaytype           0
unitdischargestatus    0
dtype: int64

In [71]:
patient_df_cleaned.shape

(175400, 11)

In [72]:
patient_df_cleaned["unitdischargestatus"].value_counts()

unitdischargestatus
Alive      165349
Expired     10051
Name: count, dtype: int64

### convert age to categorical

In [73]:
patient_df_cleaned.loc[:, "age"] = patient_df_cleaned["age"].replace("> 89", 90)

In [74]:
patient_df_cleaned.loc[:,"age"] = pd.to_numeric(patient_df_cleaned["age"], errors="coerce") 

In [75]:
age_bins = [0, 44, 54, 64, 74, np.inf]  # 5 bins
age_labels = ["≤44", "45-54", "55-64", "65-74", "≥75"] # corresponding labels
patient_df_cleaned.loc[:,"age_group"] = pd.cut(patient_df_cleaned["age"], bins=age_bins, labels=age_labels, right=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  patient_df_cleaned.loc[:,"age_group"] = pd.cut(patient_df_cleaned["age"], bins=age_bins, labels=age_labels, right=True)


In [76]:
patient_df_cleaned["age_group"].value_counts()

age_group
≥75      49846
65-74    40028
55-64    36095
≤44      25666
45-54    23749
Name: count, dtype: int64

In [77]:
patient_df_cleaned.isnull().sum()

patientunitstayid       0
gender                  0
age                     0
ethnicity               0
apacheadmissiondx       0
unittype                0
unitadmittime24         0
unitadmitsource         0
unitvisitnumber         0
unitstaytype            0
unitdischargestatus     0
age_group              16
dtype: int64

In [78]:
print(patient_df_cleaned["age"].min(), patient_df_cleaned["age"].max()) 

0 90


In [79]:
missing_age_group_rows = patient_df_cleaned[patient_df_cleaned["age_group"].isnull()]

In [80]:
missing_age_group_rows

Unnamed: 0,patientunitstayid,gender,age,ethnicity,apacheadmissiondx,unittype,unitadmittime24,unitadmitsource,unitvisitnumber,unitstaytype,unitdischargestatus,age_group
18998,330125,Male,0,Caucasian,"Overdose, alcohols (bethanol, methanol, ethyle...",Med-Surg ICU,11:00:00,Emergency Department,1,admit,Alive,
23196,403944,Female,0,Caucasian,"Renal failure, acute",Med-Surg ICU,00:14:00,Emergency Department,1,admit,Alive,
23958,417283,Female,0,Caucasian,"Rhythm disturbance (atrial, supraventricular)",Med-Surg ICU,11:59:00,Emergency Department,1,admit,Alive,
135002,2313728,Male,0,Caucasian,"Pneumonia, bacterial",Med-Surg ICU,02:43:00,ICU to SDU,2,stepdown/other,Alive,
139107,2408918,Male,0,Caucasian,Emphysema/bronchitis,Med-Surg ICU,06:53:00,Emergency Department,1,admit,Alive,
139144,2409571,Female,0,Caucasian,"Pneumonia, bacterial",Med-Surg ICU,20:24:00,Direct Admit,1,admit,Alive,
139258,2412237,Male,0,Caucasian,"Overdose, street drugs (opiates, cocaine, amph...",Med-Surg ICU,10:27:00,Emergency Department,1,admit,Alive,
139305,2413304,Male,0,Hispanic,"Pneumonia, bacterial",Med-Surg ICU,17:22:00,ICU,2,stepdown/other,Alive,
139579,2418976,Female,0,Caucasian,"Pneumonia, viral",Med-Surg ICU,07:41:00,Emergency Department,1,stepdown/other,Alive,
140001,2429243,Male,0,Caucasian,Acid-base/electrolyte disturbance,Med-Surg ICU,12:42:00,Emergency Department,1,stepdown/other,Alive,


In [81]:
patient_df_cleaned['age'].describe()

count     175400
unique        91
top           90
freq        6236
Name: age, dtype: int64

In [82]:
patient_df_cleaned.loc[patient_df_cleaned["age"] == 0, "age_group"] = "55-64"

In [83]:
patient_df_cleaned.isnull().sum()

patientunitstayid      0
gender                 0
age                    0
ethnicity              0
apacheadmissiondx      0
unittype               0
unitadmittime24        0
unitadmitsource        0
unitvisitnumber        0
unitstaytype           0
unitdischargestatus    0
age_group              0
dtype: int64

In [84]:
patient_df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 175400 entries, 0 to 200858
Data columns (total 12 columns):
 #   Column               Non-Null Count   Dtype   
---  ------               --------------   -----   
 0   patientunitstayid    175400 non-null  int64   
 1   gender               175400 non-null  object  
 2   age                  175400 non-null  object  
 3   ethnicity            175400 non-null  object  
 4   apacheadmissiondx    175400 non-null  object  
 5   unittype             175400 non-null  object  
 6   unitadmittime24      175400 non-null  object  
 7   unitadmitsource      175400 non-null  object  
 8   unitvisitnumber      175400 non-null  int64   
 9   unitstaytype         175400 non-null  object  
 10  unitdischargestatus  175400 non-null  object  
 11  age_group            175400 non-null  category
dtypes: category(1), int64(2), object(9)
memory usage: 16.2+ MB


In [85]:
patient_df_cleaned['unitvisitnumber'].value_counts()

unitvisitnumber
1     151864
2      17794
3       4070
4       1037
5        372
6        136
7         63
8         27
9         14
10        10
12         4
11         3
13         2
14         1
16         1
18         1
15         1
Name: count, dtype: int64

### Convert 'unitadmittime24' to categorical

In [87]:
patient_df_cleaned["unit_admit_hour"] = patient_df_cleaned["unitadmittime24"].str.slice(0, 2).astype(int)

def classify_time(hour):
    if 8 <= hour < 16:
        return "08:00-16:00"
    elif 16 <= hour < 24:
        return "16:00-24:00"
    else:
        return "00:00-08:00"

patient_df_cleaned["unit_admit_period"] = patient_df_cleaned["unit_admit_hour"].apply(classify_time)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  patient_df_cleaned["unit_admit_hour"] = patient_df_cleaned["unitadmittime24"].str.slice(0, 2).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  patient_df_cleaned["unit_admit_period"] = patient_df_cleaned["unit_admit_hour"].apply(classify_time)


In [88]:
patient_df_cleaned["unit_admit_period"].value_counts()

unit_admit_period
16:00-24:00    74913
00:00-08:00    66221
08:00-16:00    34266
Name: count, dtype: int64

In [89]:
patient_df_cleaned.isnull().sum()

patientunitstayid      0
gender                 0
age                    0
ethnicity              0
apacheadmissiondx      0
unittype               0
unitadmittime24        0
unitadmitsource        0
unitvisitnumber        0
unitstaytype           0
unitdischargestatus    0
age_group              0
unit_admit_hour        0
unit_admit_period      0
dtype: int64

In [90]:
patient_df_cleaned

Unnamed: 0,patientunitstayid,gender,age,ethnicity,apacheadmissiondx,unittype,unitadmittime24,unitadmitsource,unitvisitnumber,unitstaytype,unitdischargestatus,age_group,unit_admit_hour,unit_admit_period
0,141168,Female,70,Caucasian,"Rhythm disturbance (atrial, supraventricular)",Med-Surg ICU,15:54:00,Direct Admit,1,admit,Expired,65-74,15,08:00-16:00
3,141194,Male,68,Caucasian,"Sepsis, renal/UTI (including bladder)",CTICU,07:18:00,Floor,1,admit,Alive,65-74,7,00:00-08:00
5,141197,Male,71,Caucasian,"Sepsis, pulmonary",Med-Surg ICU,20:46:00,Emergency Department,1,admit,Alive,65-74,20,16:00-24:00
6,141203,Female,77,Caucasian,"Arrest, respiratory (without cardiac arrest)",Med-Surg ICU,20:39:00,Floor,1,admit,Alive,≥75,20,16:00-24:00
7,141208,Female,25,Caucasian,"Overdose, sedatives, hypnotics, antipsychotics...",Med-Surg ICU,11:24:00,Emergency Department,1,admit,Alive,≤44,11,08:00-16:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200854,3353235,Male,50,Caucasian,"CHF, congestive heart failure",Cardiac ICU,05:29:00,Emergency Department,1,admit,Alive,45-54,5,00:00-08:00
200855,3353237,Female,79,Caucasian,"Embolus, pulmonary",MICU,01:59:00,Direct Admit,1,admit,Alive,≥75,1,00:00-08:00
200856,3353251,Male,73,African American,Cardiac arrest (with or without respiratory ar...,Cardiac ICU,16:17:00,Emergency Department,1,admit,Alive,65-74,16,16:00-24:00
200857,3353254,Male,81,Caucasian,"Bleeding, lower GI",Med-Surg ICU,12:14:00,Emergency Department,1,admit,Alive,≥75,12,08:00-16:00


In [92]:
patient_df_cleaned['apacheadmissiondx'].value_counts()

apacheadmissiondx
Sepsis, pulmonary                                 8763
Infarction, acute myocardial (MI)                 7073
CVA, cerebrovascular accident/stroke              6599
CHF, congestive heart failure                     6542
Sepsis, renal/UTI (including bladder)             5220
                                                  ... 
Papillary muscle rupture                             2
Vena cava clipping                                   2
Thyroid neoplasm                                     1
Pelvic relaxation (cystocele, rectocele, etc.)       1
Myositis, viral                                      1
Name: count, Length: 392, dtype: int64

### Drop unnecessary columns

In [93]:
patient_df_cleaned = patient_df_cleaned.drop(columns=["age", "unitadmittime24","unit_admit_hour","unitvisitnumber"])

In [94]:
patient_df_cleaned.head()

Unnamed: 0,patientunitstayid,gender,ethnicity,apacheadmissiondx,unittype,unitadmitsource,unitstaytype,unitdischargestatus,age_group,unit_admit_period
0,141168,Female,Caucasian,"Rhythm disturbance (atrial, supraventricular)",Med-Surg ICU,Direct Admit,admit,Expired,65-74,08:00-16:00
3,141194,Male,Caucasian,"Sepsis, renal/UTI (including bladder)",CTICU,Floor,admit,Alive,65-74,00:00-08:00
5,141197,Male,Caucasian,"Sepsis, pulmonary",Med-Surg ICU,Emergency Department,admit,Alive,65-74,16:00-24:00
6,141203,Female,Caucasian,"Arrest, respiratory (without cardiac arrest)",Med-Surg ICU,Floor,admit,Alive,≥75,16:00-24:00
7,141208,Female,Caucasian,"Overdose, sedatives, hypnotics, antipsychotics...",Med-Surg ICU,Emergency Department,admit,Alive,≤44,08:00-16:00


In [96]:
patient_df_cleaned.to_csv("patient_cleaned.csv")