# Feature Engineering

In [1]:
# Standard Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the cleaned dataset from disk into a DataFrame
data = pd.read_csv(filepath_or_buffer='./dataset/clean_data.csv')

In [3]:
# Show the first five rows as a quick sanity check of the load
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,encounter_id,patient_id,hospital_id,hospital_death,age,bmi,elective_surgery,ethnicity,gender,...,aids,cirrhosis,diabetes_mellitus,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,apache_3j_bodysystem,apache_2_bodysystem
0,0,66154,25312,118,0,68.0,22.73,0,Caucasian,M,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Sepsis,Cardiovascular
1,1,114252,59342,81,0,77.0,27.42,0,Caucasian,F,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Respiratory,Respiratory
2,2,119783,50777,118,0,25.0,31.95,0,Caucasian,F,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Metabolic,Metabolic
3,3,79267,46918,118,0,81.0,22.64,1,Caucasian,F,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Cardiovascular,Cardiovascular
4,5,33181,74489,83,0,67.0,27.56,0,Caucasian,M,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Neurological,Neurologic


In [4]:
# Drop the leftover 'Unnamed: 0' column.
# `columns=` already targets the column axis, so the redundant `axis=1` is removed;
# errors='ignore' keeps this cell safe to re-run after the column has been dropped.
data = data.drop(columns=['Unnamed: 0'], errors='ignore')

In [5]:
# Preview the first five rows again to confirm the 'Unnamed: 0' column is gone
data.head(n=5)

Unnamed: 0,encounter_id,patient_id,hospital_id,hospital_death,age,bmi,elective_surgery,ethnicity,gender,height,...,aids,cirrhosis,diabetes_mellitus,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,apache_3j_bodysystem,apache_2_bodysystem
0,66154,25312,118,0,68.0,22.73,0,Caucasian,M,180.3,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Sepsis,Cardiovascular
1,114252,59342,81,0,77.0,27.42,0,Caucasian,F,160.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Respiratory,Respiratory
2,119783,50777,118,0,25.0,31.95,0,Caucasian,F,172.7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Metabolic,Metabolic
3,79267,46918,118,0,81.0,22.64,1,Caucasian,F,165.1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Cardiovascular,Cardiovascular
4,33181,74489,83,0,67.0,27.56,0,Caucasian,M,190.5,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Neurological,Neurologic


In [6]:
# Display the dataset dimensions as a (rows, columns) tuple
data.shape

(82965, 186)

In [7]:
# One-hot encode the ethnicity column; drop_first=True omits the first category,
# which then acts as the implicit baseline.
# Encode straight from `data` instead of going through a temporary `df`, so that
# re-running this cell can never overwrite the modelling frame later held in `df`.
ethnicity = pd.get_dummies(data['ethnicity'], drop_first=True)

In [8]:
# One-hot encode the gender column; drop_first=True omits the first category,
# which then acts as the implicit baseline.
# Encode straight from `data` instead of going through a temporary `df`, so that
# re-running this cell can never overwrite the modelling frame later held in `df`.
gender = pd.get_dummies(data['gender'], drop_first=True)

In [9]:
# One-hot encode the ICU stay type; drop_first=True omits the first category,
# which then acts as the implicit baseline.
# Encode straight from `data` instead of going through a temporary `df`, so that
# re-running this cell can never overwrite the modelling frame later held in `df`.
icu_stay_type = pd.get_dummies(data['icu_stay_type'], drop_first=True)

In [10]:
# One-hot encode the ICU type; drop_first=True omits the first category,
# which then acts as the implicit baseline.
# Encode straight from `data` instead of going through a temporary `df`, so that
# re-running this cell can never overwrite the modelling frame later held in `df`.
icu_type = pd.get_dummies(data['icu_type'], drop_first=True)

In [11]:
# Pick a small subset of columns for the first modelling iteration.
# NOTE(review): 'hospital_death' is presumably the target and is kept alongside
# the features here - confirm against the modelling notebook.
df = data.loc[:, [
    'age',
    'bmi',
    'elective_surgery',
    'height',
    'pre_icu_los_days',
    'readmission_status',
    'weight',
    'hospital_death',
]]
df.head(n=5)

Unnamed: 0,age,bmi,elective_surgery,height,pre_icu_los_days,readmission_status,weight,hospital_death
0,68.0,22.73,0,180.3,0.541667,0,73.9,0
1,77.0,27.42,0,160.0,0.927778,0,70.2,0
2,25.0,31.95,0,172.7,0.000694,0,95.3,0
3,81.0,22.64,1,165.1,0.000694,0,61.7,0
4,67.0,27.56,0,190.5,0.000694,0,100.0,0


In [12]:
# Append the one-hot encoded frames to the selected columns, side by side.
# Because `df` is both input and output, re-run the column-selection cell first
# before re-running this one; otherwise the dummy columns are appended again.
df = pd.concat(
    objs=[df, ethnicity, gender, icu_stay_type, icu_type],
    axis='columns',
)
df.head(n=5)

Unnamed: 0,age,bmi,elective_surgery,height,pre_icu_los_days,readmission_status,weight,hospital_death,Asian,Caucasian,...,M,readmit,transfer,CSICU,CTICU,Cardiac ICU,MICU,Med-Surg ICU,Neuro ICU,SICU
0,68.0,22.73,0,180.3,0.541667,0,73.9,0,0,1,...,1,0,0,0,1,0,0,0,0,0
1,77.0,27.42,0,160.0,0.927778,0,70.2,0,0,1,...,0,0,0,0,0,0,0,1,0,0
2,25.0,31.95,0,172.7,0.000694,0,95.3,0,0,1,...,0,0,0,0,0,0,0,1,0,0
3,81.0,22.64,1,165.1,0.000694,0,61.7,0,0,1,...,0,0,0,0,1,0,0,0,0,0
4,67.0,27.56,0,190.5,0.000694,0,100.0,0,0,1,...,1,0,0,0,0,0,0,1,0,0


In [13]:
# Descriptive statistics for every column, transposed so each column is one row
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,82965.0,62.355897,16.751005,16.0,52.0,65.0,75.0,89.0
bmi,82965.0,29.358953,8.315228,14.844926,23.768204,27.844601,33.145212,67.81499
elective_surgery,82965.0,0.188706,0.391277,0.0,0.0,0.0,0.0,1.0
height,82965.0,169.842772,10.74236,137.2,162.6,170.1,177.8,195.59
pre_icu_los_days,82965.0,0.854556,2.528387,-0.244444,0.038889,0.14375,0.415278,159.090972
readmission_status,82965.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
weight,82965.0,84.682666,25.065473,38.6,67.6,81.11,97.8,186.0
hospital_death,82965.0,0.082625,0.275317,0.0,0.0,0.0,0.0,1.0
Asian,82965.0,0.012764,0.112257,0.0,0.0,0.0,0.0,1.0
Caucasian,82965.0,0.779317,0.41471,0.0,1.0,1.0,1.0,1.0


In [14]:
# Print column dtypes and non-null counts for the assembled frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82965 entries, 0 to 82964
Data columns (total 23 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   age                 82965 non-null  float64
 1   bmi                 82965 non-null  float64
 2   elective_surgery    82965 non-null  int64  
 3   height              82965 non-null  float64
 4   pre_icu_los_days    82965 non-null  float64
 5   readmission_status  82965 non-null  int64  
 6   weight              82965 non-null  float64
 7   hospital_death      82965 non-null  int64  
 8   Asian               82965 non-null  uint8  
 9   Caucasian           82965 non-null  uint8  
 10  Hispanic            82965 non-null  uint8  
 11  Native American     82965 non-null  uint8  
 12  Other/Unknown       82965 non-null  uint8  
 13  M                   82965 non-null  uint8  
 14  readmit             82965 non-null  uint8  
 15  transfer            82965 non-null  uint8  
 16  CSIC

In [15]:
# Persist the assembled frame as a pickle file for later use
df.to_pickle(path='./dataset/data.pkl')