# Random Forest Model - Days to Death
-------
#### This random forest model will utilize the lab tests from patients who died while in the ICU. Using these lab tests, it will attempt to predict the number of days before death a patient has using their first blood tests upon admission to the ICU. It will also be used to determine feature importance.

In [1]:
# Code Dependencies
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# ML Dependencies
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Data Pre-Processing
------------
#### Below is the process for combining the CSVs containing our desired information, and data cleaning so that it may be used in the random forest model

In [2]:
# Read the csv file into a pandas DataFrame
labs = pd.read_csv('../Resources/labsNew.csv')
labs.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,Albumin,Anion Gap,Bicarbonate,"Bilirubin, Total",Chloride,"Chloride, Whole Blood",Creatinine,Glucose_Blood_Gas,...,PT,PTT,Phosphate,Platelet Count,Potassium,"Potassium, Whole Blood",Sodium,"Sodium, Whole Blood",Urea Nitrogen,White Blood Cells
0,2,163353.0,0.0,0.0,0.0,9.3,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.1
1,3,145834.0,1.8,17.0,25.0,0.8,99.0,114.0,3.2,265.0,...,14.8,125.7,4.8,179.0,5.4,3.7,136.0,139.0,36.0,15.1
2,4,185777.0,2.8,17.0,24.0,2.2,97.0,0.0,0.5,0.0,...,12.3,31.3,3.2,207.0,3.1,0.0,135.0,0.0,9.0,9.7
3,5,178980.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,309.0,0.0,0.0,0.0,0.0,0.0,13.9
4,6,107064.0,2.7,17.0,16.0,0.2,107.0,95.0,3.5,106.0,...,12.5,55.2,4.1,198.0,4.9,4.2,135.0,135.0,86.0,22.7


In [3]:
# Import the admissions_died.csv
died = pd.read_csv('../Resources/admissions_died.csv')

# Display the dataframe
died.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ADMITTIME,DEATHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,LANGUAGE,RELIGION,MARITAL_STATUS,ETHNICITY,EDREGTIME,EDOUTTIME,DIAGNOSIS,HOSPITAL_EXPIRE_FLAG,HAS_CHARTEVENTS_DATA,DAYS_TO_DEATH
0,30,31,128652,2108-08-22 23:27:00,2108-08-30 15:00:00,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,DEAD/EXPIRED,Medicare,,CATHOLIC,MARRIED,WHITE,,,STATUS EPILEPTICUS,1,1,7.647917
1,55,56,181711,2104-01-02 02:01:00,2104-01-08 10:30:00,EMERGENCY,EMERGENCY ROOM ADMIT,DEAD/EXPIRED,Medicare,,NOT SPECIFIED,,WHITE,2104-01-01 23:59:00,2104-01-02 03:33:00,HEAD BLEED,1,1,6.353472
2,61,61,189535,2119-01-04 18:12:00,2119-02-03 01:35:00,EMERGENCY,CLINIC REFERRAL/PREMATURE,DEAD/EXPIRED,Private,,CATHOLIC,MARRIED,WHITE,,,NON-HODGKINS LYMPHOMA;FEBRILE;NEUTROPENIA,1,1,29.307639
3,68,67,155252,2157-12-02 00:45:00,2157-12-02 03:55:00,EMERGENCY,EMERGENCY ROOM ADMIT,DEAD/EXPIRED,Medicare,,JEWISH,SINGLE,WHITE,2157-12-01 20:45:00,2157-12-02 00:55:00,SUBARACHNOID HEMORRHAGE,1,1,0.131944
4,86,84,166401,2196-04-14 04:02:00,2196-04-17 13:42:00,EMERGENCY,EMERGENCY ROOM ADMIT,DEAD/EXPIRED,Private,,OTHER,MARRIED,WHITE,2196-04-13 22:23:00,2196-04-14 04:31:00,"GLIOBLASTOMA,NAUSEA",1,1,3.402778


In [4]:
# Merge the dataframes together
died_df = pd.merge(died, labs, on='HADM_ID')

# Display the joined dataframes
died_df.head()

Unnamed: 0,ROW_ID,SUBJECT_ID_x,HADM_ID,ADMITTIME,DEATHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,LANGUAGE,...,PT,PTT,Phosphate,Platelet Count,Potassium,"Potassium, Whole Blood",Sodium,"Sodium, Whole Blood",Urea Nitrogen,White Blood Cells
0,30,31,128652,2108-08-22 23:27:00,2108-08-30 15:00:00,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,DEAD/EXPIRED,Medicare,,...,13.2,29.4,2.7,109.0,3.3,4.2,128.0,0.0,13.0,6.9
1,55,56,181711,2104-01-02 02:01:00,2104-01-08 10:30:00,EMERGENCY,EMERGENCY ROOM ADMIT,DEAD/EXPIRED,Medicare,,...,13.0,27.0,3.9,210.0,4.0,3.4,128.0,138.0,21.0,10.0
2,61,61,189535,2119-01-04 18:12:00,2119-02-03 01:35:00,EMERGENCY,CLINIC REFERRAL/PREMATURE,DEAD/EXPIRED,Private,,...,11.1,28.7,2.7,21.0,3.3,0.0,139.0,0.0,17.0,0.1
3,68,67,155252,2157-12-02 00:45:00,2157-12-02 03:55:00,EMERGENCY,EMERGENCY ROOM ADMIT,DEAD/EXPIRED,Medicare,,...,13.2,35.5,0.0,183.0,5.1,0.0,137.0,0.0,24.0,9.3
4,86,84,166401,2196-04-14 04:02:00,2196-04-17 13:42:00,EMERGENCY,EMERGENCY ROOM ADMIT,DEAD/EXPIRED,Private,,...,0.0,0.0,3.4,231.0,2.5,0.0,127.0,0.0,6.0,11.6


In [6]:
# Specify the exact columns for use in the random forest model. The twenty-six columns beyond DAYS_TO_DEATH are
# the particular lab values of interest in this model. To eliminate a variable from the model, simply comment out
# that variable.
died_df = died_df[['DAYS_TO_DEATH',
                     'Chloride, Whole Blood',
                     'Glucose_Chemistry',
                     'Hematocrit, Calculated',
                     'Hemoglobin_Blood_Gas',
                     'Lactate',
                     'Potassium, Whole Blood',
                     'Sodium, Whole Blood',
                     'Anion Gap',
                     'Albumin',
                     'Bicarbonate',
                     'Bilirubin, Total',
                     'Creatinine',
                     'Chloride',
                     'Glucose_Blood_Gas',
                     'Magnesium',
                     'Phosphate',
                     'Potassium',
                     'Sodium',
                     'Urea Nitrogen',
                     'Hematocrit',
                     'Hemoglobin_Hematology',
                     'Platelet Count',
                     'PTT',
                     'INR(PT)',
                     'PT',
                     'White Blood Cells'
                    ]]

In [9]:
# Round the y data
died_df['DAYS_OF_DEATH'] = died_df['DAYS_TO_DEATH'].round()

In [11]:
# Set the X and y variables

X = died_df.drop(['DAYS_TO_DEATH'], axis=1)
y = died_df['DAYS_TO_DEATH'].values.reshape(-1, 1)

# Check to ensure data shape is appropriate
print(f"data shape: {X.shape}")
print(f"target shape: {y.shape}")

data shape: (5818, 26)
target shape: (5818, 1)


# Train Test Split
---------
#### Below splits the above data into train and test groups for training the neural network

In [12]:
# Create test and train sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

print(f"----train & test sizes-----")
print(f"train size: {len(X_train)}, keys: {len(X_train.keys())}")
print(f"test size: {len(X_test)}, keys: {len(X_test.keys())}")

----train & test sizes-----
train size: 4363, keys: 26
test size: 1455, keys: 26


# Model Construction
-------
#### Below is the creation and fitting of the random forest model, which will use 128 estimators.

In [13]:
# Set the model to a variable and specify the number of estimators
model = RandomForestRegressor(n_estimators=128)

In [14]:
# Fit the model
model.fit(X_train, y_train.ravel())

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=128, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

# Model Evaluation
-------
#### Below is the evaluation of the fitted model, to determine how successful it is in predicting days to death, and to determine the most important features in the model

In [15]:
# Display an example of the model predictions
predictions = model.predict(X_test)
predictions

array([13.76462131,  5.52790663,  8.75952691, ...,  1.03778754,
        2.39822591, 11.35098199])

In [16]:
# Calculate and print the MSE and r2 values to evaluate the model
MSE = mean_squared_error(y_test, predictions)
r2 = model.score(X_test, y_test)

print(f"MSE: {MSE}, R2: {r2}")

MSE: 153.60040282999847, R2: 0.21627980448126505


In [17]:
# Create a dataframe of the importance features sorted
feature_names = X_train.columns
importances = sorted(zip(model.feature_importances_, feature_names), reverse=True)
importance_df = pd.DataFrame(importances)
importance_df = importance_df.rename(columns={0:"Importance",1:"Feature"})
importance_df

Unnamed: 0,Importance,Feature
0,0.118129,Albumin
1,0.06665,Platelet Count
2,0.058934,Hematocrit
3,0.05627,White Blood Cells
4,0.051712,Lactate
5,0.049381,Anion Gap
6,0.047475,Glucose_Chemistry
7,0.046119,"Bilirubin, Total"
8,0.043333,Urea Nitrogen
9,0.041665,"Potassium, Whole Blood"
