In [1]:
# Code Dependencies
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# ML Dependencies
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [2]:
# Read the csv file into a pandas DataFrame
labs = pd.read_csv('../Resources/labs.csv')
labs.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,Albumin,Anion Gap,Bicarbonate,"Bilirubin, Total",Chloride,"Chloride, Whole Blood",Creatinine,Glucose_Blood_Gas,...,PT,PTT,Phosphate,Platelet Count,Potassium,"Potassium, Whole Blood",Sodium,"Sodium, Whole Blood",Urea Nitrogen,White Blood Cells
0,2,163353.0,no test,no test,no test,normal,no test,no test,no test,no test,...,no test,no test,no test,abnormal,no test,no test,no test,no test,no test,abnormal
1,3,145834.0,abnormal,normal,normal,normal,abnormal,abnormal,abnormal,abnormal,...,abnormal,normal,abnormal,normal,abnormal,normal,normal,normal,abnormal,abnormal
2,4,185777.0,abnormal,normal,normal,abnormal,normal,no test,normal,no test,...,normal,normal,normal,normal,abnormal,no test,normal,no test,normal,delta
3,5,178980.0,no test,no test,no test,no test,no test,no test,no test,no test,...,no test,no test,no test,normal,no test,no test,no test,no test,no test,normal
4,6,107064.0,abnormal,abnormal,abnormal,normal,normal,abnormal,abnormal,abnormal,...,abnormal,abnormal,abnormal,normal,abnormal,normal,normal,normal,abnormal,normal


In [3]:
died = pd.read_csv('../Resources/admissions_died.csv')
died.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ADMITTIME,DEATHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,LANGUAGE,RELIGION,MARITAL_STATUS,ETHNICITY,EDREGTIME,EDOUTTIME,DIAGNOSIS,HOSPITAL_EXPIRE_FLAG,HAS_CHARTEVENTS_DATA,DAYS_TO_DEATH
0,30,31,128652,2108-08-22 23:27:00,2108-08-30 15:00:00,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,DEAD/EXPIRED,Medicare,,CATHOLIC,MARRIED,WHITE,,,STATUS EPILEPTICUS,1,1,7.647917
1,55,56,181711,2104-01-02 02:01:00,2104-01-08 10:30:00,EMERGENCY,EMERGENCY ROOM ADMIT,DEAD/EXPIRED,Medicare,,NOT SPECIFIED,,WHITE,2104-01-01 23:59:00,2104-01-02 03:33:00,HEAD BLEED,1,1,6.353472
2,61,61,189535,2119-01-04 18:12:00,2119-02-03 01:35:00,EMERGENCY,CLINIC REFERRAL/PREMATURE,DEAD/EXPIRED,Private,,CATHOLIC,MARRIED,WHITE,,,NON-HODGKINS LYMPHOMA;FEBRILE;NEUTROPENIA,1,1,29.307639
3,68,67,155252,2157-12-02 00:45:00,2157-12-02 03:55:00,EMERGENCY,EMERGENCY ROOM ADMIT,DEAD/EXPIRED,Medicare,,JEWISH,SINGLE,WHITE,2157-12-01 20:45:00,2157-12-02 00:55:00,SUBARACHNOID HEMORRHAGE,1,1,0.131944
4,86,84,166401,2196-04-14 04:02:00,2196-04-17 13:42:00,EMERGENCY,EMERGENCY ROOM ADMIT,DEAD/EXPIRED,Private,,OTHER,MARRIED,WHITE,2196-04-13 22:23:00,2196-04-14 04:31:00,"GLIOBLASTOMA,NAUSEA",1,1,3.402778


In [4]:
died_df = pd.merge(died,labs, on='HADM_ID')
died_df.head()

Unnamed: 0,ROW_ID,SUBJECT_ID_x,HADM_ID,ADMITTIME,DEATHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,LANGUAGE,...,PT,PTT,Phosphate,Platelet Count,Potassium,"Potassium, Whole Blood",Sodium,"Sodium, Whole Blood",Urea Nitrogen,White Blood Cells
0,30,31,128652,2108-08-22 23:27:00,2108-08-30 15:00:00,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,DEAD/EXPIRED,Medicare,,...,normal,normal,normal,abnormal,normal,normal,abnormal,no test,normal,normal
1,55,56,181711,2104-01-02 02:01:00,2104-01-08 10:30:00,EMERGENCY,EMERGENCY ROOM ADMIT,DEAD/EXPIRED,Medicare,,...,normal,normal,normal,normal,normal,abnormal,abnormal,normal,abnormal,normal
2,61,61,189535,2119-01-04 18:12:00,2119-02-03 01:35:00,EMERGENCY,CLINIC REFERRAL/PREMATURE,DEAD/EXPIRED,Private,,...,normal,normal,normal,abnormal,normal,no test,normal,no test,abnormal,abnormal
3,68,67,155252,2157-12-02 00:45:00,2157-12-02 03:55:00,EMERGENCY,EMERGENCY ROOM ADMIT,DEAD/EXPIRED,Medicare,,...,normal,abnormal,no test,normal,normal,no test,normal,no test,abnormal,normal
4,86,84,166401,2196-04-14 04:02:00,2196-04-17 13:42:00,EMERGENCY,EMERGENCY ROOM ADMIT,DEAD/EXPIRED,Private,,...,no test,no test,normal,normal,normal,no test,normal,no test,normal,normal


In [5]:
died_df.columns

Index(['ROW_ID', 'SUBJECT_ID_x', 'HADM_ID', 'ADMITTIME', 'DEATHTIME',
       'ADMISSION_TYPE', 'ADMISSION_LOCATION', 'DISCHARGE_LOCATION',
       'INSURANCE', 'LANGUAGE', 'RELIGION', 'MARITAL_STATUS', 'ETHNICITY',
       'EDREGTIME', 'EDOUTTIME', 'DIAGNOSIS', 'HOSPITAL_EXPIRE_FLAG',
       'HAS_CHARTEVENTS_DATA', 'DAYS_TO_DEATH', 'SUBJECT_ID_y', 'Albumin',
       'Anion Gap', 'Bicarbonate', 'Bilirubin, Total', 'Chloride',
       'Chloride, Whole Blood', 'Creatinine', 'Glucose_Blood_Gas',
       'Glucose_Chemistry', 'Hematocrit', 'Hematocrit, Calculated',
       'Hemoglobin_Blood_Gas', 'Hemoglobin_Hematology', 'INR(PT)', 'Lactate',
       'Magnesium', 'PT', 'PTT', 'Phosphate', 'Platelet Count', 'Potassium',
       'Potassium, Whole Blood', 'Sodium', 'Sodium, Whole Blood',
       'Urea Nitrogen', 'White Blood Cells'],
      dtype='object')

In [6]:
died_df = died_df.drop(['ROW_ID', 'SUBJECT_ID_x', 'SUBJECT_ID_y', 'HADM_ID', 'ADMITTIME', 'DEATHTIME', 'DISCHARGE_LOCATION',
                         'RELIGION', 'ETHNICITY', 'EDREGTIME', 'EDOUTTIME', 'HOSPITAL_EXPIRE_FLAG',
       'HAS_CHARTEVENTS_DATA', 'LANGUAGE'], axis = 1)

In [7]:
died_df = pd.get_dummies(died_df)
died_df.head()

Unnamed: 0,DAYS_TO_DEATH,ADMISSION_TYPE_ELECTIVE,ADMISSION_TYPE_EMERGENCY,ADMISSION_TYPE_NEWBORN,ADMISSION_TYPE_URGENT,ADMISSION_LOCATION_** INFO NOT AVAILABLE **,ADMISSION_LOCATION_CLINIC REFERRAL/PREMATURE,ADMISSION_LOCATION_EMERGENCY ROOM ADMIT,ADMISSION_LOCATION_HMO REFERRAL/SICK,ADMISSION_LOCATION_PHYS REFERRAL/NORMAL DELI,...,"Sodium, Whole Blood_abnormal","Sodium, Whole Blood_no test","Sodium, Whole Blood_normal",Urea Nitrogen_abnormal,Urea Nitrogen_no test,Urea Nitrogen_normal,White Blood Cells_abnormal,White Blood Cells_delta,White Blood Cells_no test,White Blood Cells_normal
0,7.647917,0,1,0,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,1
1,6.353472,0,1,0,0,0,0,1,0,0,...,0,0,1,1,0,0,0,0,0,1
2,29.307639,0,1,0,0,0,1,0,0,0,...,0,1,0,1,0,0,1,0,0,0
3,0.131944,0,1,0,0,0,0,1,0,0,...,0,1,0,1,0,0,0,0,0,1
4,3.402778,0,1,0,0,0,0,1,0,0,...,0,1,0,0,0,1,0,0,0,1


In [8]:
# assignment and reshaping of data
data = died_df.drop(["DAYS_TO_DEATH"], axis=1)
target = died_df["DAYS_TO_DEATH"].values.reshape(-1, 1)

print(f"data shape: {data.shape}")
print(f"target shape: {target.shape}")

data shape: (5818, 2440)
target shape: (5818, 1)


In [9]:
# Create test and train sets
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42)

print(f"----train & test sizes-----")
print(f"train size: {len(X_train)}, keys: {len(X_train.keys())}")
print(f"test size: {len(X_test)}, keys: {len(X_test.keys())}")

----train & test sizes-----
train size: 4363, keys: 2440
test size: 1455, keys: 2440


In [11]:
# Modeling
model = RandomForestRegressor(n_estimators=200)

In [12]:
model.fit(X_train, y_train.ravel()) # Ravel used to format array for fit() function

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [13]:
# create the predictions
predictions = model.predict(X_test)
predictions

array([ 9.87752431, 12.37063542,  3.83907986, ...,  2.91978848,
        3.16580903,  7.31902778])

In [15]:
MSE = mean_squared_error(y_test, predictions)
r2 = model.score(X_test, y_test)

print(f"MSE: {MSE}, R2: {r2}")

MSE: 181.21967169134743, R2: 0.07535713505273511


In [16]:
# create a dataframe of the importance features sorted
feature_names = X_train.columns
importances = sorted(zip(model.feature_importances_, feature_names), reverse=True)
importance_df = pd.DataFrame(importances)
importance_df = importance_df.rename(columns={0:"Importance",1:"Feature"})
importance_df

Unnamed: 0,Importance,Feature
0,0.074978,Albumin_no test
1,0.033155,DIAGNOSIS_ASPIRATION; FAILURE TO THRIVE
2,0.029843,Hemoglobin_Blood_Gas_abnormal
3,0.029362,DIAGNOSIS_MULTIPLE MYELOMA\BONE MARROW TRANSPLANT
4,0.022710,DIAGNOSIS_AMC;FEVER
5,0.019331,DIAGNOSIS_APLASTIC ANEMIA;PANCYTOPENIA
6,0.017339,Lactate_normal
7,0.016430,DIAGNOSIS_ANEMIA
8,0.015106,DIAGNOSIS_MULTIPLE MYELOMA;FEVER;NEUTROPENIA
9,0.013342,DIAGNOSIS_HEPATACELLULAR CARCINOMA/SDA
