In [1]:
# Code Dependencies
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# ML Dependencies
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [2]:
# Read the csv file into a pandas DataFrame
labs = pd.read_csv('../Resources/labs.csv')
labs.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,Albumin,Anion Gap,Bicarbonate,"Bilirubin, Total",Chloride,"Chloride, Whole Blood",Creatinine,Glucose_Blood_Gas,...,PT,PTT,Phosphate,Platelet Count,Potassium,"Potassium, Whole Blood",Sodium,"Sodium, Whole Blood",Urea Nitrogen,White Blood Cells
0,2,163353.0,no test,no test,no test,normal,no test,no test,no test,no test,...,no test,no test,no test,abnormal,no test,no test,no test,no test,no test,abnormal
1,3,145834.0,abnormal,normal,normal,normal,abnormal,abnormal,abnormal,abnormal,...,abnormal,normal,abnormal,normal,abnormal,normal,normal,normal,abnormal,abnormal
2,4,185777.0,abnormal,normal,normal,abnormal,normal,no test,normal,no test,...,normal,normal,normal,normal,abnormal,no test,normal,no test,normal,delta
3,5,178980.0,no test,no test,no test,no test,no test,no test,no test,no test,...,no test,no test,no test,normal,no test,no test,no test,no test,no test,normal
4,6,107064.0,abnormal,abnormal,abnormal,normal,normal,abnormal,abnormal,abnormal,...,abnormal,abnormal,abnormal,normal,abnormal,normal,normal,normal,abnormal,normal


In [3]:
lived = pd.read_csv('../Resources/admissions_survived.csv')
lived.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,LANGUAGE,RELIGION,MARITAL_STATUS,ETHNICITY,EDREGTIME,EDOUTTIME,DIAGNOSIS,HOSPITAL_EXPIRE_FLAG,HAS_CHARTEVENTS_DATA,LENGTH_OF_STAY
0,21,22,165315,2196-04-09 12:26:00,2196-04-10 15:54:00,EMERGENCY,EMERGENCY ROOM ADMIT,DISC-TRAN CANCER/CHLDRN H,Private,,UNOBTAINABLE,MARRIED,WHITE,2196-04-09 10:06:00,2196-04-09 13:24:00,BENZODIAZEPINE OVERDOSE,0,1,1.144444
1,22,23,152223,2153-09-03 07:15:00,2153-09-08 19:10:00,ELECTIVE,PHYS REFERRAL/NORMAL DELI,HOME HEALTH CARE,Medicare,,CATHOLIC,MARRIED,WHITE,,,CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS...,0,1,5.496528
2,23,23,124321,2157-10-18 19:34:00,2157-10-25 14:00:00,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME HEALTH CARE,Medicare,ENGL,CATHOLIC,MARRIED,WHITE,,,BRAIN MASS,0,1,6.768056
3,24,24,161859,2139-06-06 16:14:00,2139-06-09 12:48:00,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME,Private,,PROTESTANT QUAKER,SINGLE,WHITE,,,INTERIOR MYOCARDIAL INFARCTION,0,1,2.856944
4,25,25,129635,2160-11-02 02:06:00,2160-11-05 14:55:00,EMERGENCY,EMERGENCY ROOM ADMIT,HOME,Private,,UNOBTAINABLE,MARRIED,WHITE,2160-11-02 01:01:00,2160-11-02 04:27:00,ACUTE CORONARY SYNDROME,0,1,3.534028


In [4]:
lived_df = pd.merge(lived,labs, on='HADM_ID')
lived_df.head()

Unnamed: 0,ROW_ID,SUBJECT_ID_x,HADM_ID,ADMITTIME,DISCHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,LANGUAGE,...,PT,PTT,Phosphate,Platelet Count,Potassium,"Potassium, Whole Blood",Sodium,"Sodium, Whole Blood",Urea Nitrogen,White Blood Cells
0,21,22,165315,2196-04-09 12:26:00,2196-04-10 15:54:00,EMERGENCY,EMERGENCY ROOM ADMIT,DISC-TRAN CANCER/CHLDRN H,Private,,...,normal,normal,normal,normal,normal,no test,normal,no test,normal,normal
1,22,23,152223,2153-09-03 07:15:00,2153-09-08 19:10:00,ELECTIVE,PHYS REFERRAL/NORMAL DELI,HOME HEALTH CARE,Medicare,,...,abnormal,abnormal,normal,abnormal,normal,normal,normal,normal,normal,delta
2,23,23,124321,2157-10-18 19:34:00,2157-10-25 14:00:00,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME HEALTH CARE,Medicare,ENGL,...,normal,normal,normal,normal,normal,normal,normal,abnormal,normal,normal
3,24,24,161859,2139-06-06 16:14:00,2139-06-09 12:48:00,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME,Private,,...,normal,normal,normal,normal,normal,no test,normal,no test,normal,normal
4,25,25,129635,2160-11-02 02:06:00,2160-11-05 14:55:00,EMERGENCY,EMERGENCY ROOM ADMIT,HOME,Private,,...,normal,normal,normal,normal,normal,no test,normal,no test,abnormal,abnormal


In [5]:
lived_df.columns

Index(['ROW_ID', 'SUBJECT_ID_x', 'HADM_ID', 'ADMITTIME', 'DISCHTIME',
       'ADMISSION_TYPE', 'ADMISSION_LOCATION', 'DISCHARGE_LOCATION',
       'INSURANCE', 'LANGUAGE', 'RELIGION', 'MARITAL_STATUS', 'ETHNICITY',
       'EDREGTIME', 'EDOUTTIME', 'DIAGNOSIS', 'HOSPITAL_EXPIRE_FLAG',
       'HAS_CHARTEVENTS_DATA', 'LENGTH_OF_STAY', 'SUBJECT_ID_y', 'Albumin',
       'Anion Gap', 'Bicarbonate', 'Bilirubin, Total', 'Chloride',
       'Chloride, Whole Blood', 'Creatinine', 'Glucose_Blood_Gas',
       'Glucose_Chemistry', 'Hematocrit', 'Hematocrit, Calculated',
       'Hemoglobin_Blood_Gas', 'Hemoglobin_Hematology', 'INR(PT)', 'Lactate',
       'Magnesium', 'PT', 'PTT', 'Phosphate', 'Platelet Count', 'Potassium',
       'Potassium, Whole Blood', 'Sodium', 'Sodium, Whole Blood',
       'Urea Nitrogen', 'White Blood Cells'],
      dtype='object')

In [6]:
lived_df = lived_df.drop(['ROW_ID', 'SUBJECT_ID_x', 'HADM_ID', 'ADMITTIME', 'DISCHTIME', 'DISCHARGE_LOCATION',
                         'RELIGION', 'ETHNICITY', 'EDREGTIME', 'EDOUTTIME', 'HOSPITAL_EXPIRE_FLAG',
       'HAS_CHARTEVENTS_DATA', 'SUBJECT_ID_y', 'LANGUAGE'], axis = 1)

In [7]:
lived_df = pd.get_dummies(lived_df)
lived_df.head()

Unnamed: 0,LENGTH_OF_STAY,ADMISSION_TYPE_ELECTIVE,ADMISSION_TYPE_EMERGENCY,ADMISSION_TYPE_NEWBORN,ADMISSION_TYPE_URGENT,ADMISSION_LOCATION_** INFO NOT AVAILABLE **,ADMISSION_LOCATION_CLINIC REFERRAL/PREMATURE,ADMISSION_LOCATION_EMERGENCY ROOM ADMIT,ADMISSION_LOCATION_HMO REFERRAL/SICK,ADMISSION_LOCATION_PHYS REFERRAL/NORMAL DELI,...,"Sodium, Whole Blood_abnormal","Sodium, Whole Blood_no test","Sodium, Whole Blood_normal",Urea Nitrogen_abnormal,Urea Nitrogen_no test,Urea Nitrogen_normal,White Blood Cells_abnormal,White Blood Cells_delta,White Blood Cells_no test,White Blood Cells_normal
0,1.144444,0,1,0,0,0,0,1,0,0,...,0,1,0,0,0,1,0,0,0,1
1,5.496528,1,0,0,0,0,0,0,0,1,...,0,0,1,0,0,1,0,1,0,0
2,6.768056,0,1,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,1
3,2.856944,0,1,0,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,1
4,3.534028,0,1,0,0,0,0,1,0,0,...,0,1,0,1,0,0,1,0,0,0


In [8]:
# assignment and reshaping of data
data = lived_df.drop(["LENGTH_OF_STAY"], axis=1)
target = lived_df["LENGTH_OF_STAY"].values.reshape(-1, 1)

print(f"data shape: {data.shape}")
print(f"target shape: {target.shape}")

data shape: (52294, 14127)
target shape: (52294, 1)


In [9]:
# Create test and train sets
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42)

print(f"----train & test sizes-----")
print(f"train size: {len(X_train)}, keys: {len(X_train.keys())}")
print(f"test size: {len(X_test)}, keys: {len(X_test.keys())}")

----train & test sizes-----
train size: 39220, keys: 14127
test size: 13074, keys: 14127


In [10]:
# Modeling
model = RandomForestRegressor(n_estimators=200)

In [11]:
model.fit(X_train, y_train.ravel())  

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [12]:
# create the predictions
predictions = model.predict(X_test)
predictions

array([ 7.39842014, 28.43516319, 12.18727431, ..., 10.47728819,
       10.55703125,  6.75813889])

In [14]:
MSE = mean_squared_error(y_test, predictions)
r2 = model.score(X_test, y_test)

print(f"MSE: {MSE}, R2: {r2}")

MSE: 87.78553001155943, R2: 0.4188675617314901


In [16]:
# create a dataframe of the importance features sorted
feature_names = X_train.columns
importances = sorted(zip(model.feature_importances_, feature_names), reverse=True)
importance_df = pd.DataFrame(importances)
importance_df = importance_df.rename(columns={0:"Importance",1:"Feature"})
importance_df.to_csv('rf_lived_feature_importance')
importance_df

Unnamed: 0,Importance,Feature
0,0.111865,Phosphate_no test
1,0.056674,Albumin_abnormal
2,0.055111,DIAGNOSIS_NEWBORN
3,0.050507,ADMISSION_TYPE_NEWBORN
4,0.038768,Glucose_Blood_Gas_no test
5,0.029413,"Bilirubin, Total_no test"
6,0.017508,Anion Gap_no test
7,0.012493,DIAGNOSIS_CROHN'S DISEASE;ABDOMINAL FISTULA
8,0.011507,Urea Nitrogen_abnormal
9,0.008292,Hemoglobin_Blood_Gas_abnormal
