In [1]:
# Code Dependencies
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# ML Dependencies
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [4]:
# Read the csv file into a pandas DataFrame
labs = pd.read_csv('../Resources/labsNew.csv')
labs.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,Albumin,Anion Gap,Bicarbonate,"Bilirubin, Total",Chloride,"Chloride, Whole Blood",Creatinine,Glucose_Blood_Gas,...,PT,PTT,Phosphate,Platelet Count,Potassium,"Potassium, Whole Blood",Sodium,"Sodium, Whole Blood",Urea Nitrogen,White Blood Cells
0,2,163353.0,0.0,0.0,0.0,9.3,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.1
1,3,145834.0,1.8,17.0,25.0,0.8,99.0,114.0,3.2,265.0,...,14.8,125.7,4.8,179.0,5.4,3.7,136.0,139.0,36.0,15.1
2,4,185777.0,2.8,17.0,24.0,2.2,97.0,0.0,0.5,0.0,...,12.3,31.3,3.2,207.0,3.1,0.0,135.0,0.0,9.0,9.7
3,5,178980.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,309.0,0.0,0.0,0.0,0.0,0.0,13.9
4,6,107064.0,2.7,17.0,16.0,0.2,107.0,95.0,3.5,106.0,...,12.5,55.2,4.1,198.0,4.9,4.2,135.0,135.0,86.0,22.7


In [3]:
patientsData = pd.read_csv('../../Resources/PATIENTS.csv')

patientsData = patientsData[['SUBJECT_ID','EXPIRE_FLAG']]

patientsData.head()

Unnamed: 0,SUBJECT_ID,EXPIRE_FLAG
0,249,0
1,250,1
2,251,0
3,252,0
4,253,0


In [4]:
joinedData = labs.merge(patientsData, on='SUBJECT_ID')

joinedData.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,Albumin,Anion Gap,Bicarbonate,"Bilirubin, Total",Chloride,"Chloride, Whole Blood",Creatinine,Glucose_Blood_Gas,...,PTT,Phosphate,Platelet Count,Potassium,"Potassium, Whole Blood",Sodium,"Sodium, Whole Blood",Urea Nitrogen,White Blood Cells,EXPIRE_FLAG
0,2,163353.0,0.0,0.0,0.0,9.3,0.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.1,0
1,3,145834.0,1.8,17.0,25.0,0.8,99.0,114.0,3.2,265.0,...,125.7,4.8,179.0,5.4,3.7,136.0,139.0,36.0,15.1,1
2,4,185777.0,2.8,17.0,24.0,2.2,97.0,0.0,0.5,0.0,...,31.3,3.2,207.0,3.1,0.0,135.0,0.0,9.0,9.7,0
3,5,178980.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,309.0,0.0,0.0,0.0,0.0,0.0,13.9,0
4,6,107064.0,2.7,17.0,16.0,0.2,107.0,95.0,3.5,106.0,...,55.2,4.1,198.0,4.9,4.2,135.0,135.0,86.0,22.7,0


In [5]:
X = joinedData.drop(['SUBJECT_ID','HADM_ID','EXPIRE_FLAG'], axis=1)
y = joinedData["EXPIRE_FLAG"]

print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

X shape: (58112, 26)
y shape: (58112,)


In [6]:
# Create test and train sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

print(f"----train & test sizes-----")
print(f"train size: {len(X_train)}, keys: {len(X_train.keys())}")
print(f"test size: {len(X_test)}, keys: {len(X_test.keys())}")

----train & test sizes-----
train size: 43584, keys: 26
test size: 14528, keys: 26


In [7]:
model = RandomForestRegressor(n_estimators=128)

In [16]:
model.fit(X_train, y_train.ravel())

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=128, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [17]:
predictions = model.predict(X_test)
predictions

array([0.625   , 0.53125 , 0.21875 , ..., 0.734375, 0.59375 , 0.296875])

In [18]:
MSE = mean_squared_error(y_test, predictions)
r2 = model.score(X_test, y_test)

print(f"MSE: {MSE}, R2: {r2}")

MSE: 0.1712331057645151, R2: 0.2717296224258705


In [19]:
# create a dataframe of the importance features sorted
feature_names = X_train.columns
importances = sorted(zip(model.feature_importances_, feature_names), reverse=True)
importance_df = pd.DataFrame(importances)
importance_df = importance_df.rename(columns={0:"Importance",1:"Feature"})
importance_df

Unnamed: 0,Importance,Feature
0,0.153997,Urea Nitrogen
1,0.071388,PT
2,0.056627,Glucose_Chemistry
3,0.051961,Platelet Count
4,0.047195,White Blood Cells
5,0.044877,PTT
6,0.044154,Hemoglobin_Hematology
7,0.03949,Hematocrit
8,0.037411,Phosphate
9,0.036206,Albumin


# Length of Stay Random Forest
## ---------------------------------------------

In [5]:
lived = pd.read_csv('../Resources/admissions_survived.csv')
lived.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,LANGUAGE,RELIGION,MARITAL_STATUS,ETHNICITY,EDREGTIME,EDOUTTIME,DIAGNOSIS,HOSPITAL_EXPIRE_FLAG,HAS_CHARTEVENTS_DATA,LENGTH_OF_STAY
0,21,22,165315,2196-04-09 12:26:00,2196-04-10 15:54:00,EMERGENCY,EMERGENCY ROOM ADMIT,DISC-TRAN CANCER/CHLDRN H,Private,,UNOBTAINABLE,MARRIED,WHITE,2196-04-09 10:06:00,2196-04-09 13:24:00,BENZODIAZEPINE OVERDOSE,0,1,1.144444
1,22,23,152223,2153-09-03 07:15:00,2153-09-08 19:10:00,ELECTIVE,PHYS REFERRAL/NORMAL DELI,HOME HEALTH CARE,Medicare,,CATHOLIC,MARRIED,WHITE,,,CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS...,0,1,5.496528
2,23,23,124321,2157-10-18 19:34:00,2157-10-25 14:00:00,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME HEALTH CARE,Medicare,ENGL,CATHOLIC,MARRIED,WHITE,,,BRAIN MASS,0,1,6.768056
3,24,24,161859,2139-06-06 16:14:00,2139-06-09 12:48:00,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME,Private,,PROTESTANT QUAKER,SINGLE,WHITE,,,INTERIOR MYOCARDIAL INFARCTION,0,1,2.856944
4,25,25,129635,2160-11-02 02:06:00,2160-11-05 14:55:00,EMERGENCY,EMERGENCY ROOM ADMIT,HOME,Private,,UNOBTAINABLE,MARRIED,WHITE,2160-11-02 01:01:00,2160-11-02 04:27:00,ACUTE CORONARY SYNDROME,0,1,3.534028


In [6]:
# join labs.csv with the admissions_survived.csv

lived_df = pd.merge(lived, labs, on='HADM_ID')
lived_df.head()

Unnamed: 0,ROW_ID,SUBJECT_ID_x,HADM_ID,ADMITTIME,DISCHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,LANGUAGE,...,PT,PTT,Phosphate,Platelet Count,Potassium,"Potassium, Whole Blood",Sodium,"Sodium, Whole Blood",Urea Nitrogen,White Blood Cells
0,21,22,165315,2196-04-09 12:26:00,2196-04-10 15:54:00,EMERGENCY,EMERGENCY ROOM ADMIT,DISC-TRAN CANCER/CHLDRN H,Private,,...,12.4,30.1,3.7,259.0,4.4,0.0,140.0,0.0,17.0,5.1
1,22,23,152223,2153-09-03 07:15:00,2153-09-08 19:10:00,ELECTIVE,PHYS REFERRAL/NORMAL DELI,HOME HEALTH CARE,Medicare,,...,17.5,42.0,3.2,95.0,3.9,3.6,143.0,140.0,14.0,9.4
2,23,23,124321,2157-10-18 19:34:00,2157-10-25 14:00:00,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME HEALTH CARE,Medicare,ENGL,...,11.9,26.6,3.0,216.0,4.2,3.5,140.0,133.0,16.0,10.9
3,24,24,161859,2139-06-06 16:14:00,2139-06-09 12:48:00,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME,Private,,...,12.4,22.5,3.8,215.0,4.1,0.0,139.0,0.0,13.0,9.8
4,25,25,129635,2160-11-02 02:06:00,2160-11-05 14:55:00,EMERGENCY,EMERGENCY ROOM ADMIT,HOME,Private,,...,12.8,34.6,4.5,269.0,3.3,0.0,134.0,0.0,50.0,12.2


In [7]:
# Get rid of attributes we don't need and/or are messing up the model

lived_df = lived_df.drop(['ADMISSION_TYPE','ADMISSION_LOCATION','INSURANCE','ROW_ID', 'SUBJECT_ID_x', 'HADM_ID', 'DIAGNOSIS', 'ADMITTIME', 'MARITAL_STATUS', 'DISCHTIME', 'DISCHARGE_LOCATION',
                         'RELIGION', 'ETHNICITY', 'EDREGTIME', 'EDOUTTIME', 'HOSPITAL_EXPIRE_FLAG',
       'HAS_CHARTEVENTS_DATA', 'SUBJECT_ID_y', 'LANGUAGE'], axis = 1)

In [8]:
lived_df = lived_df[['LENGTH_OF_STAY',
                     'Chloride, Whole Blood',
                     'Glucose_Chemistry',
                     'Hematocrit, Calculated',
                     'Hemoglobin_Blood_Gas',
                     'Lactate',
                     'Potassium, Whole Blood',
                     'Sodium, Whole Blood',
                     'Anion Gap',
                     'Albumin',
                     'Bicarbonate',
                     'Bilirubin, Total',
                     'Creatinine',
                     'Chloride',
                     'Glucose_Blood_Gas',
                     'Magnesium',
                     'Phosphate',
                     'Potassium',
                     'Sodium',
                     'Urea Nitrogen',
                     'Hematocrit',
                     'Hemoglobin_Hematology',
                     'Platelet Count',
                     'PTT',
                     'INR(PT)',
                     'PT',
                     'White Blood Cells'
                    ]]

In [9]:
# Encode non-numeric values

from sklearn.preprocessing import LabelEncoder

x_data = lived_df.drop(['LENGTH_OF_STAY'], axis=1)
encodedData = x_data.apply(LabelEncoder().fit_transform)
encodedData.head()

Unnamed: 0,"Chloride, Whole Blood",Glucose_Chemistry,"Hematocrit, Calculated",Hemoglobin_Blood_Gas,Lactate,"Potassium, Whole Blood","Sodium, Whole Blood",Anion Gap,Albumin,Bicarbonate,...,Potassium,Sodium,Urea Nitrogen,Hematocrit,Hemoglobin_Hematology,Platelet Count,PTT,INR(PT),PT,White Blood Cells
0,0,89,0,0,0,0,0,14,0,33,...,31,38,17,249,95,256,141,9,37,53
1,0,115,27,82,0,27,36,13,0,26,...,25,41,14,109,52,92,260,26,90,132
2,39,128,37,115,90,26,29,14,32,32,...,29,38,16,279,107,213,104,7,32,167
3,0,87,37,117,0,0,0,15,0,27,...,27,37,13,294,114,212,62,7,37,142
4,0,365,23,71,63,0,0,20,23,26,...,19,32,50,269,104,266,186,7,41,187


In [10]:
# Round the y data
lived_df['LENGTH_OF_STAY'] = lived_df['LENGTH_OF_STAY'].round()

In [11]:
# Ensure data shape is appropriate

X = encodedData
y = lived_df['LENGTH_OF_STAY'].values.reshape(-1, 1)
print(f"data shape: {X.shape}")
print(f"target shape: {y.shape}")

data shape: (52294, 26)
target shape: (52294, 1)


In [12]:
# Create test and train sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

print(f"----train & test sizes-----")
print(f"train size: {len(X_train)}, keys: {len(X_train.keys())}")
print(f"test size: {len(X_test)}, keys: {len(X_test.keys())}")

----train & test sizes-----
train size: 39220, keys: 26
test size: 13074, keys: 26


In [13]:
model = RandomForestRegressor(n_estimators=128)

In [14]:
model.fit(X_train, y_train.ravel())

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=128, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [15]:
predictions = model.predict(X_test)
predictions

array([ 5.90625  , 28.3203125,  9.125    , ...,  6.6484375, 36.546875 ,
        6.1875   ])

In [16]:
MSE = mean_squared_error(y_test, predictions)
r2 = model.score(X_test, y_test)

print(f"MSE: {MSE}, R2: {r2}")

MSE: 84.63429074777314, R2: 0.4408058159385452


In [17]:
# create a dataframe of the importance features sorted
feature_names = X_train.columns
importances = sorted(zip(model.feature_importances_, feature_names), reverse=True)
importance_df = pd.DataFrame(importances)
importance_df = importance_df.rename(columns={0:"Importance",1:"Feature"})
importance_df

Unnamed: 0,Importance,Feature
0,0.123055,"Bilirubin, Total"
1,0.103075,Phosphate
2,0.077392,Glucose_Chemistry
3,0.067639,Albumin
4,0.050545,Magnesium
5,0.05035,Glucose_Blood_Gas
6,0.048674,Platelet Count
7,0.042147,Hemoglobin_Hematology
8,0.040927,White Blood Cells
9,0.03552,Creatinine
