In [1]:
# Code Dependencies
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# ML Dependencies
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [2]:
# Read the csv file into a pandas DataFrame
labs = pd.read_csv('../Resources/labsNew.csv')
labs.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,Albumin,Anion Gap,Bicarbonate,"Bilirubin, Total",Chloride,"Chloride, Whole Blood",Creatinine,Glucose_Blood_Gas,...,PT,PTT,Phosphate,Platelet Count,Potassium,"Potassium, Whole Blood",Sodium,"Sodium, Whole Blood",Urea Nitrogen,White Blood Cells
0,2,163353.0,0.0,0.0,0.0,9.3,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.1
1,3,145834.0,1.8,17.0,25.0,0.8,99.0,114.0,3.2,265.0,...,14.8,125.7,4.8,179.0,5.4,3.7,136.0,139.0,36.0,15.1
2,4,185777.0,2.8,17.0,24.0,2.2,97.0,0.0,0.5,0.0,...,12.3,31.3,3.2,207.0,3.1,0.0,135.0,0.0,9.0,9.7
3,5,178980.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,309.0,0.0,0.0,0.0,0.0,0.0,13.9
4,6,107064.0,2.7,17.0,16.0,0.2,107.0,95.0,3.5,106.0,...,12.5,55.2,4.1,198.0,4.9,4.2,135.0,135.0,86.0,22.7


In [3]:
patientsData = pd.read_csv('../../Resources/PATIENTS.csv')

patientsData = patientsData[['SUBJECT_ID','EXPIRE_FLAG']]

patientsData.head()

Unnamed: 0,SUBJECT_ID,EXPIRE_FLAG
0,249,0
1,250,1
2,251,0
3,252,0
4,253,0


In [5]:
joinedData = labs.merge(patientsData, on='SUBJECT_ID')

joinedData.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,Albumin,Anion Gap,Bicarbonate,"Bilirubin, Total",Chloride,"Chloride, Whole Blood",Creatinine,Glucose_Blood_Gas,...,PTT,Phosphate,Platelet Count,Potassium,"Potassium, Whole Blood",Sodium,"Sodium, Whole Blood",Urea Nitrogen,White Blood Cells,EXPIRE_FLAG
0,2,163353.0,0.0,0.0,0.0,9.3,0.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.1,0
1,3,145834.0,1.8,17.0,25.0,0.8,99.0,114.0,3.2,265.0,...,125.7,4.8,179.0,5.4,3.7,136.0,139.0,36.0,15.1,1
2,4,185777.0,2.8,17.0,24.0,2.2,97.0,0.0,0.5,0.0,...,31.3,3.2,207.0,3.1,0.0,135.0,0.0,9.0,9.7,0
3,5,178980.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,309.0,0.0,0.0,0.0,0.0,0.0,13.9,0
4,6,107064.0,2.7,17.0,16.0,0.2,107.0,95.0,3.5,106.0,...,55.2,4.1,198.0,4.9,4.2,135.0,135.0,86.0,22.7,0


In [7]:
X = joinedData.drop(['SUBJECT_ID','HADM_ID','EXPIRE_FLAG'], axis=1)
y = joinedData["EXPIRE_FLAG"]

print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

X shape: (58112, 26)
y shape: (58112,)


In [8]:
# Create test and train sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

print(f"----train & test sizes-----")
print(f"train size: {len(X_train)}, keys: {len(X_train.keys())}")
print(f"test size: {len(X_test)}, keys: {len(X_test.keys())}")

----train & test sizes-----
train size: 43584, keys: 26
test size: 14528, keys: 26


In [15]:
model = RandomForestRegressor(n_estimators=128)

In [16]:
model.fit(X_train, y_train.ravel())

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=128, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [17]:
predictions = model.predict(X_test)
predictions

array([0.625   , 0.53125 , 0.21875 , ..., 0.734375, 0.59375 , 0.296875])

In [18]:
MSE = mean_squared_error(y_test, predictions)
r2 = model.score(X_test, y_test)

print(f"MSE: {MSE}, R2: {r2}")

MSE: 0.1712331057645151, R2: 0.2717296224258705


In [19]:
# create a dataframe of the importance features sorted
feature_names = X_train.columns
importances = sorted(zip(model.feature_importances_, feature_names), reverse=True)
importance_df = pd.DataFrame(importances)
importance_df = importance_df.rename(columns={0:"Importance",1:"Feature"})
importance_df

Unnamed: 0,Importance,Feature
0,0.153997,Urea Nitrogen
1,0.071388,PT
2,0.056627,Glucose_Chemistry
3,0.051961,Platelet Count
4,0.047195,White Blood Cells
5,0.044877,PTT
6,0.044154,Hemoglobin_Hematology
7,0.03949,Hematocrit
8,0.037411,Phosphate
9,0.036206,Albumin
