In [5]:
import pandas as pd
import csv
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

data = pd.read_csv('E:/waterloo_documents/Health_Informatics/training_data_count.csv')

print('Shape of data is: ',data.shape)
print('\nColumns are: ',data.columns)

Shape of data is:  (61532, 42)

Columns are:  Index(['icustay_id', 'hadm_id', 'age', 'gcs', 'urineoutput', 'heartrate_max',
       'heartrate_min', 'heartrate_mean', 'meanbp_max ', 'meanbp_min',
       'meanbp_mean', 'resprate_max', 'resprate_min', 'resprate_mean',
       'tempc_max', 'tempc_min', 'tempc_mean', 'mechvent', 'wbc_max',
       'wbc_min', 'wbc_mean', 'glucose_max', 'glucose_min', 'glucose_mean',
       'sodium_max', 'sodium_min', 'sodium_mean', 'potassium_max',
       'potassium_min', 'potassium_mean', 'bicarbonate_max', 'bicarbonate_min',
       'bicarbonate_mean', 'evaluation_and_management', 'surgery', 'radiology',
       'anesthesia', 'emerging_technology', 'pathology_and_laboratory',
       'performance_measurement', 'medicine', 'oasis'],
      dtype='object')


In [4]:
data.head()

Unnamed: 0,icustay_id,hadm_id,age,gcs,urineoutput,heartrate_max,heartrate_min,heartrate_mean,meanbp_max,meanbp_min,...,bicarbonate_mean,evaluation_and_management,surgery,radiology,anesthesia,emerging_technology,pathology_and_laboratory,performance_measurement,medicine,oasis
0,200016,117458,67,15.0,1275.0,91.0,59.0,67.185185,104.0,67.0,...,22.0,,,,,,,,,17
1,200033,198650,67,14.0,2300.0,100.0,60.0,73.26087,98.0,72.0,...,24.0,26.0,6.0,1.0,0.0,0.0,0.0,0.0,11.0,24
2,200055,147080,31,14.0,,101.0,72.0,86.875,85.0,53.0,...,29.5,5.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,20
3,200060,158405,0,,76.0,158.0,107.0,123.782609,,,...,,,,,,,,,,25
4,200099,175374,79,14.0,955.0,93.0,68.0,82.65625,102.0,50.0,...,22.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,24


In [4]:
#drop the rows having null values
data.dropna(inplace = True)
print('Shape of data after removing null values: ',data.shape)

Shape of data after removing null values:  (42490, 42)


In [5]:
# Labels are the values we want to predict
labels = data['oasis']

#drop the label from the training data
data= data.drop('oasis', axis = 1)
feature_list = list(data.columns)

print("Feature list is : \n", feature_list)
features = np.array(data)
print('\nFeature Shape is: ',features.shape)

Feature list is : 
 ['icustay_id', 'hadm_id', 'age', 'gcs', 'urineoutput', 'heartrate_max', 'heartrate_min', 'heartrate_mean', 'meanbp_max ', 'meanbp_min', 'meanbp_mean', 'resprate_max', 'resprate_min', 'resprate_mean', 'tempc_max', 'tempc_min', 'tempc_mean', 'mechvent', 'wbc_max', 'wbc_min', 'wbc_mean', 'glucose_max', 'glucose_min', 'glucose_mean', 'sodium_max', 'sodium_min', 'sodium_mean', 'potassium_max', 'potassium_min', 'potassium_mean', 'bicarbonate_max', 'bicarbonate_min', 'bicarbonate_mean', 'evaluation_and_management', 'surgery', 'radiology', 'anesthesia', 'emerging_technology', 'pathology_and_laboratory', 'performance_measurement', 'medicine']

Feature Shape is:  (42490, 41)


In [6]:
#split the data into training and testing data

train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.25,random_state = 42)
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)


Training Features Shape: (31867, 41)
Training Labels Shape: (31867,)
Testing Features Shape: (10623, 41)
Testing Labels Shape: (10623,)


In [7]:
#create a Random Forest regressor model
rf = RandomForestRegressor(n_estimators= 1000, random_state=42)

# Train the model on training data
rf.fit(train_features, train_labels);

#make predictions on the test data
predictions = rf.predict(test_features)


In [8]:
print(predictions)

[39.153 40.279 31.106 ... 24.928 35.48  38.631]


In [9]:
# Calculate the absolute errors
errors = abs(predictions - test_labels)

# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2))

Mean Absolute Error: 2.47


In [10]:
# Calculate and display accuracy

mape = 100 * (errors / test_labels)
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

Accuracy: 91.25 %.


In [11]:
df = pd.DataFrame(test_features, columns = ['icustay_id', 'hadm_id', 'age', 'gcs', 'urineoutput', 'heartrate_max', 'heartrate_min', 'heartrate_mean', 'meanbp_max ', 'meanbp_min', 'meanbp_mean', 'resprate_max', 'resprate_min', 'resprate_mean', 'tempc_max', 'tempc_min', 'tempc_mean', 'mechvent', 'wbc_max', 'wbc_min', 'wbc_mean', 'glucose_max', 'glucose_min', 'glucose_mean', 'sodium_max', 'sodium_min', 'sodium_mean', 'potassium_max', 'potassium_min', 'potassium_mean', 'bicarbonate_max', 'bicarbonate_min', 'bicarbonate_mean', 'evaluation_and_management', 'surgery', 'radiology', 'anesthesia', 'emerging_technology', 'pathology_and_laboratory', 'performance_measurement', 'medicine'])
columns = [ 'age', 'gcs', 'urineoutput', 'heartrate_max', 'heartrate_min', 'heartrate_mean', 'meanbp_max ', 'meanbp_min', 'meanbp_mean', 'resprate_max', 'resprate_min', 'resprate_mean', 'tempc_max', 'tempc_min', 'tempc_mean', 'mechvent', 'wbc_max', 'wbc_min', 'wbc_mean', 'glucose_max', 'glucose_min', 'glucose_mean', 'sodium_max', 'sodium_min', 'sodium_mean', 'potassium_max', 'potassium_min', 'potassium_mean', 'bicarbonate_max', 'bicarbonate_min', 'bicarbonate_mean', 'evaluation_and_management', 'surgery', 'radiology', 'anesthesia', 'emerging_technology', 'pathology_and_laboratory', 'performance_measurement', 'medicine']
df.drop(columns, inplace = True, axis = 1)

#added the scores predicted in dataframe
df['predictions'] = predictions

#The mean value of the scores of all the patients
mean = 29.650328284469868

#Calculated the number of nurses based on the mean value
df['nurses'] = df['predictions']/mean

print(df.head())

#save the data in a csv file
df.to_csv('test_data.csv')

   icustay_id   hadm_id  predictions    nurses
0    216996.0  188764.0       39.153  1.320491
1    276434.0  162146.0       40.279  1.358467
2    247575.0  188204.0       31.106  1.049095
3    256155.0  110299.0       29.383  0.990984
4    285109.0  162397.0       37.945  1.279750
