## Import packages

In [13]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
%matplotlib notebook

## Read Data from csv files

In [14]:
# Load the two billing tables; they are joined on 'bill_id' further down.
bill_amt = pd.read_csv('bill_amount.csv')
bill_id = pd.read_csv('bill_id.csv')

# Clinical records: rename the key column to 'patient_id' so it matches the
# merge key used later, and encode 'Yes'/'No' values as 1/0.
clinical_data = (
    pd.read_csv('clinical_data.csv')
    .rename(columns={'id': 'patient_id'})
    .replace(['Yes', 'No'], [1, 0])
)

# Demographics: expand the 'f'/'m' abbreviations and normalise two
# inconsistently spelled labels.
demo_data = (
    pd.read_csv('demographics.csv')
    .replace(['f', 'm'], ['Female', 'Male'])
    .replace(['India', 'chinese'], ['Indian', 'Chinese'])
)

## Join Dataframes 

In [15]:
# Attach an amount to every bill record; the right join keeps all rows of bill_id.
df = pd.merge(bill_amt, bill_id, how='right', on='bill_id')

# Total billed amount per patient and admission. The string alias 'sum' is used
# because passing np.sum to agg() emits a FutureWarning in recent pandas versions.
df = (
    df.groupby(['patient_id', 'date_of_admission'])
    .agg({'amount': 'sum'})
    .reset_index()
)

# Clinical records enriched with demographics (every clinical row is kept).
df1 = pd.merge(clinical_data, demo_data, how='left', on='patient_id')

# One row per clinical record with its total bill. The right join keeps
# admissions that have no matching bill; their 'amount' is missing (NaN) here.
# NOTE(review): the join assumes 'date_of_admission' is formatted identically
# in both sources at this point - confirm against the CSV files.
df2 = pd.merge(df, df1, how='right', on=['patient_id', 'date_of_admission'])

## Convert date columns to DateTime format and handle missing data values

In [16]:
# Parse the date columns once so datetime arithmetic and the .dt accessor work.
date_columns = ['date_of_discharge', 'date_of_admission', 'date_of_birth']
for column in date_columns:
    df2[column] = pd.to_datetime(df2[column])

# Length of stay in whole days. The columns are already datetimes here, so
# they are subtracted directly instead of being parsed a second time.
df2['days_in_hospital'] = (df2['date_of_discharge'] - df2['date_of_admission']).dt.days

# Replace missing values with 0 in every non-date column. The date columns are
# left out on purpose: filling a datetime column with the integer 0 either
# fabricates an epoch date or degrades the column to object dtype, which breaks
# the later `.dt.year` access on 'date_of_birth'.
df2 = df2.fillna({column: 0 for column in df2.columns if column not in date_columns})

## Create Age column from DOB data and create age groups 

In [17]:
#age_group = ['Group1','Group2','Group3','Group4','Group5']
def age_group_fun(dl):
    """Map an age in years to its age-bracket label.

    Brackets are ten years wide and closed on the right:
    (24, 34] -> 'age_group1', (34, 44] -> 'age_group2',
    (44, 54] -> 'age_group3', (54, 64] -> 'age_group4',
    and anything above 64 -> 'age_group5'.

    Any value that exceeds none of the lower bounds (24 or below, or NaN,
    for which every comparison is false) yields the string 'None'.
    """
    # Ordered from the oldest bracket down: the first lower bound that the
    # age exceeds decides the label.
    brackets = (
        (64, 'age_group5'),
        (54, 'age_group4'),
        (44, 'age_group3'),
        (34, 'age_group2'),
        (24, 'age_group1'),
    )
    for lower_bound, label in brackets:
        if dl > lower_bound:
            return label
    return 'None'
# Age in years at admission (difference of calendar years). It is anchored to
# the admission date rather than today's date, so the feature does not drift
# with the year in which the notebook happens to be re-run.
df2['age'] = df2['date_of_admission'].dt.year - df2['date_of_birth'].dt.year
# Bracket label for every row, in the same row order as df2.
age_group = df2['age'].map(age_group_fun)


## Create Categorical Variables

In [18]:
# One-hot encode the age bracket and each demographic column.
age_group_category = pd.get_dummies(age_group)
gender, resident_status, race = (
    pd.get_dummies(df2[column])
    for column in ('gender', 'resident_status', 'race')
)

# Append the indicator columns to the right of the existing frame.
indicator_frames = [age_group_category, gender, resident_status, race]
df2 = pd.concat([df2] + indicator_frames, axis=1)


## Define bill amount as Target variable and define feature variables.

In [19]:
# Columns that are not model inputs: raw dates, identifiers, the target itself,
# 'age', and the categorical columns that were one-hot encoded above.
excluded_columns = [
    'date_of_birth', 'age', 'date_of_admission',
    'date_of_discharge', 'patient_id', 'amount',
    'gender', 'race', 'resident_status',
]
feature_frame = df2.drop(columns=excluded_columns)

# Keep the column labels so importances can be mapped back to names later.
all_feature_names = feature_frame.columns
X_train = np.array(feature_frame)

# Target: the bill amount.
Y_train = np.array(df2['amount'])

## Define the Random Forest regression model, train it, and evaluate the important features

In [24]:
# Import the regressor and the cross-validation helpers
from sklearn.ensemble import RandomForestRegressor
from sklearn import model_selection

# 10-fold cross-validation. shuffle=True is required for random_state to have
# any effect; recent scikit-learn versions raise a ValueError when random_state
# is set while shuffle is left at False.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=0)
# Seed the forest as well so repeated runs produce the same scores.
model = RandomForestRegressor(n_estimators=1000, random_state=0)
scoring = 'r2'

# cross_val_score fits a clone of `model` on each fold and returns one R^2 per
# fold; `model` itself remains unfitted after this call.
results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
print("R^2: {:.3f}".format(results.mean()))

R^2: %.3f (%.3f)


(None, (0.88717716484791498, 0.022712865620763231))

In [9]:
# Feature names ordered from most to least important.
# `rf` is not defined by any earlier cell, and cross-validation leaves `model`
# unfitted, so fit it on the full training data here before reading importances.
rf = model.fit(X_train, Y_train)
features = rf.feature_importances_
# reverse=True gives descending importance, as the description above states
# (the previous sort was ascending).
imp_features_index = sorted(range(len(features)), key=lambda k: features[k], reverse=True)
print(all_feature_names[imp_features_index])