<br>
## Feature importance with Random Forest<br>
Building a random forest model for feature importance. Should produce similar results to a statistical test for inference, but displays data in a way that is easier to interpret without background knowledge of p-values, F-tests, etc.

In [24]:
#load libraries
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier

from IPython.display import display #displays full dataframe columns
#display all dataframe columns when printed
pd.options.display.max_columns = None

In [None]:
#load the transformed, dummy-coded dataset
# NOTE(review): absolute, machine-specific path -- the notebook will not run elsewhere until this is made configurable
data_path = 'C:/Users/Mark.Burghart/Documents/projects/hospice_carepoint/data/transformed/carepoint_transformed_dummied.csv'
df = pd.read_csv(
    data_path,
    index_col=0,  # first CSV column is used as the row index
)
df.shape

(271541, 120)

In [None]:
#separate variables (X) from outcome of interest (y)
OUTCOME_COL = 'death_within_7_days'

#column names as a plain Python list
#(Index.get_values() was removed in pandas 1.0; tolist() gives the same list)
cols = df.columns.tolist()
feature_cols = [x for x in cols if x != OUTCOME_COL] #every column except the outcome of interest

#labels reused later when printing/plotting feature importances
feat_labels = feature_cols

#feature matrix: all rows, outcome column excluded
X = df.loc[:, feature_cols]

#save outcome variable as y
y = df[OUTCOME_COL]

#separate data into training/test (aka holdout) sets: 70% train / 30% test
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size = 0.30,
    random_state = 23, #random_state for reproducibility (if needed)
)

In [None]:
#impute missing values: NaNs are replaced by medians computed on the training split only
feature_medians = X_train.median()  # one median per feature column
X_train_med = X_train.fillna(feature_medians)

# NOTE(review): this also fills missing outcome labels with the outcome's median;
# confirm that imputing (rather than dropping) unlabeled rows is intended
outcome_median = y_train.median()
y_train_med = y_train.fillna(outcome_median)

In [None]:
#show the feature names (every column except the outcome), in the order they are used for the importance labels
feat_labels

In [None]:
%%time
# random forest with 500 trees for MEDIAN IMPUTATION dataset
forest_med = RandomForestClassifier(n_estimators = 500, random_state = 1, n_jobs= -1) #parallize across available cores
forest_med.fit(X_train_med, y_train_med)

In [None]:
# per-feature importance scores from the fitted forest, plus the column
# positions sorted from most to least important
importances = forest_med.feature_importances_
indices = np.argsort(importances)[::-1]

# the leading top_k positions are reused by the top-k printout and the plot
top_k = 10
new_indices = indices[:top_k]

print("Feature ranking:")
n_features = X_train_med.shape[1]
for rank, feat_idx in enumerate(indices[:n_features], start=1):
    row = (rank, 30, feat_labels[feat_idx], importances[feat_idx])  # 30 = label column width
    print("%2d) %-*s %f" % row)

In [None]:
#print out just top K features
# header is derived from the number of ranked entries instead of a hard-coded "10",
# so it stays correct if top_k is changed or there are fewer features than top_k
print("Top %d Feature Ranking:" % len(new_indices))

# iterate over the selected positions directly; indexing by range(top_k)
# would raise IndexError when fewer than top_k features exist
for rank, feat_idx in enumerate(new_indices, start=1):
    print("%d. %-*s %f" % (rank, 30, feat_labels[feat_idx], importances[feat_idx]))


In [None]:
#Plot the importances of the top-k features of the forest
# Tick labels must be the names of the plotted (top-ranked) features.
# Passing the full feat_labels list gives more labels than ticks, and its
# column order does not match the importance ranking.
top_labels = [feat_labels[i] for i in new_indices]
bar_positions = range(len(new_indices))

plt.figure()
plt.title("Feature importances")
plt.bar(bar_positions, importances[new_indices],
       color="b", align="center")

plt.xticks(bar_positions, top_labels, rotation = 90)
plt.xlim([-1, len(new_indices)])
plt.ylabel("Importance")
plt.show()