<br>
## Feature importance with Random Forest<br>
Building a random forest model to estimate feature importance. It should produce results similar to those of a statistical test for inference, but it presents them in a way that is easier to interpret without background knowledge of p-values, F-tests, etc.

In [3]:
#load libraries
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18 only ships the legacy module
    from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier

from IPython.display import display #displays full dataframe columns
#never truncate the column list when a dataframe is displayed
pd.set_option('display.max_columns', None)

In [4]:
#read the CSV into a dataframe, using its first column as the index
#NOTE(review): this is an absolute, machine-specific path -- the notebook will
#only run where this exact location exists; consider a configurable data dir.
DATA_PATH = 'C:/Users/Mark.Burghart/Documents/projects/hospice_carepoint/data/transformed/carepoint_transformed_dummied.csv'
df = pd.read_csv(DATA_PATH, index_col=0)
#(rows, columns) of the loaded data, shown as the cell output
df.shape

(271541, 120)

In [8]:
#separate variables (X) from outcome of interest (y)
TARGET_COL = 'death_within_7_days'

#every column except the outcome is a feature; Index.tolist() replaces the
#removed Index.get_values() call and yields a plain Python list directly
feature_cols = [col for col in df.columns.tolist() if col != TARGET_COL]

#feat_labels is the name later cells use when reporting importances
feat_labels = feature_cols

#feature matrix: all rows, feature columns only (outcome column excluded)
X = df.loc[:, feature_cols]

#outcome variable
y = df[TARGET_COL]

#separate data into training/test (aka holdout) sets: 30% of rows are held out;
#random_state is fixed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 23)

In [6]:
#median imputation: every NaN in a training column is replaced by that column's median
feature_medians = X_train.median()
X_train_med = X_train.fillna(feature_medians)

#the outcome series gets the same treatment
#NOTE(review): this imputes the label itself -- confirm y_train can actually
#contain NaNs and that filling them with the median is intended.
outcome_median = y_train.median()
y_train_med = y_train.fillna(outcome_median)

I'd like to assess two random forests for feature significance: one trained on the data with no imputation, and one trained on the median-imputed data.

In [9]:
#show the feature names (every column except the outcome), in column order
feat_labels

['gender',
 'Anxiety',
 'Depression',
 'Drowsiness',
 'Lack_of_Appetite',
 'Nausea',
 'Pain',
 'Shortness_of_Breath',
 'Tiredness',
 'Wellbeing',
 'LengthOfCare_days',
 '3_visit_max_anxiety',
 '3_visit_max_depression',
 '3_visit_max_drowsiness',
 '3_visit_max_lackofappetite',
 '3_visit_max_nausea',
 '3_visit_max_pain',
 '3_visit_max_shortnessofbreath',
 '3_visit_max_tiredness',
 '3_visit_max_wellbeing',
 '5_visit_max_anxiety',
 '5_visit_max_depression',
 '5_visit_max_drowsiness',
 '5_visit_max_lackofappetite',
 '5_visit_max_nausea',
 '5_visit_max_pain',
 '5_visit_max_shortnessofbreath',
 '5_visit_max_tiredness',
 '5_visit_max_wellbeing',
 '3_visit_mean_anxiety',
 '3_visit_mean_depression',
 '3_visit_mean_drowsiness',
 '3_visit_mean_lackofappetite',
 '3_visit_mean_nausea',
 '3_visit_mean_pain',
 '3_visit_mean_shortnessofbreath',
 '3_visit_mean_tiredness',
 '3_visit_mean_wellbeing',
 '5_visit_mean_anxiety',
 '5_visit_mean_depression',
 '5_visit_mean_drowsiness',
 '5_visit_mean_lackofappe

In [None]:
%%time
#random forest with 500 trees for MISSING DATA datasets
forest = RandomForestClassifier(n_estimators = 500, random_state = 1)
forest.fit(X_train, y_train)

In [None]:
#rank the features by the importances reported by the fitted forest,
#most important first
importances = forest.feature_importances_
indices = np.argsort(importances)[::-1]
n_features = X_train.shape[1]

#feature names re-ordered to line up with the sorted importances
sorted_labels = [feat_labels[i] for i in indices]

#text report: rank, feature name (left-aligned in 30 chars), importance
for rank, (label, score) in enumerate(zip(sorted_labels, importances[indices]), start=1):
    print("%2d) %-*s %f" % (rank, 30, label, score))

plt.title('Variable Importance')
plt.bar(range(n_features),
        importances[indices],
        align='center')

#the bars are drawn in descending-importance order, so the tick labels must
#follow the same order; passing feat_labels unsorted would attach feature
#names to the wrong bars
plt.xticks(range(n_features),
           sorted_labels, rotation = 90)
plt.xlim([-1, n_features])
plt.tight_layout()
plt.show()

In [None]:
%%time
# random forest with 500 trees for MEDIAN IMPUTATION dataset
forest_med = RandomForestClassifier(n_estimators = 500, random_state = 1)
forest_med.fit(X_train_med, y_train_med)

In [None]:
#rank the features by the importances reported by the forest trained on the
#median-imputed data, most important first
importances = forest_med.feature_importances_
indices = np.argsort(importances)[::-1]
n_features = X_train_med.shape[1]

#feature names re-ordered to line up with the sorted importances
sorted_labels = [feat_labels[i] for i in indices]

#text report: rank, feature name (left-aligned in 30 chars), importance
for rank, (label, score) in enumerate(zip(sorted_labels, importances[indices]), start=1):
    print("%2d) %-*s %f" % (rank, 30, label, score))

plt.title('Variable Importance')
plt.bar(range(n_features),
        importances[indices],
        align='center')

#the bars are drawn in descending-importance order, so the tick labels must
#follow the same order; passing feat_labels unsorted would attach feature
#names to the wrong bars
plt.xticks(range(n_features),
           sorted_labels, rotation = 90)
plt.xlim([-1, n_features])
plt.tight_layout()
plt.show()