In [36]:
%matplotlib inline

#load libraries
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.cross_validation import train_test_split
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, Imputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.feature_selection import SelectKBest, f_classif, RFECV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict
from IPython.display import display #displays full dataframe columns
# Never truncate columns when a DataFrame is rendered: a limit of None
# tells pandas to show every column.
pd.set_option('display.max_columns', None)

In [18]:
# Load the modelling table from CSV; the first CSV column becomes the row index.
# NOTE(review): this is an absolute path on one workstation - the notebook
# will not run elsewhere until it is pointed at a shared/relative data location.
DATA_PATH = 'C:/Users/Mark.Burghart/Documents/projects/hospice_carepoint/data/transformed/carepoint_transformed_dummied.csv'
df = pd.read_csv(DATA_PATH, index_col=0)
df.shape

(271541, 120)

In [19]:
#separate variables (X) from outcome of interest (y)
# Index.tolist() is used instead of Index.get_values(), which is deprecated
# and no longer exists in pandas >= 1.0; both yield the column names in order.
cols = df.columns.tolist() #column names as a plain Python list
feature_cols = [col for col in cols if col != 'death_within_7_days'] #every column except the outcome ('death_within_7_days')

#keep all rows, predictor columns only
X = df.loc[:, feature_cols]
X.shape #one column fewer than df: the outcome column has been removed

(271541, 119)

In [20]:
# Outcome vector: the 'death_within_7_days' column of df, one value per row.
y = df['death_within_7_days']
y.shape

(271541,)

In [21]:
# Hold out 30% of the rows as the test set; the fixed random_state makes the
# split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=99,
)

# Median imputation, using statistics computed from the training rows only:
# every NaN in a predictor column is replaced by that column's training median.
train_feature_medians = X_train.median()
X_train_med = X_train.fillna(train_feature_medians)

# The outcome is filled the same way, with the median of the training outcome.
# NOTE(review): this assigns a label to rows whose outcome is missing - confirm
# that is intended rather than dropping those rows.
# NOTE(review): X_test / y_test are not imputed in this cell.
train_outcome_median = y_train.median()
y_train_med = y_train.fillna(train_outcome_median)


In [27]:
#random forest feature selection for top 40 variables
def selectKImportance(model, X, k=40):
    """Return the k columns of X that the fitted model ranks as most important.

    Parameters
    ----------
    model : object exposing a ``feature_importances_`` array with one entry
        per column of X (e.g. a fitted tree ensemble).
    X : pandas.DataFrame whose columns are in the same order as
        ``model.feature_importances_``.
    k : int, default 40
        Number of columns to keep. If k exceeds the number of columns,
        all columns are returned.

    Returns
    -------
    pandas.DataFrame
        The selected columns of X, ordered from most to least important.
    """
    # argsort sorts ascending, so reverse it to get most-important-first.
    ranked_positions = model.feature_importances_.argsort()[::-1]
    top_positions = ranked_positions[:k]
    return X.iloc[:, top_positions]

In [30]:
%%time
# Fit a 500-tree random forest on the median-imputed training data.
# fit() returns the estimator itself, so `model` is the fitted forest.
model = RandomForestClassifier(
    n_estimators=500,
    random_state=1,
    n_jobs=-1,
).fit(X_train_med, y_train_med)

# Keep only the 40 training columns the forest ranks as most important.
newX = selectKImportance(model, X_train_med, k=40)

Wall time: 1min 19s


In [31]:
newX.head()

Unnamed: 0_level_0,Lack_of_Appetite,Age,LengthOfCare_days,Drowsiness,Tiredness,3_visit_max_lackofappetite,ESAS_visit_total,Wellbeing,3_visit_mean_lackofappetite,Shortness_of_Breath,3_visit_max_drowsiness,3_visit_max_tiredness,3_visit_mean_tiredness,Depression,3_visit_mean_drowsiness,Anxiety,5_visit_max_lackofappetite,5_visit_mean_lackofappetite,Pain,LevelofCare_Inpatient (GIP),LackofAppetite_change,3_visit_mean_pain,5_visit_mean_tiredness,ShortnessofBreath_change,Pain_change,5_visit_mean_pain,5_visit_mean_drowsiness,3_visit_mean_shortnessofbreath,Drowsiness_change,Tiredness_change,3_visit_max_pain,LevelofCare_Routine,3_visit_mean_esas,3_visit_mean_wellbeing,ESAS_change,3_visit_max_shortnessofbreath,5_visit_mean_shortnessofbreath,3_visit_max_esas,5_visit_max_pain,5_visit_max_drowsiness
s,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1
13884,4.0,96.910956,110,7.0,5.0,5.0,24.0,5.0,4.0,0.0,7.0,6.0,5.0,1.0,4.333333,0.0,5.0,4.0,0.0,0,0.0,1.0,5.0,0.0,-3.0,0.6,5.0,0.0,7.0,0.0,3.0,1,23.333333,5.0,0.0,0.0,0.0,27.0,3.0,7.0
192375,4.0,93.118955,1,8.0,5.0,5.0,39.0,8.0,4.0,8.0,5.0,6.0,5.0,2.0,3.0,4.0,5.0,4.0,0.0,0,2.0,0.666667,5.0,5.0,0.0,0.8,3.0,1.0,5.0,0.0,2.0,1,23.333333,5.0,18.0,2.0,1.2,27.0,2.0,5.0
25251,1.0,94.69599,13,5.0,4.0,2.0,27.0,5.0,1.333333,4.0,5.0,4.0,4.0,6.0,5.0,2.0,2.0,1.6,0.0,0,0.0,0.0,4.0,0.0,0.0,0.0,5.0,4.666667,0.0,0.0,0.0,1,28.0,5.0,0.0,6.0,5.2,30.0,0.0,5.0
241192,9.0,55.741049,14,3.0,5.0,5.0,24.0,5.0,4.0,0.0,5.0,6.0,5.0,1.0,3.0,0.0,5.0,4.0,0.0,0,0.0,1.666667,5.0,0.0,-5.0,1.0,3.0,0.0,0.0,0.0,5.0,1,23.333333,5.0,0.0,0.0,0.2,27.0,5.0,5.0
144943,1.0,77.868813,57,0.0,3.0,1.0,20.0,5.0,1.0,0.0,0.0,3.0,3.0,0.0,0.0,6.0,3.0,1.8,5.0,0,0.0,5.0,3.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,5.0,1,20.0,5.0,0.0,0.0,0.0,20.0,5.0,0.0


In [32]:
%%time
#logistic regression model optimized from grid search
# The solver is named explicitly because the L1 penalty needs a solver that
# supports it, so the model no longer relies on the library's default solver.
# n_jobs is intentionally not passed: liblinear ignores it and scikit-learn
# emits a warning on every fit (including each cross-validation fold).
clf = LogisticRegression(
    penalty='l1',
    solver='liblinear',
    C=100,
    class_weight='balanced', #balances classes based on frequency. Helpful for unbalanced classes, like this.
    random_state=30,
)

# Now fit the model on the selected columns of the median-imputed training data
clf.fit(newX, y_train_med)

  " = {}.".format(self.n_jobs))


Wall time: 29.6 s


In [38]:

# Out-of-fold predictions from 10-fold cross-validation on the training data,
# followed by the confusion matrix of those predictions against the outcome.
# NOTE(review): the columns in newX were chosen by a forest fitted on all of
# X_train_med before this cross-validation, so the selection step has already
# seen every fold's rows.
y_pred = cross_val_predict(clf, newX, y_train_med, cv=10)
conf_matrix_med = metrics.confusion_matrix(y_true=y_train_med, y_pred=y_pred)

  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))


In [41]:
# Confusion matrix of the cross-validated predictions (rows: actual outcome,
# columns: predicted outcome), recomputed here so this cell only needs y_pred.
conf_matrix_med = metrics.confusion_matrix(y_true=y_train_med, y_pred=y_pred)
print(conf_matrix_med)


[[110065  34841]
 [ 14435  30737]]


In [42]:
# Report the fitted logistic regression's parameters.
print("intercept: ")
print(clf.intercept_)

# coef_[0] is the coefficient row for this model; it is one-dimensional, so it
# is printed as-is, in the same order as the columns of newX.
print("\nCoefficients: ")
print(clf.coef_[0])

intercept: 
[-1.86936277]

Coefficients: 
[ 0.17713789 -0.00126859 -0.00498362  0.12660198  0.04892874  0.1250438
 -0.01120751  0.0810215  -0.07395165  0.08885137  0.1880057  -0.09333166
  0.1129622  -0.14305066 -0.18585179  0.01170255  0.04825    -0.05932982
 -0.01535649  1.1320527  -0.05360337 -0.08932828  0.00696595 -0.01756443
  0.00954881 -0.08203134 -0.05456966 -0.29078119 -0.04846668 -0.00657867
  0.12050158 -0.56240568  0.02750865  0.00721451  0.00846768  0.24071721
 -0.0294165  -0.02591353  0.06404163  0.01832137]


In [45]:
#Calculate Odds Ratios
print("\n Odds Ratios: ")

# One row per selected feature: its name, its fitted coefficient, and the
# exponentiated coefficient (the odds ratio).
log_odds = clf.coef_[0]
df_or = pd.DataFrame(
    {
        'Features': newX.columns,
        'coefficient': log_odds,
        'Odds Ratio': np.exp(log_odds),
    },
    columns=['Features', 'coefficient', 'Odds Ratio'],
)
print(df_or)


 Odds Ratios: 
                          Features  coefficient  Odds Ratio
0                 Lack_of_Appetite     0.177138    1.193796
1                              Age    -0.001269    0.998732
2                LengthOfCare_days    -0.004984    0.995029
3                       Drowsiness     0.126602    1.134965
4                        Tiredness     0.048929    1.050146
5       3_visit_max_lackofappetite     0.125044    1.133198
6                 ESAS_visit_total    -0.011208    0.988855
7                        Wellbeing     0.081022    1.084394
8      3_visit_mean_lackofappetite    -0.073952    0.928717
9              Shortness_of_Breath     0.088851    1.092918
10          3_visit_max_drowsiness     0.188006    1.206840
11           3_visit_max_tiredness    -0.093332    0.910891
12          3_visit_mean_tiredness     0.112962    1.119590
13                      Depression    -0.143051    0.866710
14         3_visit_mean_drowsiness    -0.185852    0.830397
15                      