In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

#import decisiontreeclassifier
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
#import logisticregression classifier
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm
#import knn classifier
from sklearn.neighbors import KNeighborsClassifier

#for validating your classification model
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score
from sklearn import metrics
from sklearn.metrics import roc_curve, auc

# feature selection
from sklearn.feature_selection import RFE
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

  from pandas.core import datetools


# Loading data

In [2]:
df=pd.read_csv("data/heartattack_train.csv")
print df.head()

   Age  Marital_Status  Gender  Weight_Category  Cholesterol  \
0   60               2       0                1          150   
1   69               2       1                1          170   
2   52               1       0                0          174   
3   66               2       1                1          169   
4   70               3       0                1          237   

   Stress_Management  Trait_Anxiety 2nd_Heart_Attack  
0                  1             50              Yes  
1                  0             60              Yes  
2                  1             35               No  
3                  0             60              Yes  
4                  0             65              Yes  


# Data wrangling & ETL: Data cleaningg & transformation

In [3]:
#mappling or replacing
df = df.replace({'2nd_Heart_Attack': 'No'}, {'2nd_Heart_Attack': '0'})
df = df.replace({'2nd_Heart_Attack': 'Yes'}, {'2nd_Heart_Attack': '1'})

In [4]:
#or you can do this to convert object to number
df['2nd_Heart_Attack'] = df['2nd_Heart_Attack'].astype(int)

In [5]:
y = df['2nd_Heart_Attack']
X = df.drop(['2nd_Heart_Attack'], axis=1)

# Model Building & Validation with "Feature Selection"
> Feature Selection = the process of building a predictive model with few predictors

> # 1. SelectKBest 

> Removes all but the k highest scoring features (where k is the number of X variables given by data analyst)

> SelectKBest takes the results of chi-square for classification problem. chi-square tests if an individual X variable is independent of y variable. All X variables are tested. If found to be independent, the X variable is removed 

http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.chi2.html#sklearn.feature_selection.chi2

In [8]:
X_new = SelectKBest(chi2, k=3).fit_transform(X, y)
print X_new

[[  1 150  50]
 [  1 170  60]
 [  0 174  35]
 [  1 169  60]
 [  1 237  65]
 [  0 174  35]
 [  0 140  45]
 [  0 143  45]
 [  0 139  45]
 [  0 174  40]
 [  0 189  65]
 [  1 147  50]
 [  2 160  40]
 [  2 178  75]
 [  2 236  80]
 [  1 146  50]
 [  0 141  45]
 [  0 172  60]
 [  0 172  60]
 [  1 138  50]
 [  0 174  40]
 [  1 146  50]
 [  1 238  60]
 [  0 172  35]
 [  2 178  75]
 [  1 236  65]
 [  2 202  70]
 [  0 140  45]
 [  0 173  35]
 [  0 124  45]
 [  2 224  60]
 [  2 203  70]
 [  1 169  50]
 [  0 175  40]
 [  2 220  60]
 [  1 169  60]
 [  0 125  45]
 [  2 162  40]
 [  2 162  40]
 [  1 238  60]
 [  0 170  60]
 [  0 171  60]
 [  0 187  65]
 [  2 182  75]
 [  1 235  60]
 [  0 123  45]
 [  1 172  55]
 [  1 139  50]
 [  2 199  70]
 [  0 175  40]
 [  2 203  70]
 [  2 161  40]
 [  0 139  45]
 [  1 139  50]
 [  1 236  65]
 [  1 236  60]
 [  1 233  65]
 [  1 165  50]
 [  1 139  50]
 [  0 172  60]
 [  0 172  40]
 [  0 122  45]
 [  2 179  75]
 [  0 186  65]
 [  0 141  45]
 [  1 148  50]
 [  2 203 

**Weight_Category, Cholesterol, and Trait_Anxiety** are selected as three best predictors

In [9]:
# evaluate the model by splitting into train (70%) and test sets (30%)
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.3, random_state=0)
#model2 = tree.DecisionTreeClassifier(criterion='entropy', max_depth=3,min_samples_leaf=5)
dt = tree.DecisionTreeClassifier()
dt.fit(X_train, y_train)

#Model evaluation
print metrics.accuracy_score(y_test, dt.predict(X_test))
print "--------------------------------------------------------"
print metrics.confusion_matrix(y_test, dt.predict(X_test)) 
print "--------------------------------------------------------"
print metrics.classification_report(y_test, dt.predict(X_test))
print "--------------------------------------------------------"
print metrics.roc_auc_score(y_test, dt.predict(X_test))

0.928571428571
--------------------------------------------------------
[[18  1]
 [ 2 21]]
--------------------------------------------------------
             precision    recall  f1-score   support

          0       0.90      0.95      0.92        19
          1       0.95      0.91      0.93        23

avg / total       0.93      0.93      0.93        42

--------------------------------------------------------
0.930205949657


In [None]:
# visualizing the new decision tree
X_new_df = pd.DataFrame(X_new)
X_new_df = X_new_df.rename(columns={0: 'Weight_Category', 1: 'Cholesterol', 3: 'Trait_Anxiety'})
tree.export_graphviz(dt, out_file='data/decisiontree.dot', feature_names=X_new_df.columns)

In [None]:
# visualizing the new decision tree (2nd option)
from sklearn.externals.six import StringIO
import pydotplus

dot_data = StringIO() 
tree.export_graphviz(dt, out_file=dot_data, feature_names=X_new_df.columns,
                     filled=True, rounded=True, special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_pdf("data/dt.pdf")
# go to data folder and open the pdf file

In [None]:
# develop logistic regression model with X_new (only three predictors or independent variables)

X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.3, random_state=0)
lr = LogisticRegression()
lr.fit(X_train, y_train)

#Model evaluation
print metrics.accuracy_score(y_test, lr.predict(X_test))
print metrics.confusion_matrix(y_test, lr.predict(X_test))
print metrics.classification_report(y_test, lr.predict(X_test))
print metrics.roc_auc_score(y_test, lr.predict(X_test))

> # 2. Recursive Feature Selection (RFE)

In [12]:
model = LogisticRegression()
rfe = RFE(model, 4)  #asking four best attributes
rfe = rfe.fit(X, y)
# summarize the selection of the attributes
print(rfe.support_)
print(rfe.ranking_)

[ True  True False  True False  True False]
[1 1 3 1 4 1 2]


In [13]:
X.head(1)

Unnamed: 0,Age,Marital_Status,Gender,Weight_Category,Cholesterol,Stress_Management,Trait_Anxiety
0,60,2,0,1,150,1,50


In [14]:
# Features sorted by their rank
pd.DataFrame({'feature':X.columns, 'importance':rfe.ranking_})

Unnamed: 0,feature,importance
0,Age,1
1,Marital_Status,1
2,Gender,3
3,Weight_Category,1
4,Cholesterol,4
5,Stress_Management,1
6,Trait_Anxiety,2


In [15]:
#here I select 4 most significant features only (including Age)
X_logistic = df[['Age', 'Marital_Status', 'Weight_Category', 'Stress_Management']]
print X_logistic.head()

   Age  Marital_Status  Weight_Category  Stress_Management
0   60               2                1                  1
1   69               2                1                  0
2   52               1                0                  1
3   66               2                1                  0
4   70               3                1                  0


In [16]:
X_train, X_test, y_train, y_test = train_test_split(X_logistic, y, test_size=0.3, random_state=0)
lr = LogisticRegression()
lr.fit(X_train, y_train)

#Model evaluation
print metrics.accuracy_score(y_test, lr.predict(X_test))
print metrics.confusion_matrix(y_test, lr.predict(X_test))
print metrics.classification_report(y_test, lr.predict(X_test))
print metrics.roc_auc_score(y_test, lr.predict(X_test))

0.904761904762
[[16  3]
 [ 1 22]]
             precision    recall  f1-score   support

          0       0.94      0.84      0.89        19
          1       0.88      0.96      0.92        23

avg / total       0.91      0.90      0.90        42

0.899313501144


> # 3. Extra tree classifier: Tree-based feature selection
> - http://scikit-learn.org/stable/modules/ensemble.html

In [10]:
model_extra = ExtraTreesClassifier()
model_extra.fit(X, y)
model_extra.score(X, y)

# display the relative importance of each attribute
print(model_extra.feature_importances_)

[ 0.09142621  0.22133476  0.0450213   0.28711802  0.1359188   0.0730589
  0.14612201]


In [11]:
print "Features sorted by their rank:"
print sorted(zip(map(lambda x: round(x, 4), model_extra.feature_importances_), X.columns))

Features sorted by their rank:
[(0.045, 'Gender'), (0.0731, 'Stress_Management'), (0.0914, 'Age'), (0.1359, 'Cholesterol'), (0.1461, 'Trait_Anxiety'), (0.2213, 'Marital_Status'), (0.2871, 'Weight_Category')]


# Conclusion & Storytelling

- All three classification algorithms (decision tree, logistic regression, knn) work well for this dataset
- Certain predictors are found important in predicting who is likely to experience 2nd heart attack
> Weight_Category, Cholesterol, Martial_Status, Trait_Anxiety, ...

# Appendix : iris dataset

<img src="images/iris.png">
<img src="images/iris_3.gif">

In [None]:
iris = pd.read_csv('data/iris.csv')
iris.head(2)

In [None]:
iris.groupby('Name').count()

In [None]:
#setting X & y



In [None]:
# Build a decision model 



In [None]:
# Find out the performance of this model & interpret the results
# just get accuracy_score and confusion_matrix



In [None]:
# Visualize decision tree



In [None]:
# Embed the decision tree here



# Feature Selection: Building a predictive model with fewer predictors

> ## 1. SelectBest

In [None]:
# SelectBest (k =2)



What predictors are found to be important in predicting iris type?
- PetalLength, PetalWidth

In [None]:
# then, declare X again with the two columns (not four). Name it as X_new (we want to use a different variable name not to overwrite X) 



In [None]:
# build decision tree model with two predictors (X_new)




# Find out the performance of this model & interpret the results
# just get accuracy_score and confusion_matrix




> ## 2. Recursive Feature Selection

In [None]:
# build logisticRegression

# selecting 3 highest ranking X variables




In [None]:
# summarize the selection of the attributes



What predictors are found to be important in predicting iris type?
- SepalWidth, PetalLength, PetalWidth

> ## 3. ExtraTreesClassifier

In [None]:
# build ExtraTreesClassifier



# display the relative importance of each attribute

