In [2]:
import pandas as pd
import sklearn.metrics as metrics
from sklearn import model_selection
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

In [3]:
# utility functions
# get cross validation score using KFold
def k_fold_eval(model, X, Y):
    kfold = KFold(n_splits=10, random_state=seed) 
    score = cross_val_score(model, X, Y, cv=kfold, scoring='accuracy')
    
    return score

In [4]:
data_frame = pd.read_csv('../data/pima-indians-diabetes.csv')
print(data_frame.columns)

target_column = 'Outcome'
feature_names = data_frame.columns.drop(target_column)
# split by field name
X = data_frame[feature_names]
Y = data_frame[target_column].values

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')


In [5]:
scoring = 'accuracy'
seed = 7

models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('GNB', GaussianNB()))
models.append(('SVM', SVC()))
models.append(('RFC', RandomForestClassifier()))
models.append(('GBC', GradientBoostingClassifier()))

In [6]:
# evaluate using test/train split
validation_size = 0.20
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X, Y, stratify=data_frame.Outcome, test_size=validation_size, random_state=seed)

names = []
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []
for name, model in models:
    model.fit(X_train, Y_train)
    Y_pred = model.predict(X_test)
    accuracy_scores.append(metrics.accuracy_score(Y_test, Y_pred))
    precision_scores.append(metrics.precision_score(Y_test, Y_pred))
    recall_scores.append(metrics.recall_score(Y_test, Y_pred))
    f1_scores.append(metrics.f1_score(Y_test, Y_pred))    
    names.append(name)
    print(name)    
    print(metrics.confusion_matrix(Y_test, Y_pred))

tr_split = pd.DataFrame({'Name': names})
tr_split['Accuracy Score'] = accuracy_scores
tr_split['Precision Score'] = precision_scores
tr_split['Recall Score'] = recall_scores
tr_split['F1 Score'] = f1_scores
print(tr_split)

LR
[[88 12]
 [27 27]]
LDA
[[88 12]
 [21 33]]
KNN
[[85 15]
 [26 28]]
CART
[[81 19]
 [26 28]]
GNB
[[87 13]
 [24 30]]
SVM
[[100   0]
 [ 54   0]]
RFC
[[91  9]
 [23 31]]
GBC
[[86 14]
 [23 31]]
   Name  Accuracy Score  Precision Score  Recall Score  F1 Score
0    LR        0.746753         0.692308      0.500000  0.580645
1   LDA        0.785714         0.733333      0.611111  0.666667
2   KNN        0.733766         0.651163      0.518519  0.577320
3  CART        0.707792         0.595745      0.518519  0.554455
4   GNB        0.759740         0.697674      0.555556  0.618557
5   SVM        0.649351         0.000000      0.000000  0.000000
6   RFC        0.792208         0.775000      0.574074  0.659574
7   GBC        0.759740         0.688889      0.574074  0.626263


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [7]:
# evaluate using KFold
names = []
scores = []
for name, model in models:
    
    score = k_fold_eval(model, X, Y).mean()
    
    names.append(name)
    scores.append(score)

kf_cross_val = pd.DataFrame({'Name': names, 'Score': scores})
print(kf_cross_val)

   Name     Score
0    LR  0.769515
1   LDA  0.773462
2   KNN  0.726555
3  CART  0.696531
4   GNB  0.755178
5   SVM  0.651025
6   RFC  0.738243
7   GBC  0.768199


In [8]:
# Use Recursive Feature Elimination to establish which features contribute the most 
from sklearn.feature_selection import RFECV

print(feature_names)

new_scores = []
for name, model in models:
    msg = "Name %s " % (name)    
    print(msg)
    
    kfold = KFold(n_splits=10, random_state=seed) 
    rfecv = RFECV(estimator=model, step=1, cv=kfold, scoring='accuracy')
    try:
        rfecv.fit(X, Y)
        feature_importance = list(zip(feature_names, rfecv.support_))
        new_features = []
        for key,value in enumerate(feature_importance):
            if(value[1]) == True:
                new_features.append(value[0])
            
        print(new_features)    
        
        # get new X with only selected features
        new_X = data_frame[new_features]
        
        #evaluate using Kfold
        new_score = k_fold_eval(model, new_X, Y).mean()
        new_scores.append(new_score)
        
    except RuntimeError as e:
        print("Unable to apply RFECV")
        new_scores.append(None)
        
kf_cross_val['New Score'] = new_scores        
print(kf_cross_val)       

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age'],
      dtype='object')
Name LR 
['Pregnancies', 'Glucose', 'BMI', 'DiabetesPedigreeFunction']
Name LDA 
['Pregnancies', 'Glucose', 'BloodPressure', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
Name KNN 
Unable to apply RFECV
Name CART 
['Glucose', 'BMI', 'DiabetesPedigreeFunction', 'Age']
Name GNB 
Unable to apply RFECV
Name SVM 
Unable to apply RFECV
Name RFC 
['Pregnancies', 'Glucose', 'BloodPressure', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
Name GBC 
['Glucose', 'BMI', 'DiabetesPedigreeFunction', 'Age']
   Name     Score  New Score
0    LR  0.769515   0.769532
1   LDA  0.773462   0.777358
2   KNN  0.726555        NaN
3  CART  0.696531   0.696497
4   GNB  0.755178        NaN
5   SVM  0.651025        NaN
6   RFC  0.738243   0.740926
7   GBC  0.768199   0.764337


In [9]:
# get max model no feature elimination

best_model_no_rfe = kf_cross_val.iloc[kf_cross_val['Score'].idxmax()]
print(best_model_no_rfe)

# get max model with feature elimination

best_model_w_rfe = kf_cross_val.iloc[kf_cross_val['New Score'].idxmax()]
print(best_model_w_rfe)

Name              LDA
Score        0.773462
New Score    0.777358
Name: 1, dtype: object
Name              LDA
Score        0.773462
New Score    0.777358
Name: 1, dtype: object
