In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
from sklearn.utils import resample
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn import model_selection
from sklearn import metrics
from sklearn.decomposition import PCA
from scipy.stats import zscore
import matplotlib.pyplot as plt 

In [2]:
colnames = ['preg', 'glu', 'bp', 'sft', 'ins', 'bmi', 'dpf', 'age', 'outcome']
prima_df = pd.read_csv("pima-indians-diabetes.data",names=colnames)

In [3]:
X=prima_df[['preg', 'glu', 'bp', 'sft', 'ins', 'bmi', 'dpf', 'age']]
Y=prima_df['outcome']

In [4]:
sc=StandardScaler()
X=sc.fit_transform(X)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [5]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
base_knn=KNeighborsClassifier(n_neighbors=7,weights='distance')
base_nb=GaussianNB()
base_LR=LogisticRegression(random_state=2)
base_rf=RandomForestClassifier(n_estimators=101,random_state=2)
gb_model=GradientBoostingClassifier(n_estimators=50,random_state=2)

In [6]:
bag_knn=BaggingClassifier(base_estimator=base_knn,n_estimators=17,random_state=2)

In [7]:
bag_LR=BaggingClassifier(base_estimator=base_LR,n_estimators=15,random_state=2)
boost_LR = AdaBoostClassifier(base_estimator=base_LR,n_estimators=50,random_state=2)

In [8]:
bag_nb=BaggingClassifier(base_estimator=base_nb,n_estimators=15,random_state=2)
boost_nb = AdaBoostClassifier(base_estimator=base_nb,n_estimators=51,random_state=2)

In [9]:
boost_rf=AdaBoostClassifier(base_estimator=base_rf,n_estimators=50,random_state=2)

In [10]:
bag_dt=BaggingClassifier(n_estimators=15,random_state=2)
boost_dt = AdaBoostClassifier(n_estimators=50,random_state=2)

In [11]:
stacked = VotingClassifier(estimators = [('Boosted_LR',boost_LR),('RF', base_rf), ('Boosted_DT', boost_dt)],voting='soft')

In [12]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [13]:
from sklearn.model_selection import KFold
kf=KFold(n_splits=5,shuffle=True,random_state=2)
for model, name in zip([boost_LR,base_rf,boost_dt,gb_model,stacked], ['BoostLR','RF','BoostedDT','GradientBoost','stacked']):
    k=0
    recall=np.zeros((2,5))
    prec=np.zeros((2,5))
    fscore=np.zeros((2,5))
    for train,test in kf.split(X,Y):
        Xtrain,Xtest=X[train,:],X[test,:]
        Ytrain,Ytest=Y[train],Y[test]
        model.fit(Xtrain,Ytrain)
        Y_predict=model.predict(Xtest)
        cm=metrics.confusion_matrix(Ytest,Y_predict)
        for i in np.arange(0,2):
            recall[i,k]=cm[i,i]/cm[i,:].sum()
        for i in np.arange(0,2):
            prec[i,k]=cm[i,i]/cm[:,i].sum()
        k=k+1
    for row in np.arange(0,2):
        for col in np.arange(0,5):
            fscore[row,col]=2*(recall[row,col]*prec[row,col])/(recall[row,col]+prec[row,col])
    print("f1_weighted for Healthy: %0.02f (+/- %0.5f) [%s]" % (np.mean(fscore[0,:]), np.var(fscore[0,:],ddof=1), name ))   
    print("f1_weighted for Diabetic: %0.02f (+/- %0.5f) [%s]" % (np.mean(fscore[1,:]), np.var(fscore[1,:],ddof=1), name ))   
    

f1_weighted for Healthy: 0.83 (+/- 0.00048) [BoostLR]
f1_weighted for Diabetic: 0.62 (+/- 0.00222) [BoostLR]
f1_weighted for Healthy: 0.81 (+/- 0.00029) [RF]
f1_weighted for Diabetic: 0.62 (+/- 0.00171) [RF]
f1_weighted for Healthy: 0.81 (+/- 0.00028) [BoostedDT]
f1_weighted for Diabetic: 0.60 (+/- 0.00683) [BoostedDT]
f1_weighted for Healthy: 0.82 (+/- 0.00037) [GradientBoost]
f1_weighted for Diabetic: 0.62 (+/- 0.00248) [GradientBoost]
f1_weighted for Healthy: 0.81 (+/- 0.00020) [stacked]
f1_weighted for Diabetic: 0.62 (+/- 0.00166) [stacked]


In [14]:
from sklearn.model_selection import KFold
from sklearn.metrics import roc_curve, auc
kf=KFold(n_splits=5,shuffle=True,random_state=2)
for model, name in zip([boost_LR,base_rf,boost_dt,gb_model,stacked], ['BoostLR','RF','BoostedDT','GradientBoost','stacked']):
    roc_auc=[]
    for train,test in kf.split(X,Y):
        Xtrain,Xtest=X[train,:],X[test,:]
        Ytrain,Ytest=Y[train],Y[test]
        model.fit(Xtrain,Ytrain)
        Y_predict=model.predict(Xtest)
        cm=metrics.confusion_matrix(Ytest,Y_predict)
        fpr,tpr, _ = roc_curve(Ytest,Y_predict)
        roc_auc.append(auc(fpr, tpr))
    print("AUC scores: %0.02f (+/- %0.5f) [%s]" % (np.mean(roc_auc), np.var(roc_auc,ddof=1), name ))   
    

AUC scores: 0.72 (+/- 0.00086) [BoostLR]
AUC scores: 0.71 (+/- 0.00047) [RF]
AUC scores: 0.70 (+/- 0.00229) [BoostedDT]
AUC scores: 0.71 (+/- 0.00093) [GradientBoost]
AUC scores: 0.71 (+/- 0.00042) [stacked]


In [15]:
(0.00046-0.00040)/0.00046
(0.72-0.7)/0.72


0.027777777777777804

In [None]:
# Conclusion

In [19]:
# In case of Biased error >> AUC scores: 0.72 (+/- 0.00086) [BoostLR] model gives best result
# In case of varience error >> AUC scores: 0.71 (+/- 0.00042) [stacked] model gives better result

# Best model with overall result will be :
# 1) AUC scores: 0.72 (+/- 0.00086) [BoostLR]
# 2) AUC scores: 0.71 (+/- 0.00042) [stacked]
# 3) AUC scores: 0.71 (+/- 0.00047) [RF]

# BoostedLR has highest avarage value of 0.00086 out of above 3 models, so seleted BoostedLR first.
# Out of Stacked and RF , Stacked has less varience error of 0.00042 so it is selected
# The weighted result difference between BoostedLR and Stacked is (0.72-0.71)/0.72 & (0.00086-0.00042)/0.00086

# Stacked model is better in varience error by 51% by weighted avg value so it is the best model out of above 3.


In [20]:
(0.72-0.71)/0.72

0.013888888888888902

In [21]:
(0.00086-0.00042)/0.00086

0.5116279069767442