In [25]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import VotingClassifier
import sys
import warnings

# Silence every warning for the rest of the session, unless the interpreter
# was started with explicit -W options (sys.warnoptions is then non-empty and
# the user's own warning configuration is left alone).
# NOTE(review): this also hides library deprecation/convergence warnings that
# may be relevant to the models fitted below — consider narrowing the filter.
if not sys.warnoptions:
    warnings.simplefilter("ignore")

In [26]:
# Load the combined dataset; the path is relative to the notebook's working
# directory. The preview shows an "Unnamed: 0" first column followed by
# subject_id, the feature columns, and `death` as the last column.
# NOTE(review): "Unnamed: 0" looks like a saved row index — the functions
# below skip column 0 by position, so keep the column order stable.
df = pd.read_csv('all_data_combined.csv')
df.head()

Unnamed: 0,subject_id,age,urea_n_min,urea_n_max,urea_n_mean,resprate_min,resprate_max,resprate_mean,glucose_min,glucose_max,...,oasis,lods,gender_F,gender_M,marital_status_DIVORCED,marital_status_MARRIED,marital_status_SEPARATED,marital_status_SINGLE,marital_status_WIDOWED,death
0,15057,58.831224,16.0,37.0,26.111111,10.0,52,20.104478,67.0,405.0,...,36.5,5.5,0,1,0,1,0,0,0,1
1,79262,81.618606,10.0,15.0,13.5,11.0,31,18.704225,119.0,348.0,...,48.0,4.0,0,1,0,1,0,0,0,1
2,77191,79.102744,16.0,56.0,35.5,20.0,31,25.744681,90.0,188.0,...,35.0,6.0,1,0,1,0,0,0,0,0
3,84966,88.232043,8.0,27.0,17.692308,10.0,51,17.770833,92.0,271.0,...,40.5,4.0,1,0,0,0,0,1,0,1
4,94997,90.0,38.0,48.0,42.25,4.0,40,26.36,62.0,135.0,...,35.0,5.0,1,0,0,0,0,1,0,0


In [27]:
def votingBased(df, voting='soft', n_splits=5, random_state=None):
    """Cross-validate a voting ensemble of five default-configured classifiers.

    The ensemble combines logistic regression, a decision tree, Gaussian
    naive Bayes, a random forest and AdaBoost in a ``VotingClassifier``.
    A fresh ensemble is fitted on every fold.

    Column layout expected in ``df`` (selected by position):
    column 0 is skipped, the last column is the target, and every column in
    between is used as a feature.
    NOTE(review): with the frame displayed in this notebook, column 1 is
    ``subject_id``, so the identifier is fed to the models as a feature —
    confirm this is intended.

    Parameters
    ----------
    df : pandas.DataFrame
        Data set laid out as described above.
    voting : {'soft', 'hard'}, default 'soft'
        Voting rule forwarded to ``VotingClassifier``.
    n_splits : int, default 5
        Number of consecutive (unshuffled) cross-validation folds.
    random_state : int or None, default None
        Seed forwarded to the stochastic estimators (decision tree, random
        forest, AdaBoost). ``None`` keeps them unseeded, as before.

    Returns
    -------
    list of float
        The ensemble's ``score`` on the held-out part of each fold, in fold
        order.
    """
    # Folds are deliberately unshuffled. Passing random_state without
    # shuffle=True never affected the splits, and scikit-learn >= 0.24
    # rejects that combination with a ValueError, so it is not passed.
    kf = KFold(n_splits=n_splits)

    # Slice features/target once instead of re-slicing inside every fold.
    features = df.iloc[:, 1:-1]
    target = df.iloc[:, -1]

    result = []

    for train_idx, test_idx in kf.split(features):
        estimators = [
            ('logistic', LogisticRegression()),
            ('decisionTree', DecisionTreeClassifier(random_state=random_state)),
            ('nbayes', GaussianNB()),
            ('rforrest', RandomForestClassifier(random_state=random_state)),
            ('adaboost', AdaBoostClassifier(random_state=random_state)),
        ]

        # create the ensemble model
        mix_model = VotingClassifier(estimators, voting=voting)
        mix_model.fit(features.iloc[train_idx], target.iloc[train_idx])
        result.append(
            mix_model.score(features.iloc[test_idx], target.iloc[test_idx])
        )

    return result

In [22]:
def votingBasedPara(df, voting='soft', n_splits=5, random_state=None):
    """Cross-validate a voting ensemble built from hand-tuned classifiers.

    Same ensemble members as the default-configured variant, but with
    explicit hyper-parameters: L1-penalised logistic regression, an
    entropy-based decision tree (depth 4, at least 32 samples per leaf),
    Gaussian naive Bayes, a default random forest, and AdaBoost over
    entropy-based depth-4 trees. A fresh ensemble is fitted on every fold.

    Column layout expected in ``df`` (selected by position):
    column 0 is skipped, the last column is the target, and every column in
    between is used as a feature.
    NOTE(review): with the frame displayed in this notebook, column 1 is
    ``subject_id``, so the identifier is fed to the models as a feature —
    confirm this is intended.

    Parameters
    ----------
    df : pandas.DataFrame
        Data set laid out as described above.
    voting : {'soft', 'hard'}, default 'soft'
        Voting rule forwarded to ``VotingClassifier``.
    n_splits : int, default 5
        Number of consecutive (unshuffled) cross-validation folds.
    random_state : int or None, default None
        Seed forwarded to the stochastic estimators. ``None`` keeps them
        unseeded, as before.

    Returns
    -------
    list of float
        The ensemble's ``score`` on the held-out part of each fold, in fold
        order.
    """
    # Folds are deliberately unshuffled. Passing random_state without
    # shuffle=True never affected the splits, and scikit-learn >= 0.24
    # rejects that combination with a ValueError, so it is not passed.
    kf = KFold(n_splits=n_splits)

    # Slice features/target once instead of re-slicing inside every fold.
    features = df.iloc[:, 1:-1]
    target = df.iloc[:, -1]

    result = []

    for train_idx, test_idx in kf.split(features):
        estimators = [
            # The L1 penalty is not supported by the default 'lbfgs' solver
            # (default since scikit-learn 0.22); 'liblinear' was the old
            # default and is pinned so this works on any version.
            ('logistic', LogisticRegression(penalty='l1', solver='liblinear',
                                            random_state=random_state)),
            ('decisionTree', DecisionTreeClassifier(criterion='entropy',
                                                    max_depth=4,
                                                    min_samples_leaf=32,
                                                    random_state=random_state)),
            ('nbayes', GaussianNB()),
            ('rforrest', RandomForestClassifier(random_state=random_state)),
            ('adaboost', AdaBoostClassifier(
                DecisionTreeClassifier(criterion='entropy', max_depth=4),
                random_state=random_state)),
        ]

        # create the ensemble model
        mix_model = VotingClassifier(estimators, voting=voting)
        mix_model.fit(features.iloc[train_idx], target.iloc[train_idx])
        result.append(
            mix_model.score(features.iloc[test_idx], target.iloc[test_idx])
        )

    return result

In [28]:
# voting = soft
# Per-fold scores of the default-configured soft-voting ensemble; the cell
# displays their unweighted mean across the folds.
res = votingBased(df)
np.average(res)

0.7885792909944425

In [23]:
# voting = hard
# NOTE(review): votingBasedClassifier is not defined in any cell visible in
# this notebook, and this cell's execution count predates the cells above —
# the result below likely comes from a stale kernel definition. Restore or
# replace the function so the notebook survives Restart & Run All.
res = votingBasedClassifier(df)
np.average(res)

0.8118113635137323