# Voting, Stacking, and Super Learner

## Import Libraries

In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import f1_score, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import BaggingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

## Load Dataset

In [5]:
df = pd.read_csv('Data/heart_disease_clean.csv')

In [6]:
df.shape

(303, 14)

In [7]:
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,63.0,male,typical angina,145.0,233.0,True,2.0,150.0,no,2.3,downsloping,0.0,6.0,0
1,67.0,male,asymptomatic,160.0,286.0,False,2.0,108.0,yes,1.5,flat,3.0,3.0,1
2,67.0,male,asymptomatic,120.0,229.0,False,2.0,129.0,yes,2.6,flat,2.0,7.0,1
3,37.0,male,non-anginal pain,130.0,250.0,False,0.0,187.0,no,3.5,downsloping,0.0,3.0,0
4,41.0,female,atypical angina,130.0,204.0,False,2.0,172.0,no,1.4,upsloping,0.0,3.0,0


In [8]:
numeric_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
nominal_features = ['sex', 'cp', 'exang', 'slope', 'thal', 'fbs', 'restecg', 'ca']

## Split Data Set

In [19]:
dummies = pd.get_dummies(df[nominal_features],drop_first=True)
df = df.drop(nominal_features,axis=1)
df = pd.concat([df,dummies],axis=1)

In [20]:
X = df.drop("num", axis=1)
y = df["num"]

In [21]:
y = LabelEncoder().fit_transform(y)

In [22]:
print(X.shape, y.shape)

(303, 16) (303,)


In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, stratify=y, random_state=42)

In [24]:
print (X_train.shape, y_train.shape)

(212, 16) (212,)


## Voting Ensemble

In [25]:
from sklearn.ensemble import VotingClassifier

In [26]:
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

In [27]:
def create_voting_model():
    # create the sub models
    estimators = []
    model1 = LogisticRegression(solver='liblinear')
    estimators.append(('logistic', model1))
    model2 = LinearDiscriminantAnalysis()
    estimators.append(('lda', model2))
    model3 = DecisionTreeClassifier()
    estimators.append(('cart', model3))
    model4 = SVC(gamma='auto')
    estimators.append(('svm', model4))
    # create the ensemble model
    ensemble = VotingClassifier(estimators=estimators, voting='hard')
    return ensemble

In [28]:
ensemble = create_voting_model()
results = cross_val_score(ensemble, X, y, cv=cv)

In [29]:
print(results.mean())

0.787741935483871


## Stacking Ensemble

In [30]:
from sklearn.ensemble import StackingClassifier

In [31]:
def get_stacking():
    # define the base models
    level0 = list()
    
    level0.append(('lr', LogisticRegression(solver='liblinear')))
    
    level0.append(('LDA', LinearDiscriminantAnalysis()))
    
    level0.append(('knn', KNeighborsClassifier()))
    
    level0.append(('cart', DecisionTreeClassifier()))
   
    level0.append(('svm', SVC()))
    
    level0.append(('bayes', GaussianNB()))
    # define meta learner model
    level1 = LogisticRegression(solver='liblinear')
    # define the stacking ensemble
    model = StackingClassifier(estimators=level0, final_estimator=level1, cv=5)
    return model

In [32]:
ensemble = get_stacking()
results = cross_val_score(ensemble, X, y, cv=cv)

In [33]:
print(results.mean())

0.8393548387096775


## Super Learner

In [34]:
from mlens.ensemble import SuperLearner

In [35]:
scaler = MinMaxScaler()

In [36]:
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [37]:
models = []
models.append(KNeighborsClassifier(n_neighbors=15, weights='distance', p=1))
models.append(GaussianNB())
models.append(SVC(C=1000))
models.append(DecisionTreeClassifier(criterion='entropy', min_samples_split=300, max_leaf_nodes=200))
models.append(RandomForestClassifier(n_estimators=300, min_samples_split=400, n_jobs=100))
models.append(AdaBoostClassifier(n_estimators=400))
models.append(GradientBoostingClassifier(learning_rate=0.9, min_samples_split=400))
models.append(LinearDiscriminantAnalysis())
#models.append(MLPClassifier())
models.append(LogisticRegression(solver='liblinear', penalty='l1'))

In [38]:
ensemble = SuperLearner(scorer=accuracy_score, folds=10, shuffle=True, sample_size=len(X_train), random_state=0)
ensemble.add(models)
ensemble.add_meta(AdaBoostClassifier(n_estimators=400))

SuperLearner(array_check=None, backend=None, folds=10,
       layers=[Layer(backend='threading', dtype=<class 'numpy.float32'>, n_jobs=-1,
   name='layer-1', propagate_features=None, raise_on_exception=True,
   random_state=0, shuffle=True,
   stack=[Group(backend='threading', dtype=<class 'numpy.float32'>,
   indexer=FoldIndex(X=None, folds=10, raise_on_excep...9771AF0>)],
   n_jobs=-1, name='group-1', raise_on_exception=True, transformers=[])],
   verbose=0)],
       model_selection=False, n_jobs=None, raise_on_exception=True,
       random_state=0, sample_size=212,
       scorer=<function accuracy_score at 0x0000027659771AF0>,
       shuffle=True, verbose=False)

In [39]:
ensemble.fit(X_train, y_train)
ensemble_Y_pred = ensemble.predict(X_test)

print('Accuracy using Ensemble Learning : {:.2f}'.format(accuracy_score(y_test, ensemble_Y_pred) * 100), '%')

Accuracy using Ensemble Learning : 80.22 %


In [50]:
models = list()
models.append(LogisticRegression(solver='liblinear'))
models.append(DecisionTreeClassifier())
models.append(SVC(gamma='scale', probability=True))
models.append(GaussianNB())
models.append(KNeighborsClassifier())
models.append(BaggingClassifier(n_estimators=1000))
models.append(RandomForestClassifier(n_estimators=1000))
models.append(ExtraTreesClassifier(n_estimators=1000))
models.append(AdaBoostClassifier(n_estimators=1000))
models.append(GradientBoostingClassifier(n_estimators=1000))
#models.append(XGBClassifier())

In [51]:
ensemble = SuperLearner(scorer=accuracy_score, folds=10, shuffle=True, sample_size=len(X_train))
ensemble.add(models)
ensemble.add_meta(LogisticRegression(solver='lbfgs'))

SuperLearner(array_check=None, backend=None, folds=10,
       layers=[Layer(backend='threading', dtype=<class 'numpy.float32'>, n_jobs=-1,
   name='layer-1', propagate_features=None, raise_on_exception=True,
   random_state=None, shuffle=True,
   stack=[Group(backend='threading', dtype=<class 'numpy.float32'>,
   indexer=FoldIndex(X=None, folds=10, raise_on_ex...9771AF0>)],
   n_jobs=-1, name='group-7', raise_on_exception=True, transformers=[])],
   verbose=0)],
       model_selection=False, n_jobs=None, raise_on_exception=True,
       random_state=None, sample_size=212,
       scorer=<function accuracy_score at 0x0000027659771AF0>,
       shuffle=True, verbose=False)

In [52]:
ensemble.fit(X_train, y_train)

SuperLearner(array_check=None, backend=None, folds=10,
       layers=[Layer(backend='threading', dtype=<class 'numpy.float32'>, n_jobs=-1,
   name='layer-1', propagate_features=None, raise_on_exception=True,
   random_state=None, shuffle=True,
   stack=[Group(backend='threading', dtype=<class 'numpy.float32'>,
   indexer=FoldIndex(X=None, folds=10, raise_on_ex...9771AF0>)],
   n_jobs=-1, name='group-7', raise_on_exception=True, transformers=[])],
   verbose=0)],
       model_selection=False, n_jobs=None, raise_on_exception=True,
       random_state=None, sample_size=212,
       scorer=<function accuracy_score at 0x0000027659771AF0>,
       shuffle=True, verbose=False)

In [53]:
ensemble_Y_pred = ensemble.predict(X_test)

print('Accuracy using Ensemble Learning : {:.2f}'.format(accuracy_score(y_test, ensemble_Y_pred) * 100), '%')

Accuracy using Ensemble Learning : 84.62 %
