# Ensemble Based Methods

## 1) Bagging Classifier

#### import libraries

In [3]:
import pandas as pd
from sklearn import model_selection
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
# Location of the Pima Indians diabetes CSV (the file has no header row).
url = (
    "https://raw.githubusercontent.com/jbrownlee/Datasets/master/"
    "pima-indians-diabetes.csv"
)

#### assign column names to the data

In [6]:
# Labels for the nine CSV columns; the last one ('class') is the target.
names = "preg plas pres skin test mass pedi age class".split()
# Passing `names` makes pandas treat every row of the file as data.
dataframe = pd.read_csv(url, names=names)

In [10]:
# Preview the first five rows to confirm the column names line up.
dataframe.head(n=5)

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


#### Read the data as a NumPy array

In [8]:
# Convert the DataFrame into a plain NumPy array and display it.
array = dataframe.to_numpy()
array

array([[  6.   , 148.   ,  72.   , ...,   0.627,  50.   ,   1.   ],
       [  1.   ,  85.   ,  66.   , ...,   0.351,  31.   ,   0.   ],
       [  8.   , 183.   ,  64.   , ...,   0.672,  32.   ,   1.   ],
       ...,
       [  5.   , 121.   ,  72.   , ...,   0.245,  30.   ,   0.   ],
       [  1.   , 126.   ,  60.   , ...,   0.349,  47.   ,   1.   ],
       [  1.   ,  93.   ,  70.   , ...,   0.315,  23.   ,   0.   ]])

- Here, the first 8 columns are the features and the last column is the target.

In [11]:
# Split the array column-wise: columns 0-7 are features, column 8 is the target.
X = array[:, :8]
Y = array[:, 8]

In [13]:
# Feature matrix: the first eight columns of `array`.
X

array([[  6.   , 148.   ,  72.   , ...,  33.6  ,   0.627,  50.   ],
       [  1.   ,  85.   ,  66.   , ...,  26.6  ,   0.351,  31.   ],
       [  8.   , 183.   ,  64.   , ...,  23.3  ,   0.672,  32.   ],
       ...,
       [  5.   , 121.   ,  72.   , ...,  26.2  ,   0.245,  30.   ],
       [  1.   , 126.   ,  60.   , ...,  30.1  ,   0.349,  47.   ],
       [  1.   ,  93.   ,  70.   , ...,  30.4  ,   0.315,  23.   ]])

In [14]:
# Target vector: column 8 of `array` (the 'class' column).
Y

array([1., 0., 1., 0., 1., 0., 1., 0., 1., 1., 0., 1., 0., 1., 1., 1., 1.,
       1., 0., 1., 0., 0., 1., 1., 1., 1., 1., 0., 0., 0., 0., 1., 0., 0.,
       0., 0., 0., 1., 1., 1., 0., 0., 0., 1., 0., 1., 0., 0., 1., 0., 0.,
       0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1., 0.,
       0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1.,
       0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 1., 0.,
       0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 0., 0., 1., 1., 1., 0., 0.,
       0., 1., 0., 0., 0., 1., 1., 0., 0., 1., 1., 1., 1., 1., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
       0., 1., 1., 0., 0., 0., 1., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0.,
       1., 1., 0., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0., 0., 0., 1., 1.,
       1., 1., 1., 0., 0., 1., 1., 0., 1., 0., 1., 1., 1., 0., 0., 0., 0.,
       0., 0., 1., 1., 0., 1., 0., 0., 0., 1., 1., 1., 1., 0., 1., 1., 1.,
       1., 0., 0., 0., 0.

#### K Fold cross validation

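- As we know, in ensemble-based learning we take subsets of the data: bagging trains each base model on a bootstrap sample, and k-fold cross-validation is used here to evaluate the ensemble.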

In [17]:
# Evaluate a bagged ensemble of decision trees with 10-fold cross-validation.
# KFold is left unshuffled: `random_state` only has an effect together with
# shuffle=True, and recent scikit-learn releases raise a ValueError when it is
# passed without shuffling.
kfold = model_selection.KFold(n_splits=10)
cart = DecisionTreeClassifier()
num_trees = 100
# The base estimator is passed positionally because its keyword was renamed
# from `base_estimator` to `estimator` in newer scikit-learn releases; the
# positional form works with both.
model = BaggingClassifier(cart, n_estimators=num_trees, random_state=7)
result = model_selection.cross_val_score(model, X, Y, cv=kfold)

In [18]:
# Show the score obtained on each of the 10 cross-validation folds.
print(result)

[0.67532468 0.81818182 0.75324675 0.63636364 0.81818182 0.81818182
 0.85714286 0.85714286 0.69736842 0.77631579]


In [19]:
# Summarise the per-fold scores as a single average.
print("mean:", result.mean())

mean: 0.770745044429255


#### Voting Ensemble for classification

In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

In [None]:
# Voting ensemble of three different classifiers, scored with 10-fold CV.
# KFold is left unshuffled: `random_state` is only valid together with
# shuffle=True, and newer scikit-learn versions raise a ValueError otherwise.
kfold = model_selection.KFold(n_splits=10)

# Create the sub-models.
model1 = LogisticRegression()
# Seed the tree so repeated runs give the same scores; an unseeded tree may
# choose differently between equally good splits on each run.
model2 = DecisionTreeClassifier(random_state=7)
model3 = SVC()
estimators = [('logistic', model1), ('cart', model2), ('svm', model3)]

# Create the ensemble model and cross-validate it.
ensemble = VotingClassifier(estimators)
results = model_selection.cross_val_score(ensemble, X, Y, cv=kfold)

In [23]:
# Show the voting ensemble's score on each of the 10 folds.
print(results)

[0.62337662 0.79220779 0.72727273 0.63636364 0.74025974 0.71428571
 0.81818182 0.80519481 0.72368421 0.72368421]


In [24]:
# Average score of the voting ensemble across the folds.
print(results.mean())

0.7304511278195489


## 2) Boosting (AdaBoost) Classifier

In [25]:
import pandas as pd
from sklearn import model_selection
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
# Same Pima Indians diabetes CSV as used in the bagging section.
url = (
    "https://raw.githubusercontent.com/jbrownlee/Datasets/master/"
    "pima-indians-diabetes.csv"
)

In [26]:
# Labels for the nine CSV columns; the last one ('class') is the target.
names = "preg plas pres skin test mass pedi age class".split()
# Passing `names` makes pandas treat every row of the file as data.
dataframe = pd.read_csv(url, names=names)

In [27]:
# Rebuild the feature matrix (columns 0-7) and target vector (column 8).
array = dataframe.to_numpy()
X = array[:, :8]
Y = array[:, 8]

In [28]:
# Evaluate AdaBoost over decision trees with 10-fold cross-validation.
# KFold is left unshuffled: `random_state` only has an effect together with
# shuffle=True, and recent scikit-learn releases raise a ValueError when it is
# passed without shuffling.
kfold = model_selection.KFold(n_splits=10)
cart = DecisionTreeClassifier()
num_trees = 100
# The base estimator is passed positionally because its keyword was renamed
# from `base_estimator` to `estimator` in newer scikit-learn releases; the
# positional form works with both.
model = AdaBoostClassifier(cart, n_estimators=num_trees, random_state=7)
result = model_selection.cross_val_score(model, X, Y, cv=kfold)
print("mean:", result.mean())

mean: 0.6900034176349965


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier


# Voting ensemble of three different classifiers, scored with 10-fold CV.
# KFold is left unshuffled: `random_state` is only valid together with
# shuffle=True, and newer scikit-learn versions raise a ValueError otherwise.
kfold = model_selection.KFold(n_splits=10)

# Create the sub-models.
model1 = LogisticRegression()
# Seed the tree so repeated runs give the same scores; an unseeded tree may
# choose differently between equally good splits on each run.
model2 = DecisionTreeClassifier(random_state=7)
model3 = SVC()
estimators = [('logistic', model1), ('cart', model2), ('svm', model3)]

# Create the ensemble model and cross-validate it.
ensemble = VotingClassifier(estimators)
results = model_selection.cross_val_score(ensemble, X, Y, cv=kfold)

In [30]:
# Average score of the voting ensemble across the folds.
print(results.mean())

0.7303656869446343
