# Ensemble Learning: Bagging

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm_notebook as tqdm

## Data

In [2]:
breast_cancer = pd.read_csv('../Data/breast_cancer.csv')
breast_cancer.drop(['id'], axis=1, inplace=True)
breast_cancer_dummies = pd.get_dummies(breast_cancer, columns=['diagnosis'], drop_first=True)
breast_cancer_dummies.rename(columns={'diagnosis_M': 'Malignant'}, inplace=True)
breast_cancer_dummies.head()

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Malignant
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,1
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,1
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,1
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,1
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,1


In [3]:
inputs = breast_cancer_dummies.drop(['Malignant'],axis=1)
target = breast_cancer_dummies.Malignant
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
X_train, X_test, y_train, y_test = train_test_split(inputs, target, test_size=0.2)

In [4]:
# Scale Data
X_train_scaled, X_test_scaled, y_train_scaled, y_test_scaled = train_test_split(StandardScaler().fit_transform(inputs), target, test_size=0.2)
X_train_scaled

array([[ 0.89825404,  0.66096886,  0.92393601, ...,  0.58156126,
         0.26894679, -0.50460631],
       [ 0.14561642, -0.56772258,  0.09230586, ..., -0.55312478,
        -0.72113495, -0.99669876],
       [-0.06455408, -0.01155354, -0.13341643, ..., -0.50409513,
        -0.88129523, -0.43866148],
       ...,
       [-0.05319351, -1.42408329, -0.06833592, ..., -0.19956318,
        -1.19838023, -0.63261684],
       [ 0.03485089,  0.56555911,  0.06841555, ...,  0.45822582,
         1.11666383,  0.96723781],
       [-0.81151142, -1.4729517 , -0.77474785, ...,  0.26180272,
         0.71545423,  0.4546415 ]])

## Bagging Classifier
Bootstrap Aggregation

In [13]:
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
bagModel = BaggingClassifier(
    base_estimator=RandomForestClassifier(),
    n_estimators=400,
    max_samples=0.8,
    oob_score=True, # Out of Bag Score,
    verbose=3,
)
bagModel.fit(X_train_scaled, y_train_scaled)
bagModel.oob_score_

Building estimator 165 of 1000 for this parallel run (total 1000)...
Building estimator 166 of 1000 for this parallel run (total 1000)...
Building estimator 167 of 1000 for this parallel run (total 1000)...
Building estimator 168 of 1000 for this parallel run (total 1000)...
Building estimator 169 of 1000 for this parallel run (total 1000)...
Building estimator 170 of 1000 for this parallel run (total 1000)...
Building estimator 171 of 1000 for this parallel run (total 1000)...
Building estimator 172 of 1000 for this parallel run (total 1000)...
Building estimator 173 of 1000 for this parallel run (total 1000)...
Building estimator 174 of 1000 for this parallel run (total 1000)...
Building estimator 175 of 1000 for this parallel run (total 1000)...
Building estimator 176 of 1000 for this parallel run (total 1000)...
Building estimator 177 of 1000 for this parallel run (total 1000)...
Building estimator 178 of 1000 for this parallel run (total 1000)...
Building estimator 179 of 1000 for

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.8min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.8min finished


0.9604395604395605

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Building estimator 1 of 400 for this parallel run (total 400)...
Building estimator 2 of 400 for this parallel run (total 400)...
Building estimator 3 of 400 for this parallel run (total 400)...
Building estimator 4 of 400 for this parallel run (total 400)...
Building estimator 5 of 400 for this parallel run (total 400)...
Building estimator 6 of 400 for this parallel run (total 400)...
Building estimator 7 of 400 for this parallel run (total 400)...
Building estimator 8 of 400 for this parallel run (total 400)...
Building estimator 9 of 400 for this parallel run (total 400)...
Building estimator 10 of 400 for this parallel run (total 400)...
Building estimator 11 of 400 for this parallel run (total 400)...
Building estimator 12 of 400 for this parallel run (total 400)...
Building estimator 13 of 400 for this parallel run (total 400)...
Building estimator 14 of 400 for this parallel run (total 400)...
Building estimator 15 of 400 for this parallel run (total 400)...
Building estimator 

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   41.6s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   41.6s finished


0.9604395604395605