# Bagging

In [26]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

In [2]:
# Load the breast cancer dataset and pull out the feature matrix and the labels
bc = datasets.load_breast_cancer()
X, y = bc.data, bc.target

In [14]:
# Tabular view of the dataset: one column per feature, plus the label column
# appended last as "target". Built in a single expression so the cell can be
# re-run without mutating an existing frame.
df = pd.DataFrame(bc.data, columns=bc.feature_names).assign(target=bc.target)

In [16]:
# Dimensions of df (row count, then column count)
n_rows, n_cols = df.shape
print("Dimensões do DataFrame:")
print("Linhas:\t\t{}".format(n_rows))
print("Colunas:\t{}".format(n_cols))

Dimensões do DataFrame:
Linhas:		569
Colunas:	31


In [15]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column of df.
df.describe()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,0.062798,...,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946,0.627417
std,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,0.00706,...,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061,0.483918
min,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,0.04996,...,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504,0.0
25%,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,0.0577,...,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146,0.0
50%,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,0.06154,...,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004,1.0
75%,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,0.06612,...,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208,1.0
max,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,0.09744,...,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075,1.0


In [17]:
# Missing values: fraction of null entries per column, largest first
missing_fraction = df.isnull().sum() / len(df)
print("\nValores Ausentes:")
print(missing_fraction.sort_values(ascending=False))


Valores Ausentes:
target                     0.0
smoothness error           0.0
mean texture               0.0
mean perimeter             0.0
mean area                  0.0
mean smoothness            0.0
mean compactness           0.0
mean concavity             0.0
mean concave points        0.0
mean symmetry              0.0
mean fractal dimension     0.0
radius error               0.0
texture error              0.0
perimeter error            0.0
area error                 0.0
compactness error          0.0
worst fractal dimension    0.0
concavity error            0.0
concave points error       0.0
symmetry error             0.0
fractal dimension error    0.0
worst radius               0.0
worst texture              0.0
worst perimeter            0.0
worst area                 0.0
worst smoothness           0.0
worst compactness          0.0
worst concavity            0.0
worst concave points       0.0
worst symmetry             0.0
mean radius                0.0
dtype: float64


In [18]:
# Hold out 25% of the samples as a test set. The split is stratified on y and
# seeded with random_state=1 so it is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
    stratify=y,
)

In [19]:
# Baseline estimator: standardize the features, then fit a logistic regression.
pipeline = make_pipeline(
    StandardScaler(),
    LogisticRegression(random_state=1),
)

In [20]:
# Fit the model: train the scaler + logistic-regression pipeline on the training split.
# The fitted pipeline is the cell's last expression, so its repr is displayed.
pipeline.fit(X_train, y_train)

  if LooseVersion(joblib_version) < '0.12':


Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('logisticregression', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=1, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])

In [21]:
# Score the fitted pipeline on the held-out test data and on the training data
test_score = pipeline.score(X_test, y_test)
train_score = pipeline.score(X_train, y_train)
print('Model test Score: %.3f, ' % test_score,
      'Model training Score: %.3f' % train_score)

Model test Score: 0.965,  Model training Score: 0.991


O modelo apresenta as pontuações acima. Observe que o modelo tende a overfitting, pois a pontuação do teste (0,965) é menor que a pontuação do treinamento (0,991). Como já sabemos, overfitting significa viés baixo e variância alta; por isso, vamos aplicar o Bagging classifier.

The model produces the scores shown above. Note that the model tends to overfit the data: the test score (0.965) is lower than the training score (0.991). As we already know, overfitting means low bias and high variance, so let's apply a Bagging classifier.

In [22]:
# Fresh scaler + logistic-regression pipeline to serve as the bagging base
# estimator. Note: this rebinds `pipeline`, replacing the previously fitted
# baseline with a new, unfitted instance.
pipeline = make_pipeline(
    StandardScaler(),
    LogisticRegression(random_state=1),
)

In [23]:
# Instantiate the bagging classifier.
# The wrapped estimator is passed positionally on purpose: its keyword was
# renamed from `base_estimator` to `estimator` in scikit-learn 1.2 (and the old
# name removed in 1.4), whereas the first positional slot is the same in every
# release, so this cell runs on both old and current versions.
bgclassifier = BaggingClassifier(
    pipeline,
    n_estimators=100,   # number of pipelines in the ensemble
    max_features=10,    # integer value: 10 features drawn for each pipeline
    max_samples=100,    # integer value: 100 samples drawn for each pipeline
    random_state=1,     # fixed seed for reproducible resampling
    n_jobs=5,
)

In [24]:
# Fit the bagging classifier on the training split.
# The fitted ensemble is the cell's last expression, so its repr is displayed.
bgclassifier.fit(X_train, y_train)

BaggingClassifier(base_estimator=Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('logisticregression', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=1, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))]),
         bootstrap=True, bootstrap_features=False, max_features=10,
         max_samples=100, n_estimators=100, n_jobs=5, oob_score=False,
         random_state=1, verbose=0, warm_start=False)

In [31]:
# Score the fitted bagging ensemble on the held-out test data and on the training data
bag_test_score = bgclassifier.score(X_test, y_test)
bag_train_score = bgclassifier.score(X_train, y_train)
print('Model test Score: %.3f, ' % bag_test_score,
      'Model training Score: %.3f' % bag_train_score)

Model test Score: 0.958,  Model training Score: 0.977


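O modelo apresenta as pontuações acima. Observe que o modelo ainda tende a superajustar os dados, pois a pontuação do teste é 0,958 e a pontuação do treinamento é 0,977. No entanto, a diferença entre treinamento e teste diminuiu (0,019 contra 0,026 na Regressão Logística), o que sugere melhor desempenho de generalização do que o modelo ajustado apenas com Regressão Logística, embora a pontuação do teste em si seja ligeiramente menor (0,958 contra 0,965).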

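The model produces the scores shown above. Note that the model still tends to overfit the data, as the test score is 0.958 and the training score is 0.977. However, the gap between training and test scores has narrowed (0.019 versus 0.026 for Logistic Regression), which suggests better generalization performance than the model fit with Logistic Regression alone, even though the test score itself is slightly lower (0.958 versus 0.965).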


In [27]:
# Cross-validated score of the bagging classifier on the full dataset.
# cv is set explicitly: the library default changed from 3 to 5 folds in
# scikit-learn 0.22, so relying on it makes the result depend on the installed
# version. cv=3 pins the 3-fold behavior this notebook was originally run with.
scores = cross_val_score(bgclassifier, X, y, cv=3)



In [28]:
# Mean of the cross-validation scores
mean = np.mean(scores)

In [29]:
# Show the individual cross-validation scores (one per fold).
print(scores)

[0.97368421 0.96842105 0.97883598]


In [30]:
# Show the mean cross-validation score.
print(mean)

0.9736470806646246
