In [None]:
from sklearn.metrics import accuracy_score
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification

X,y = make_classification(n_samples=10000, n_features=10, n_informative=3)


X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)


dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train,y_train)
y_pred = dt.predict(X_test)

print("Decision Tree accuracy",accuracy_score(y_test,y_pred))

Decision Tree accuracy 0.88


In [None]:
bag = BaggingClassifier(
    base_estimator=DecisionTreeClassifier(),
    n_estimators= 500,
    max_samples= 0.5, #means we are taking 50% of the rows i.e only 4000 rows
    bootstrap=True,
    random_state=42
)

In [None]:
bag.fit(X_train,y_train)



In [None]:
y_pred = bag.predict(X_test)

In [None]:
accuracy_score(y_test,y_pred)

0.914

In [None]:
bag.estimators_samples_[0].shape

(4000,)

In [None]:
bag.estimators_features_[0].shape

(10,)

## Bagging using SVM

In [None]:
bag = BaggingClassifier(
    base_estimator=SVC(),
    n_estimators=500,
    max_samples=0.25,
    bootstrap=True,
    random_state=42
)


bag.fit(X_train,y_train)
y_pred = bag.predict(X_test)
print("Bagging using SVM",accuracy_score(y_test,y_pred))



Bagging using SVM 0.9015


##Pasting

In [None]:
bag = BaggingClassifier(
    base_estimator=DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=0.25,
    bootstrap=False,      # sampling without replacement is pasting
    random_state=42,
    verbose = 1,
    n_jobs=-1
)

In [None]:
bag.fit(X_train,y_train)
y_pred = bag.predict(X_test)
print("Pasting classifier",accuracy_score(y_test,y_pred))

[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:   10.8s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


Pasting classifier 0.912


[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.4s finished


##Random Subspaces
Here we don't perform row sampling, we only do column sampling

In [None]:
bag = BaggingClassifier(
    base_estimator=DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=1.0,  # this means we are taking all the samples i.e. 8000 rows
    bootstrap=False,
    max_features=0.5, # this means we will only use 5 columns out of 10
    bootstrap_features=True,
    random_state=42
)

In [None]:
bag.fit(X_train,y_train)
y_pred = bag.predict(X_test)
print("Random Subspaces classifier",accuracy_score(y_test,y_pred))



Random Subspaces classifier 0.9


In [None]:
bag.estimators_samples_[0].shape

(8000,)

In [None]:
bag.estimators_features_[0].shape

(5,)

##Random Patches
Here we perform both row sampling as well as column sampling

In [None]:
bag = BaggingClassifier(
    base_estimator=DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=0.25,
    bootstrap=True,
    max_features=0.5,
    bootstrap_features=True,
    random_state=42
)


bag.fit(X_train,y_train)
y_pred = bag.predict(X_test)
print("Random Patches classifier",accuracy_score(y_test,y_pred))



Random Patches classifier 0.9095


## OOB score

In [None]:
bag = BaggingClassifier(
    base_estimator=DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=0.25,
    bootstrap=True,
    oob_score=True,
    random_state=42
)


bag.fit(X_train,y_train)

y_pred = bag.predict(X_test)
print("OOB score",accuracy_score(y_test,y_pred))



OOB score 0.9125



##Bagging Tips

1. Bagging generally gives better results
than Pasting
2.  Good results come around the 25% to 50% row sampling mark
3. Random patches and subspaces should be used while dealing with high dimensional data
4. To find the correct hyperparameter values we can do GridSearchCV/RandomSearchCV


  
  
  
  





In [None]:
from sklearn.model_selection import GridSearchCV


parameters = {
    'n_estimators': [50,100,500],
    'max_samples': [0.1,0.4,0.7,1.0],
    'bootstrap' : [True,False],
    'max_features' : [0.1,0.4,0.7,1.0]
    }


search = GridSearchCV(BaggingClassifier(), parameters, cv=5)


search.fit(X_train,y_train)

In [None]:
search.best_params_

{'bootstrap': True,
 'max_features': 0.7,
 'max_samples': 0.4,
 'n_estimators': 50}

In [None]:
search.best_score_

0.921125

##Bagging Regressor

In [65]:
from sklearn import datasets
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score

In [62]:
boston = pd.read_csv('BostonHousing.csv')
boston.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [63]:
X_boston = boston.iloc[:, :-1].values  # All rows, all columns except the last one
Y_boston = boston.iloc[:, -1].values
print('Dataset features names: ', boston.columns[:-1].tolist())
print('Dataset features size: ', X_boston.shape)
print('Dataset target size: ', Y_boston.shape)

Dataset features names:  ['crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax', 'ptratio', 'b', 'lstat']
Dataset features size:  (506, 13)
Dataset target size:  (506,)


In [64]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X_boston, Y_boston , train_size=0.80, test_size=0.20, random_state=123)
print('Train/Test Sets Sizes : ',X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)

Train/Test Sets Sizes :  (404, 13) (102, 13) (404,) (102,)


In [66]:
# Impute missing values using the mean strategy
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)  # Use the same imputer to transform the test set

# Initialize the models
lr = LinearRegression()
dt = DecisionTreeRegressor()
knn = KNeighborsRegressor()

# Fit the models
lr.fit(X_train, Y_train)
dt.fit(X_train, Y_train)
knn.fit(X_train, Y_train)

In [67]:
y_pred1 = lr.predict(X_test)
y_pred2 = dt.predict(X_test)
y_pred3 = knn.predict(X_test)

In [68]:
print("R^2 score for LR",r2_score(Y_test,y_pred1))
print("R^2 score for DT",r2_score(Y_test,y_pred2))
print("R^2 score for KNN",r2_score(Y_test,y_pred3))

R^2 score for LR 0.6593498476094108
R^2 score for DT 0.4397696840596721
R^2 score for KNN 0.5475962186976784


In [69]:
from sklearn.ensemble import BaggingRegressor

bag_regressor = BaggingRegressor(random_state=1)
bag_regressor.fit(X_train, Y_train)


In [70]:
Y_preds = bag_regressor.predict(X_test)

print('Training Coefficient of R^2 : %.3f'%bag_regressor.score(X_train, Y_train))
print('Test Coefficient of R^2 : %.3f'%bag_regressor.score(X_test, Y_test))

Training Coefficient of R^2 : 0.980
Test Coefficient of R^2 : 0.820


In [72]:
%%time

n_samples = X_boston.shape
n_features = Y_boston.shape

params = {'base_estimator': [None, LinearRegression(), KNeighborsRegressor()],
          'n_estimators': [20,50,100],
          'max_samples': [0.5,1.0],
          'max_features': [0.5,1.0],
          'bootstrap': [True, False],
          'bootstrap_features': [True, False]}

bagging_regressor_grid = GridSearchCV(BaggingRegressor(random_state=1, n_jobs=-1), param_grid =params, cv=3, n_jobs=-1, verbose=1)
bagging_regressor_grid.fit(X_train, Y_train)

print('Train R^2 Score : %.3f'%bagging_regressor_grid.best_estimator_.score(X_train, Y_train))
print('Test R^2 Score : %.3f'%bagging_regressor_grid.best_estimator_.score(X_test, Y_test))
print('Best R^2 Score Through Grid Search : %.3f'%bagging_regressor_grid.best_score_)
print('Best Parameters : ',bagging_regressor_grid.best_params_)

Fitting 3 folds for each of 144 candidates, totalling 432 fits
Train R^2 Score : 0.984
Test R^2 Score : 0.801
Best R^2 Score Through Grid Search : 0.869
Best Parameters :  {'base_estimator': None, 'bootstrap': True, 'bootstrap_features': False, 'max_features': 1.0, 'max_samples': 1.0, 'n_estimators': 50}
CPU times: user 1.32 s, sys: 148 ms, total: 1.47 s
Wall time: 1min 1s
