# Import Libraries

In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [94]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [39]:
import os

# Import datasets

In [40]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("uciml/iris")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/iris


In [41]:
for dirname, _, filenames in os.walk(path):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/iris/Iris.csv
/kaggle/input/iris/database.sqlite


In [42]:
df = pd.read_csv(path + '/Iris.csv')
df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


# EDA

## Preprocessing

In [43]:
df.drop(columns=['Id'], axis=1, inplace=True) # Use the correct drop method

In [44]:
scaler = LabelEncoder()
df['Species'] = scaler.fit_transform(df['Species'])

In [45]:
df.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


## Features Selection

In [46]:
df.columns

Index(['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm',
       'Species'],
      dtype='object')

In [47]:
# df = df.drop(columns=['SepalWidthCm', 'PetalLengthCm'])
# df.head()

# Train Test Split

In [48]:
X = df.drop(columns=['Species'])
y = df['Species']

In [49]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vanilla Decision Tree Classifier

In [50]:
dtc1 = DecisionTreeClassifier()
dtc1.fit(X_train, y_train)
y_pred = dtc1.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')

Accuracy: 1.0


# Bagging using DecisionTreeClassifier

In [51]:
dtc2 = DecisionTreeClassifier()

In [52]:
bag1 = BaggingClassifier(
    estimator = dtc2,
    n_estimators = 100,
    max_samples = 0.25,
    bootstrap = True,
    random_state = 42
    )
bag1.fit(X_train, y_train)

## Prediction

In [53]:
y_pred2 = bag1.predict(X_test)
print(f'Accuracy Score: {np.round(accuracy_score(y_test, y_pred2), 4)}')

Accuracy Score: 1.0


In [54]:
bag1.estimators_samples_[0] ## Return The row number which uses in Estimator 1

array([ 91,  41,  74,  76,   2,   9, 103,  92,  71,  88,  19, 114,  61,
        28, 105,  92,  84,   6,  13,  68,  17,  82,  51, 105,  74, 113,
        34,  93,   4,  34])

In [55]:
bag1.estimators_features_[0] ## Return The column number which uses in Estimator 1

array([0, 1, 2, 3])

# Bagging Using SVC

In [57]:
svc1 = SVC(probability=True, kernel='poly')

In [61]:
bag2 = BaggingClassifier(
    estimator = svc1,
    n_estimators = 100,
    max_samples = 0.25,
    bootstrap = True,
    random_state = 42
    )

In [62]:
y_pred3 = bag2.fit(X_train, y_train).predict(X_test)
print(f'Accuracy Score: {np.round(accuracy_score(y_test, y_pred3), 4)}')

Accuracy Score: 1.0


In [63]:
bag2.estimators_samples_[0] ## Return The row number which uses in Estimator 1

array([ 91,  41,  74,  76,   2,   9, 103,  92,  71,  88,  19, 114,  61,
        28, 105,  92,  84,   6,  13,  68,  17,  82,  51, 105,  74, 113,
        34,  93,   4,  34])

In [64]:
bag2.estimators_features_[0] ## Return The column number which uses in Estimator 1

array([0, 1, 2, 3])

# Pasting

In [66]:
bag3 = BaggingClassifier(
    estimator = dtc2,
    n_estimators = 100,
    max_samples = 0.25,
    bootstrap = False,
    random_state = 42,
    verbose = 1,
    n_jobs = -1 # Using All Cores of Processor
    )

In [67]:
bag3.fit(X_train, y_train)
y_pred4 = bag3.predict(X_test)
print(f'Accuracy Score: {np.round(accuracy_score(y_test, y_pred4), 4)}')

[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


Accuracy Score: 1.0


[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    3.9s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.1s finished


# Random Subspaces

In [82]:
bag4 = BaggingClassifier(
    estimator = dtc2,
    n_estimators = 500,

    max_samples = 1.0,
    bootstrap = False,

    max_features = 0.5,
    bootstrap_features = True,

    random_state = 42,
    verbose = 1,
    n_jobs = -1 # Using All Cores of Processor
    )

In [83]:
bag4.fit(X_train, y_train)
y_pred5 = bag4.predict(X_test)
print(f'Accuracy Score: {np.round(accuracy_score(y_test, y_pred5), 4)}')

[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.9s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


Accuracy Score: 1.0


[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.2s finished


In [85]:
bag4.estimators_samples_[0].shape ## Return The row number which uses in Estimator 1

(120,)

In [87]:
bag4.estimators_features_[0].shape ## Return The column number which uses in Estimator 1


(2,)

# Random Patches

In [88]:
bag5 = BaggingClassifier(
    estimator = dtc2,
    n_estimators = 500,

    max_samples = 0.25,
    bootstrap = True,

    max_features = 0.5,
    bootstrap_features = True,

    random_state = 42,
    verbose = 1,
    n_jobs = -1 # Using All Cores of Processor
    )

In [89]:
bag5.fit(X_train, y_train)
y_pred6 = bag5.predict(X_test)
print(f'Accuracy Score: {np.round(accuracy_score(y_test, y_pred6), 4)}')

[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    1.9s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


Accuracy Score: 1.0


[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.4s finished


# OOB Score (Out of Bag)

In [90]:

bag6 = BaggingClassifier(

    estimator = dtc2,
    n_estimators = 500,

    max_samples = 0.25,
    bootstrap = True,
    oob_score = True,

    random_state = 42,
    verbose = 1,
    n_jobs = -1 # Using All Cores of Processor
)

In [91]:
bag6.fit(X_train, y_train)
y_pred7 = bag6.predict(X_test)
print(f'Accuracy Score: {np.round(accuracy_score(y_test, y_pred7), 4)}')

[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    1.5s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.2s finished


Accuracy Score: 1.0


In [93]:
bag6.oob_score_

0.9416666666666667

# Bagging Tips
- `Bagging` generally gives better results than `Pasting`
- Good results come around the `25%` to `50%` row sampling mark
- `Random patches` and `Subspaces` should be used while dealing with high dimensional data
- To find the correct hyperparameter values we can do `GridSearchCV`/`RandomSearchCV`

# Grid Search CV

```
For Find The Right Parameter

```

In [95]:
parameters = {
    'n_estimators': [50, 100, 500],
    'max_samples': [0.25, 0.5, 0.75, 0.9],
    'bootstrap' : [True, False],
    'max_features' : [0.25, 0.4, 0.6, 0.75],
    'bootstrap_features' : [True, False]
    }

In [98]:
search = GridSearchCV(BaggingClassifier(), parameters, cv=5, n_jobs = -1)

In [99]:
search.fit(X_train, y_train)

In [100]:
search.best_params_

{'bootstrap': True,
 'bootstrap_features': False,
 'max_features': 0.75,
 'max_samples': 0.25,
 'n_estimators': 50}

In [101]:
search.best_estimator_

In [102]:
search.best_score_

np.float64(0.9583333333333334)