In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

In [8]:
X, y = make_classification(n_samples=10000, n_features=10, n_informative=3)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
'''
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X_train_sc = sc.fit_transform(X_train)
X_test_sc = sc.transform(X_test)
'''

'\nfrom sklearn.preprocessing import StandardScaler\n\nsc = StandardScaler()\nX_train_sc = sc.fit_transform(X_train)\nX_test_sc = sc.transform(X_test)\n'

In [11]:
model_dt = DecisionTreeClassifier(random_state=42)
model_dt.fit(X_train, y_train)
y_pred_dt = model_dt.predict(X_test)

print("Decision Tree Accuracy: ", round(accuracy_score(y_test, y_pred_dt)*100,2), "%")

Decision Tree Accuracy:  91.6 %


### **Bagging**

In [12]:
bag = BaggingClassifier(estimator=DecisionTreeClassifier(),
                        n_estimators=500,
                        max_samples=0.5,
                        bootstrap=True,
                        random_state=42,
                        )

In [13]:
bag.fit(X_train, y_train)

In [14]:
y_pred_bag = bag.predict(X_test)

In [15]:
print("Bagged Classifier Accuracy: ", round(accuracy_score(y_test, y_pred_bag)*100,2), "%")

Bagged Classifier Accuracy:  94.0 %


### **Random Forest**

In [16]:
model_rf = RandomForestClassifier(random_state=42, n_estimators=500)
model_rf.fit(X_train, y_train)
y_pred_rf = model_rf.predict(X_test)

print("Random Forest Accuracy: ", round(accuracy_score(y_test, y_pred_rf)*100,2), "%")

Random Forest Accuracy:  93.85 %


### **Bagging using SVM**

In [18]:
bag_svm = BaggingClassifier(estimator=SVC(),
                        n_estimators=500,
                        max_samples=0.25,
                        bootstrap=True,
                        random_state=42,
                        )

In [19]:
bag_svm.fit(X_train, y_train)
y_pred_bagsvm = bag_svm.predict(X_test)
print("Bagged Classifier (SVM) Accuracy: ", round(accuracy_score(y_test, y_pred_bagsvm)*100,2), "%")

Bagged Classifier (SVM) Accuracy:  92.05 %


### **Pasting**

In [20]:
pasting = BaggingClassifier(estimator=DecisionTreeClassifier(),
                        n_estimators=500,
                        max_samples=0.25,
                        bootstrap=False,
                        random_state=42,
                        )

In [21]:
pasting.fit(X_train, y_train)
y_pred_pasting = pasting.predict(X_test)
print("Pasting Classifier Accuracy: ", round(accuracy_score(y_test, y_pred_pasting)*100,2), "%")

Pasting Classifier Accuracy:  93.95 %


Takeaways:

- Random Forest is better than Bagged models, and further Bagged models are better than Pasting
- Good results come around 25% to 50% row sampling
- In order to find the best parameters, we need to do hyper parameter optimization