<a href="https://colab.research.google.com/github/lobral2728/ucb_ml_capstone/blob/colab/Ensemble_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install libraries

In [None]:
# xgboost is imported further down as `xgb`; install it into the notebook environment first
! pip install xgboost



# Load libraries

In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import StackingClassifier
import xgboost as xgb

# Load data

In [None]:
# Fetch the Iris dataset and pull out the feature matrix (X) and class labels (y)
iris = load_iris()
X, y = iris.data, iris.target

In [None]:
# Hold out 20% of the samples for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Display the training labels produced by the split
y_train

array([0, 0, 1, 0, 0, 2, 1, 0, 0, 0, 2, 1, 1, 0, 0, 1, 2, 2, 1, 2, 1, 2,
       1, 0, 2, 1, 0, 0, 0, 1, 2, 0, 0, 0, 1, 0, 1, 2, 0, 1, 2, 0, 2, 2,
       1, 1, 2, 1, 0, 1, 2, 0, 0, 1, 1, 0, 2, 0, 0, 1, 1, 2, 1, 2, 2, 1,
       0, 0, 2, 2, 0, 0, 0, 1, 2, 0, 2, 2, 0, 1, 1, 2, 1, 2, 0, 2, 1, 2,
       1, 1, 1, 0, 1, 1, 0, 1, 2, 2, 0, 1, 2, 2, 0, 2, 0, 1, 2, 2, 1, 2,
       1, 1, 2, 2, 0, 1, 2, 0, 1, 2])



---



# Voting classifier
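Train a few classifiers and aggregate their predictions. With hard voting, the ensemble predicts the class that gets the most votes; with soft voting (used below), it averages the predicted class probabilities and predicts the class with the highest average.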


In [None]:
# Instantiate the individual models.
# SVC (with probability=True) and DecisionTreeClassifier both draw on a random number
# generator, so they are seeded with the same value as the train/test split to keep
# the reported accuracy repeatable between runs.
log_clf = LogisticRegression()
svm_clf = SVC(probability=True, random_state=42)
tree_clf = DecisionTreeClassifier(random_state=42)

In [None]:
# Combine the three models; 'soft' voting averages their predicted class probabilities
voting_clf = VotingClassifier(
    voting='soft',
    estimators=[
        ('lr', log_clf),
        ('svc', svm_clf),
        ('tree', tree_clf),
    ],
)

In [None]:
# Train the voting ensemble on the training split
voting_clf.fit(X=X_train, y=y_train)

In [None]:
# Report mean accuracy on the held-out test split
print(
    "Voting Classifier Accuracy:",
    accuracy_score(y_test, voting_clf.predict(X_test)),
)

Voting Classifier Accuracy: 1.0




---



# Stacking

Instead of using trivial functions (such as hard voting) to aggregate the predictions of all predictors in an ensemble, why don't we train a model to perform this aggregation?


In [None]:
# Define base models as (name, estimator) pairs.
# SVC (with probability=True) and DecisionTreeClassifier both draw on a random number
# generator, so they are seeded with the same value as the train/test split to keep
# the reported accuracy repeatable between runs.
base_models = [
    ('lr', LogisticRegression()),
    ('svc', SVC(probability=True, random_state=42)),
    ('tree', DecisionTreeClassifier(random_state=42))
]

In [None]:
# Stack the base models; a logistic-regression meta-learner combines their outputs
stack_clf = StackingClassifier(
    final_estimator=LogisticRegression(),
    estimators=base_models,
)

In [None]:
# Train the stacked ensemble on the training split
stack_clf.fit(X=X_train, y=y_train)

In [None]:
# Report mean accuracy on the held-out test split
print(
    "Stacking Classifier Accuracy:",
    accuracy_score(y_test, stack_clf.predict(X_test)),
)

Stacking Classifier Accuracy: 1.0




---



# Bagging with Decision Trees

Use the same classifier for every predictor, but train each one on a different random subset of the training set.

Bagging: Sampling is performed with replacement


In [None]:
# Bagging classifier: 500 decision trees, each fitted on 100 samples drawn with
# replacement (bootstrap=True), trained in parallel on all cores (n_jobs=-1).
# random_state is pinned, matching the train/test split, so the resampling and the
# resulting accuracy are repeatable between runs.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=500,
    max_samples=100, bootstrap=True, n_jobs=-1,
    random_state=42
)

In [None]:
# Train the bagging ensemble on the training split
bag_clf.fit(X=X_train, y=y_train)

In [None]:
# Report mean accuracy on the held-out test split
print(
    "Bagging Classifier Accuracy:",
    accuracy_score(y_test, bag_clf.predict(X_test)),
)

Bagging Classifier Accuracy: 1.0


# Pasting with Decision Trees
Pasting is similar to bagging; the only difference is setting `bootstrap=False` to use sampling without replacement.



In [None]:
# Pasting classifier (same as the Bagging code, but `bootstrap=False` so each tree's
# 100 samples are drawn without replacement).
# random_state is pinned, matching the train/test split, so the subsampling and the
# resulting accuracy are repeatable between runs.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=500,
    max_samples=100, bootstrap=False, n_jobs=-1,
    random_state=42
)


In [None]:
# Fit and score exactly as in the bagging example
bag_clf.fit(X=X_train, y=y_train)
print(
    "Pasting Classifier Accuracy:",
    accuracy_score(y_test, bag_clf.predict(X_test)),
)


Pasting Classifier Accuracy: 1.0


# Random Forest

The fundamental difference between a Random Forest and bagging of Decision Trees is that, in a Random Forest, only a random subset of the features is considered at each node and the best split within that subset is used. In plain bagging, all features are considered when splitting a node.

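- To simulate RF using bagging, limit each tree to a random subset of the features at every split by initializing the DT as:

`dt = DecisionTreeClassifier(max_features='sqrt')`

  (Setting only `splitter='random'` randomizes the split thresholds but still considers every feature.)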



In [None]:
# Random Forest classifier with 100 trees.
# random_state is pinned, matching the train/test split, so the bootstrap samples,
# the per-split feature subsets and the resulting accuracy are repeatable between runs.
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)

In [None]:
# Train the forest on the training split
rf_clf.fit(X=X_train, y=y_train)

In [None]:
# Report mean accuracy on the held-out test split
print(
    "Random Forest Classifier Accuracy:",
    accuracy_score(y_test, rf_clf.predict(X_test)),
)

Random Forest Classifier Accuracy: 1.0




---



# Boosting with AdaBoost

- Particularly effective for binary classification tasks
- Focuses on converting weak learners into strong ones in a sequential manner.

In [None]:
# AdaBoost classifier: 200 sequentially boosted decision stumps (max_depth=1) with a
# learning rate of 0.5.
# random_state is pinned, matching the train/test split, so the reported accuracy is
# repeatable between runs.
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1), n_estimators=200, learning_rate=0.5,
    random_state=42
)

In [None]:
# Train the boosted ensemble on the training split
ada_clf.fit(X=X_train, y=y_train)

In [None]:
# Report mean accuracy on the held-out test split
print(
    "AdaBoost Classifier Accuracy:",
    accuracy_score(y_test, ada_clf.predict(X_test)),
)

AdaBoost Classifier Accuracy: 0.9666666666666667


# Gradient Boosting

- Gradient Boosting is a more general and flexible boosting approach than AdaBoost
- It also builds models sequentially, but it uses the gradient of the loss function to guide the learning process


In [None]:
# Gradient boosting with 100 depth-1 trees, a learning rate of 1.0 and a fixed seed
gb_clf = GradientBoostingClassifier(
    max_depth=1,
    n_estimators=100,
    learning_rate=1.0,
    random_state=42,
)

In [None]:
# Train the gradient-boosted ensemble on the training split
gb_clf.fit(X=X_train, y=y_train)

In [None]:
# Report mean accuracy on the held-out test split
print(
    "Gradient Boosting Classifier Accuracy:",
    accuracy_score(y_test, gb_clf.predict(X_test)),
)

Gradient Boosting Classifier Accuracy: 0.9666666666666667


# XGBoost

XGBoost is an optimized distributed gradient boosting library designed to be highly efficient, flexible, and portable. It implements machine learning algorithms under the Gradient Boosting framework. XGBoost provides a massive improvement in speed and performance compared to traditional Gradient Boosting.

In [None]:
# XGBoost classifier, evaluated with multi-class log loss.
# `use_label_encoder` is not passed: the installed xgboost ignores it and only emits a
# 'Parameters: { "use_label_encoder" } are not used.' warning when it is supplied.
# NOTE(review): some older xgboost releases may emit a label-encoder deprecation
# warning when this argument is omitted — confirm if an old version must be supported.
xgb_clf = xgb.XGBClassifier(eval_metric='mlogloss')
xgb_clf.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.



In [None]:
# Predict class labels for the held-out test split
predictions = xgb_clf.predict(X=X_test)

In [None]:
# Compare the predicted labels against the true test labels and report accuracy
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print(
    "XGBoost Classifier Accuracy:",
    accuracy,
)

XGBoost Classifier Accuracy: 1.0
