In [26]:
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

Load Dataset

In [17]:
data = datasets.load_breast_cancer()
df = pd.DataFrame(data.data, columns=data.feature_names)

In [18]:
X = data.data
y = data.target

X_train, X_test, y_train, y_test = train_test_split(X, y)

Make example GBM

In [7]:
learning_rate = 0.3
n_trees = 10
max_depth = 100

In [19]:
f_zero = y.mean()
Fm = f_zero
errors_clfs = []

In [21]:
# iterate over error tree classifiers
# we need to recalculate Fm in order to fit on the errors
for _ in range(n_trees):
    errors_clf = DecisionTreeRegressor(max_depth=max_depth)
    errors_clf.fit(X_train, y_train - Fm)
    Fm += learning_rate * errors_clf.predict(X_train)
    errors_clfs.append(errors_clf)

trees_predict = [t.predict(X_train) for t in errors_clfs]
y_hat = f_zero + learning_rate * np.sum(trees_predict, axis=0)

In [25]:
y_hat[:10]

array([ 2.8629174,  2.8629174,  2.8629174,  2.8629174,  2.8629174,
       -3.1370826,  2.8629174, -3.1370826, -3.1370826, -3.1370826])

Define classifier class

In [28]:
class GradientBoostedMachine:
    """Gradient Boosted Machine class"""

    def __init__(self, n_trees, learning_rate, max_depth) -> None:
        self.n_trees = n_trees
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.error_clfs = []
        self.f_zero = 0

    def fit(self, X, y) -> None:
        """Fit method for training the model"""
        self.f_zero = y.mean()
        f_m = self.f_zero
        for _ in range(self.n_trees):
            error_clf = DecisionTreeRegressor(max_depth=self.max_depth)
            error_clf.fit(X, y - f_m)
            f_m += self.learning_rate * error_clf.predict(X)
            self.error_clfs.append(error_clf)

    def predict(self, X) -> np.ndarray:
        """Predict method for making predictions"""
        predictions = [t.predict(X) for t in self.error_clfs]
        return self.f_zero + self.learning_rate + np.sum(predictions, axis=0)

Test our classifier against SKLearn's

In [None]:
sklearn_gbm = GradientBoostingRegressor(n_estimators=25, learning_rate=0.3, max_depth=1)
sklearn_gbm.fit(X, y)

scratch_gbm = GradientBoostedMachine(n_trees=25, learning_rate=0.3, max_depth=1)
scratch_gbm.fit(X, y)

mean_squared_error(y, sklearn_gbm.predict(X)), mean_squared_error(y, scratch_gbm.predict(X))