In [38]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error,mean_squared_error
import pandas as pd
import numpy as np

In [39]:
class TNGradientBoost:
    """Gradient boosting for regression built from ``DecisionTreeRegressor`` stages.

    The model starts from a constant prediction (the mean of the training
    target) and repeatedly fits a tree to the current residuals, adding each
    tree's output scaled by ``learning_rate``.

    Parameters
    ----------
    max_depth, min_samples_split, min_samples_leaf, max_features
        Forwarded unchanged to every ``DecisionTreeRegressor`` stage.
    learning_rate : float
        Factor applied to each tree's prediction before it is added.
    num_iter : int
        Number of boosting stages built by ``fit``.

    Attributes
    ----------
    F0 : float or None
        Constant initial prediction; ``None`` until ``fit`` has been called.
    trees : list
        Fitted trees, in the order they were added.
    """

    def __init__(self, max_depth=8, min_samples_split=5, min_samples_leaf=5, max_features=3, learning_rate=0.1, num_iter=50):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.max_features = max_features
        self.learning_rate = learning_rate
        self.num_iter = num_iter
        self.F0 = None
        self.trees = []

    @staticmethod
    def loss(y, y_pred):
        """Return the half mean squared error between ``y`` and ``y_pred``.

        Both inputs are flattened first, so a ``(n,)`` target can be compared
        with the ``(n, 1)`` column returned by ``predict`` without the
        subtraction broadcasting to an ``n x n`` matrix.

        Declared as a static method so it can be called on the class or on an
        instance with the same ``(y, y_pred)`` arguments.
        """
        residual = np.asarray(y, dtype=float).ravel() - np.asarray(y_pred, dtype=float).ravel()
        return 0.5 / residual.size * np.sum(np.square(residual))

    def _gradient(self, y, y_pred):
        """Return the residuals ``y - y_pred`` that the next tree is fitted to.

        For the half squared error this is the negative gradient with respect
        to the prediction (up to the constant ``1/n`` factor).
        """
        return np.array(y) - y_pred

    def _create_decision_tree(self, X, y):
        """Fit and return one regression tree using this model's tree settings."""
        tree_regressor = DecisionTreeRegressor(max_depth=self.max_depth,
                                               min_samples_split=self.min_samples_split,
                                               min_samples_leaf=self.min_samples_leaf,
                                               max_features=self.max_features)
        tree_regressor.fit(X, y)
        return tree_regressor

    def fit(self, X, y):
        """Fit ``num_iter`` boosting stages on ``X`` and ``y``.

        Any trees left over from a previous call are discarded, so fitting
        twice gives the same model as fitting once.
        """
        y = np.asarray(y, dtype=float).reshape(-1, 1)
        self.F0 = float(np.mean(y))
        # Start from a clean ensemble; otherwise a refit would append to the
        # old trees and predict would mix models.
        self.trees = []
        pred = np.full_like(y, self.F0)
        for _ in range(self.num_iter):
            grads = self._gradient(y, pred)
            base = self._create_decision_tree(X, grads)
            # Reshape to a column so the in-place addition lines up with pred.
            pred += self.learning_rate * np.asarray(base.predict(X)).reshape(-1, 1)
            self.trees.append(base)

    def predict(self, X):
        """Return predictions for ``X`` as an array of shape ``(len(X), 1)``.

        Raises
        ------
        RuntimeError
            If the model has not been fitted yet.
        """
        if self.F0 is None:
            raise RuntimeError("TNGradientBoost.predict called before fit")
        pred = np.full((len(X), 1), self.F0, dtype=float)
        # Iterate over the trees that were actually fitted rather than
        # range(num_iter), which may have been changed since fit.
        for tree in self.trees:
            pred += self.learning_rate * np.asarray(tree.predict(X)).reshape(-1, 1)
        return pred

## 1. Load dataset

In [40]:
# Read the advertising data from the working directory and preview the first rows.
df = pd.read_csv(filepath_or_buffer='advertising.csv')
df.head(n=5)

Unnamed: 0,TV,Radio,Newspaper,Sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,12.0
3,151.5,41.3,58.5,16.5
4,180.8,10.8,58.4,17.9


## 2. Train test split

In [41]:
# 'Sales' is the regression target; every other column is used as a feature.
y = df.loc[:, 'Sales']
X = df.drop(columns=['Sales'])

In [42]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

## 3. Using user-defined model

In [43]:
# Train the hand-written booster with its default hyper-parameters.
gg_mdl = TNGradientBoost()
gg_mdl.fit(X=X_train, y=y_train)

In [44]:
# Score the held-out rows with the hand-written booster.
y_pred = gg_mdl.predict(X=X_test)

In [45]:
# Compare the hand-written booster's predictions with the held-out targets.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
print(f'Mean squared error: {round(mse,3)}')
print(f'Mean absolute error: {round(mae,3)}')

Mean squared error: 1.954
Mean absolute error: 1.059


## 4. Using library's model

In [46]:
from xgboost import XGBRegressor

In [47]:
# Train XGBoost's regressor with its default settings as a reference model.
sk_mdl = XGBRegressor()
sk_mdl.fit(X=X_train, y=y_train)

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=None, ...)

In [48]:
# Score the held-out rows with the XGBoost reference model.
y_pred = sk_mdl.predict(X=X_test)

In [49]:
# Compare the XGBoost predictions with the held-out targets.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
print(f'Mean squared error: {round(mse,3)}')
print(f'Mean absolute error: {round(mae,3)}')

Mean squared error: 2.098
Mean absolute error: 1.103
