## Compare the performance of the scikit-learn and the custom gradient boosting implementations

Tested on the scikit-learn Diabetes toy dataset.

## Import of the dataset

In [2]:
from sklearn.datasets import load_diabetes
import pandas as pd
import numpy as np

# Load the Diabetes toy dataset (a regression task with numeric features
# and a continuous target).
data = load_diabetes()

# Feature matrix as a DataFrame, one named column per feature.
x = pd.DataFrame(data=data.data, columns=data.feature_names)

# Target values as a Series (named 0).
y = pd.Series(data.target, name=0)

# Quick look at the first rows of the features and of the target.
print(x.head())
print(y.head())


        age       sex       bmi        bp        s1        s2        s3  \
0  0.038076  0.050680  0.061696  0.021872 -0.044223 -0.034821 -0.043401   
1 -0.001882 -0.044642 -0.051474 -0.026328 -0.008449 -0.019163  0.074412   
2  0.085299  0.050680  0.044451 -0.005670 -0.045599 -0.034194 -0.032356   
3 -0.089063 -0.044642 -0.011595 -0.036656  0.012191  0.024991 -0.036038   
4  0.005383 -0.044642 -0.036385  0.021872  0.003935  0.015596  0.008142   

         s4        s5        s6  
0 -0.002592  0.019907 -0.017646  
1 -0.039493 -0.068332 -0.092204  
2 -0.002592  0.002861 -0.025930  
3  0.034309  0.022688 -0.009362  
4 -0.002592 -0.031988 -0.046641  
0    151.0
1     75.0
2    141.0
3    206.0
4    135.0
Name: 0, dtype: float64


## Split the dataset into training and testing sets

In [3]:
#split the dataset

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed seed keeps the split
# identical across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

## Search Space for Hyperparameter Tuning

In [4]:
# Hyperparameter grid used as the basis for every grid search in this notebook.
params = {
    # Tree depths 1, 4, 7, 10 and 13 (every third value below 14).
    'max_depth': list(range(1, 14, 3)),
    # Currently disabled dimensions of the grid:
    # 'min_samples_split': list(range(2, 102, 20)),
    # 'min_samples_leaf': list(range(1, 101, 20)),
    'max_features': ['sqrt'],
    # Single value for now; a wider alternative is list(range(50, 551, 100)).
    'n_estimators': [100],
    'learning_rate': [0.01, 0.1],
}

print(params)

{'max_depth': [1, 4, 7, 10, 13], 'max_features': ['sqrt'], 'n_estimators': [100], 'learning_rate': [0.01, 0.1]}


## K-Fold Cross Validation

In [5]:
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold, KFold

# Shuffled 5-fold cross-validation; the fixed seed keeps the fold assignment
# reproducible.
# Alternative splitter, currently disabled:
# cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=1)
cv = KFold(
    n_splits=5,
    shuffle=True,
    random_state=1,
)

## Initialize Optimizer

In [6]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

# Reference model: scikit-learn's gradient boosting regressor, tuned by an
# exhaustive grid search over `params`, scored with R2 on the folds from `cv`.
clf = GradientBoostingRegressor(random_state=42)
search = GridSearchCV(
    estimator=clf,
    param_grid=params,
    scoring='r2',
    cv=cv,
    n_jobs=-1,
    verbose=0,
)

from gradient_boosting import CustomGradientBoostingRegressor
from fast_parameter_search import MyGridSearchCV

# Two grid searches over the custom boosting implementation. Both start from
# the shared `params` grid and differ only in the single 'tree_type' value
# (presumably selecting which regression tree the booster builds on).

# Custom boosting with tree_type='scikit'.
scikit_params = {**params, 'tree_type': ['scikit']}
my_clf = CustomGradientBoostingRegressor()
my_search_scikit = MyGridSearchCV(my_clf, scikit_params, cv, n_jobs=-1)

# Custom boosting with tree_type='custom'.
custom_params = {**params, 'tree_type': ['custom']}
my_clf_2 = CustomGradientBoostingRegressor()
my_search_custom = MyGridSearchCV(my_clf_2, custom_params, cv, n_jobs=-1)


## Results

In [7]:
import time
def _run_search(searcher, boosting_label, tree_label, timing_label,
                score_attr='best_score', params_attr='best_params'):
    """Fit one grid search on the training split and print its best result.

    Parameters
    ----------
    searcher : object
        Grid-search object exposing ``fit(X, y)`` plus the two result
        attributes named by ``score_attr`` and ``params_attr``.
    boosting_label : str
        Printed first; names the boosting implementation being tuned.
    tree_label : str
        Printed second; names the decision tree implementation in use.
    timing_label : str
        Inserted into the "Time taken ..." line so the runs can be told apart.
    score_attr, params_attr : str
        Attribute names holding the best score / best parameters. The
        defaults match ``MyGridSearchCV`` as used in this notebook;
        scikit-learn's ``GridSearchCV`` uses a trailing underscore instead.
    """
    # perf_counter is a monotonic clock meant for measuring elapsed time.
    started = time.perf_counter()
    searcher.fit(x_train, y_train)
    print(boosting_label)
    print(tree_label)
    print("Best score", getattr(searcher, score_attr))
    print("Best parameters", getattr(searcher, params_attr))
    print(f"Time taken {timing_label}:", time.perf_counter() - started)


# scikit-learn boosting on scikit-learn decision trees
_run_search(
    search,
    "Scikit Learn Gradient Boosting",
    "Scikit Learn Decision Tree",
    "scikit",
    score_attr='best_score_',
    params_attr='best_params_',
)

# Custom boosting on scikit-learn decision trees
_run_search(
    my_search_scikit,
    "Custom Gradient Boosting",
    "Scikit Learn Decision Tree",
    "custom boosting, scikit tree",
)

# Custom boosting on custom decision trees
_run_search(
    my_search_custom,
    "Custom Gradient Boosting",
    "Custom Decision Tree",
    "custom boosting, custom tree",
)



KeyboardInterrupt: 

## Test the Models

In [8]:
from sklearn.metrics import r2_score
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
def _fit_and_score(model):
    """Fit `model` on the training split and evaluate it on the test split.

    Returns a tuple ``(predictions, r2)`` with the test-set predictions and
    their R2 score against `y_test`.
    """
    model.fit(x_train, y_train)
    predictions = model.predict(x_test)
    return predictions, r2_score(y_test, predictions)


# The three estimators below are built with fixed hyperparameters instead of
# being taken from the grid searches (search.best_estimator_,
# my_search_scikit.best_estimator, my_search_custom.best_estimator).

# scikit-learn gradient boosting
best = GradientBoostingRegressor(n_estimators=10, max_depth=1, learning_rate=0.1)
y_pred, score = _fit_and_score(best)
print(f"R2 score scikit boosting: {score}")

# Custom boosting with tree_type='scikit'
best_scikit = CustomGradientBoostingRegressor(
    learning_rate=0.1,
    max_depth=1,
    min_samples_leaf=1,
    n_estimators=10,
    tree_type='scikit',
)
y_pred, score_scikit = _fit_and_score(best_scikit)
print(f"R2 score scikit regression tree, custom boosting: {score_scikit}")

# Custom boosting with tree_type='custom'
best_custom = CustomGradientBoostingRegressor(
    learning_rate=0.1,
    max_depth=1,
    n_estimators=10,
    tree_type='custom',
)
y_pred, score_custom = _fit_and_score(best_custom)
print(f"R2 score custom regression tree, custom boosting: {score_custom}")


R2 score scikit boosting: 0.032807571092408394
Tree 0 score: 0.057818791254574564
Tree 1 score: 0.10762965670133473
R2 score scikit regression tree, custom boosting: 0.08956362520133598
Tree 0 score: 0.05781879125457445
Tree 1 score: 0.10762965670133451
R2 score custom regression tree, custom boosting: 0.08956362520133587


In [None]:
# from sklearn.metrics import r2_score
# test_clf = CustomGradientBoostingRegressor(tree_type='scikit', learning_rate=0.1, max_depth=4, max_features='sqrt', n_estimators=5)
# test_clf.fit(x_train, y_train)
# y_pred = test_clf.predict(x_test)
# score_test = r2_score(y_test, y_pred)
# print(f"Test score: {score_test}")
# test_clf.fit(x_train, y_train)
# y_pred = test_clf.predict(x_test)
# score_test = r2_score(y_test, y_pred)
# print(f"Test score 2: {score_test}")
# test_clf.fit(x_train, y_train)
# y_pred = test_clf.predict(x_test)
# score_test = r2_score(y_test, y_pred)
# print(f"Test score 3: {score_test}")