## Compare the performance of the scikit and the custom decision tree regressors

Tested on the scikit Diabetes toy dataset.

## Import of the dataset

In [1]:
from sklearn.datasets import load_diabetes
import pandas as pd
import numpy as np

# Iris dataset contains 3 classes and 4 features
data = load_diabetes()
#convert to a dataframe
x = pd.DataFrame(data.data, columns = data.feature_names)
#create the species column
y = pd.DataFrame(data.target)
y = y.iloc[:,0]

print(x.head())
print(y[:5])


        age       sex       bmi        bp        s1        s2        s3  \
0  0.038076  0.050680  0.061696  0.021872 -0.044223 -0.034821 -0.043401   
1 -0.001882 -0.044642 -0.051474 -0.026328 -0.008449 -0.019163  0.074412   
2  0.085299  0.050680  0.044451 -0.005670 -0.045599 -0.034194 -0.032356   
3 -0.089063 -0.044642 -0.011595 -0.036656  0.012191  0.024991 -0.036038   
4  0.005383 -0.044642 -0.036385  0.021872  0.003935  0.015596  0.008142   

         s4        s5        s6  
0 -0.002592  0.019907 -0.017646  
1 -0.039493 -0.068332 -0.092204  
2 -0.002592  0.002861 -0.025930  
3  0.034309  0.022688 -0.009362  
4 -0.002592 -0.031988 -0.046641  
0    151.0
1     75.0
2    141.0
3    206.0
4    135.0
Name: 0, dtype: float64


## Split the dataset into training and testing sets

In [2]:
#split the dataset

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 42)

## Search Space for Hyperparameter Tuning

In [3]:
params = dict()
# max depth list from 1 to 20
params['max_depth'] = list(range(1, 12, 2))
params['min_samples_split'] = list(range(2, 102, 10))
params['min_samples_leaf'] = list(range(1, 101, 10))
print(params)

{'max_depth': [1, 3, 5, 7, 9, 11], 'min_samples_split': [2, 12, 22, 32, 42, 52, 62, 72, 82, 92], 'min_samples_leaf': [1, 11, 21, 31, 41, 51, 61, 71, 81, 91]}


## K-Fold Cross Validation

In [4]:
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold, KFold

# cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=1)
cv = KFold(n_splits=10, shuffle=True, random_state=1)

## Initialize Optimizer

In [5]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV

clf = DecisionTreeRegressor(random_state=42)
search = GridSearchCV(estimator=clf, param_grid=params, cv=5, scoring='r2', verbose=0, n_jobs=-1)

from my_tree import CustomDecisionTreeRegressor
from fast_parameter_search import MyGridSearchCV

my_clf = CustomDecisionTreeRegressor()

my_search = MyGridSearchCV(my_clf, params, cv=cv, n_jobs=-1)


## Results

In [14]:
import time
start = time.time()
#Scikit Learn Decision Tree
search.fit(x_train, y_train)
end = time.time()
print("Scikit Learn Decision Tree")
print("Best score", search.best_score_)
print("Best parameters", search.best_params_)
print("Time taken to fit the scikit model", end-start)

# My Decision Tree
# Time how long it takes to fit the model
start = time.time()
my_search.fit(x_train, y_train)
end = time.time()
print("Custom Decision Tree")
print("Best score", my_search.best_score)
print("Best parameters", my_search.best_params)
print("Time taken to fit the custom model", end-start)



Scikit Learn Decision Tree
Best score 0.35090510941645536
Best parameters {'max_depth': 3, 'min_samples_leaf': 51, 'min_samples_split': 2}
Time taken to fit the scikit model 2.0333940982818604
Custom Decision Tree
Best score 0.34467539052044505
Best parameters {'max_depth': 7, 'min_samples_split': 62, 'min_samples_leaf': 1}
Time taken to fit the custom model 100.95859003067017


## Test the Models

In [15]:
from sklearn.metrics import r2_score
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
# Get the best estimator
best = search.best_estimator_
# Fit the model
best.fit(x_train, y_train)
# Predict the test data
y_pred = best.predict(x_test)
# Get model score
score = r2_score(y_test, y_pred)
print(f"R2 score scikit regression tree: {score}")

best_custom = my_search.best_estimator
best_custom.fit(x_train, y_train)
y_pred = best_custom.predict(x_test)
score_custom = r2_score(y_test, y_pred)
print(f"R2 score custom regression tree: {score_custom}")

R2 score scikit regression tree: 0.3666575524702239
R2 score custom regression tree: 0.40531738249073135
