## Compare the performance of the scikit and the custom decision tree regressors

Tested on the scikit Diabetes toy dataset.

## Import of the dataset

In [1]:
from sklearn.datasets import load_diabetes
import pandas as pd
import numpy as np

data = load_diabetes()
x = pd.DataFrame(data.data, columns = data.feature_names)
y = pd.DataFrame(data.target)
y = y.iloc[:,0]

print(x.head())
print(y[:5])


        age       sex       bmi        bp        s1        s2        s3  \
0  0.038076  0.050680  0.061696  0.021872 -0.044223 -0.034821 -0.043401   
1 -0.001882 -0.044642 -0.051474 -0.026328 -0.008449 -0.019163  0.074412   
2  0.085299  0.050680  0.044451 -0.005670 -0.045599 -0.034194 -0.032356   
3 -0.089063 -0.044642 -0.011595 -0.036656  0.012191  0.024991 -0.036038   
4  0.005383 -0.044642 -0.036385  0.021872  0.003935  0.015596  0.008142   

         s4        s5        s6  
0 -0.002592  0.019907 -0.017646  
1 -0.039493 -0.068332 -0.092204  
2 -0.002592  0.002861 -0.025930  
3  0.034309  0.022688 -0.009362  
4 -0.002592 -0.031988 -0.046641  
0    151.0
1     75.0
2    141.0
3    206.0
4    135.0
Name: 0, dtype: float64


## Split the dataset into training and testing sets

In [2]:
#split the dataset

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 42)

# Comparison of the performance of the tree classifiers
### 1) Compare on predefined parameters

In [15]:
from sklearn.tree import DecisionTreeRegressor
from tree import CustomDecisionTreeRegressor

print("Tree Regressor Comparison for overfit parameters")
print("------------------------------------------------------")

# Default tree (no specified parameters)
scikit_model = DecisionTreeRegressor(max_depth=100, random_state=42)
scikit_model.fit(x_train, y_train)
scikit_score = scikit_model.score(x_test, y_test)
print("Scikit-learn model accuracy on test data:        ", "{:.2f}".format(scikit_score))
scikit_train_score = scikit_model.score(x_train, y_train)
print("Scikit-learn model accuracy on training data:    ", "{:.2f}".format(scikit_train_score))

# Custom tree (no specified parameters)
custom_model = CustomDecisionTreeRegressor(max_depth=100)
custom_model.fit(x_train, y_train)
custom_score = custom_model.score(x_test, y_test)
print("Custom model accuracy on test data:              ", "{:.2f}".format(custom_score))
custom_train_score = custom_model.score(x_train, y_train)
print("Custom model accuracy on training data:          ", "{:.2f}".format(custom_train_score))


Tree Regressor Comparison for overfit parameters
------------------------------------------------------
Scikit-learn model accuracy on test data:         0.06
Scikit-learn model accuracy on training data:     1.00
Custom model accuracy on test data:               -0.05
Custom model accuracy on training data:           1.00


In [4]:
print("Tree Regressor Comparison for random parameters")
print("------------------------------------------------------")
print("max_depth=3, min_samples_split=4, min_samples_leaf=5")
print("------------------------------------------------------")

# Default tree (specified parameters)
scikit_model = DecisionTreeRegressor(max_depth=3, min_samples_split=4, min_samples_leaf=5, random_state=42)
scikit_model.fit(x_train, y_train)
scikit_score = scikit_model.score(x_test, y_test)
print("Scikit-learn model accuracy on test data:        ", "{:.2f}".format(scikit_score))
scikit_train_score = scikit_model.score(x_train, y_train)
print("Scikit-learn model accuracy on training data:    ", "{:.2f}".format(scikit_train_score))

# Custom tree (specified parameters)
custom_model = CustomDecisionTreeRegressor(max_depth=3, min_samples_split=4, min_samples_leaf=5)
custom_model.fit(x_train, y_train)
custom_score = custom_model.score(x_test, y_test)
print("Custom model accuracy on test data:              ", "{:.2f}".format(custom_score))
custom_train_score = custom_model.score(x_train, y_train)
print("Custom model accuracy on training data:          ", "{:.2f}".format(custom_train_score))


Tree Regressor Comparison for random parameters
------------------------------------------------------
max_depth=3, min_samples_split=4, min_samples_leaf=5
------------------------------------------------------
Scikit-learn model accuracy on test data:         0.42
Scikit-learn model accuracy on training data:     0.52
Custom model accuracy on test data:               0.41
Custom model accuracy on training data:           0.52


### 2) Compare on the best parameters obtained by hyperparameter tuning

#### Hyperparameter space

In [5]:
params = dict()
# max depth list from 1 to 20
params['max_depth'] = list(range(1, 32, 2))
params['min_samples_split'] = list(range(2, 31, 2))
params['min_samples_leaf'] = list(range(1, 32, 2))
print(params)

{'max_depth': [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31], 'min_samples_split': [2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30], 'min_samples_leaf': [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31]}


#### K-Fold Cross Validation

In [6]:
from sklearn.model_selection import cross_val_score, KFold

cv = KFold(n_splits=5, shuffle=True)

#### Initialize Optimizer

In [7]:
from sklearn.model_selection import GridSearchCV
from fast_parameter_search import MyGridSearchCV

scikit_clf = DecisionTreeRegressor(random_state=42)
# add random state to make the results reproducible
scikit_params = params.copy()
scikit_params['random_state'] = [42]
scikit_search = GridSearchCV(estimator=scikit_clf, param_grid=scikit_params, cv=cv, scoring='r2', verbose=0, n_jobs=-1)

custom_clf = CustomDecisionTreeRegressor()

custom_search = MyGridSearchCV(custom_clf, params, cv=cv, n_jobs=-1)

#### Results

In [8]:
#Scikit Learn Decision Tree
scikit_search.fit(x_train, y_train)
print("Scikit Learn Decision Tree")
print("Best score", scikit_search.best_score_)
print("Best parameters", scikit_search.best_params_)

# Custom Decision Tree
custom_search.fit(x_train, y_train)
print("Custom Decision Tree")
print("Best score", custom_search.best_score)
print("Best parameters", custom_search.best_params)

Scikit Learn Decision Tree
Best score 0.349680634322731
Best parameters {'max_depth': 5, 'min_samples_leaf': 29, 'min_samples_split': 2, 'random_state': 42}
Custom Decision Tree
Best score 0.4151344416660413
Best parameters {'max_depth': 23, 'min_samples_split': 14, 'min_samples_leaf': 21}


In [9]:
from sklearn.metrics import r2_score

best_scikit_model = scikit_search.best_estimator_
best_scikit_model.fit(x_train, y_train)
y_pred_scikit = best_scikit_model.predict(x_test)
scikit_accuracy = r2_score(y_test, y_pred_scikit)

best_custom_model = custom_search.best_estimator
best_custom_model = CustomDecisionTreeRegressor(max_depth=best_scikit_model.max_depth, min_samples_split=best_scikit_model.min_samples_split, min_samples_leaf=best_scikit_model.min_samples_leaf)
best_custom_model.fit(x_train, y_train)
y_pred_custom = best_custom_model.predict(x_test)
custom_accuracy = r2_score(y_test, y_pred_custom)
custom_accuracy = custom_model.score(x_test, y_test)

print("Tree Regressor Comparison for best parameters")
print("------------------------------------------------------")
print("Scikit-learn model r2 score on test data:        ", "{:.2f}".format(scikit_accuracy))
print("Custom model r2 score on test data:              ", "{:.2f}".format(custom_accuracy))



Tree Regressor Comparison for best parameters
------------------------------------------------------
Scikit-learn model r2 score on test data:         0.43
Custom model r2 score on test data:               0.41
