In [11]:
import sys
from pathlib import Path
# Put the parent of the current working directory at the front of sys.path.
# Presumably this is what lets the local `arboresque` package imported below
# resolve from the project checkout — assumes the notebook is run from a
# directory one level below the project root (TODO confirm).
project_root = Path("..").resolve()
sys.path.insert(0, str(project_root))

import numpy as np
import time
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# Project-local implementation under test; must be imported after the
# sys.path tweak above.
from arboresque import DecisionTreeRegressor

# Load the diabetes regression dataset that ships with scikit-learn.
diabetes = datasets.load_diabetes()

The diabetes dataset has 442 samples and 10 features, with every feature value lying between -0.2 and +0.2; the target values are integers between 25 and 346. By default, all feature values are mean-centered and scaled by the standard deviation times the square root of n_samples.

In [2]:
# Show the names of the feature columns.
diabetes.feature_names

['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']

In [3]:
# Peek at the first six rows of the feature matrix.
diabetes.data[:6]

array([[ 0.03807591,  0.05068012,  0.06169621,  0.02187239, -0.0442235 ,
        -0.03482076, -0.04340085, -0.00259226,  0.01990749, -0.01764613],
       [-0.00188202, -0.04464164, -0.05147406, -0.02632753, -0.00844872,
        -0.01916334,  0.07441156, -0.03949338, -0.06833155, -0.09220405],
       [ 0.08529891,  0.05068012,  0.04445121, -0.00567042, -0.04559945,
        -0.03419447, -0.03235593, -0.00259226,  0.00286131, -0.02593034],
       [-0.08906294, -0.04464164, -0.01159501, -0.03665608,  0.01219057,
         0.02499059, -0.03603757,  0.03430886,  0.02268774, -0.00936191],
       [ 0.00538306, -0.04464164, -0.03638469,  0.02187239,  0.00393485,
         0.01559614,  0.00814208, -0.00259226, -0.03198764, -0.04664087],
       [-0.09269548, -0.04464164, -0.04069594, -0.01944183, -0.06899065,
        -0.07928784,  0.04127682, -0.0763945 , -0.04117617, -0.09634616]])

In [4]:
# Peek at the first six target values.
diabetes.target[:6]

array([151.,  75., 141., 206., 135.,  97.])

In [5]:
# Hold out 30% of the samples for testing; the fixed seed keeps the split
# repeatable between runs.
X_reg = diabetes.data
y_reg = diabetes.target
Xtr, Xte, ytr, yte = train_test_split(
    X_reg,
    y_reg,
    test_size=0.3,
    random_state=0,
)

# Fit a tree with default settings (default criterion="mse").
reg = DecisionTreeRegressor()
reg.fit(Xtr, ytr)

# Report the score on both splits, train first.
print("Diabetes regressor")
for label, features, targets in (("Train R^2:", Xtr, ytr), ("Test R^2:", Xte, yte)):
    print(label, reg.score(features, targets))

Diabetes regressor
Train R^2: 1.0
Test R^2: -0.19208394096108883


The test R^2 is negative even though the train R^2 is 1.0, which points to the tree overfitting the training data. Next, try adding some stopping conditions.

In [6]:
# Depth of the tree fitted in the previous cell with default settings.
reg.get_depth()

20

In [None]:
print("Diabetes regressor, experimenting with max depth stopping criteria")

# Sweep a handful of depth limits. None is included as the reference run; it
# presumably means "no depth limit" (verify against the arboresque API).
depth_limits = (2, 3, 4, 5, 6, None)
for d in depth_limits:
    reg = DecisionTreeRegressor(max_depth=d)
    reg.fit(Xtr, ytr)
    train_score = reg.score(Xtr, ytr)
    test_score = reg.score(Xte, yte)
    print("max_depth:", d, "train R^2:", train_score, "test R^2:", test_score)

Diabetes regressor, experimenting with max depth stopping criteria
max_depth: 2 train R^2: 0.4867895939594755 test R^2: 0.2102166624565578
max_depth: 3 train R^2: 0.5744216142994143 test R^2: 0.1739704756705679
max_depth: 4 train R^2: 0.6577757989467647 test R^2: 0.1165400018798376
max_depth: 5 train R^2: 0.7321488177154096 test R^2: 0.041441111726097435
max_depth: 6 train R^2: 0.8076133804674929 test R^2: -0.096020218468025
max_depth: None train R^2: 1.0 test R^2: -0.19208394096108883


In [18]:
# Compare split criteria: for each criterion, fit trees at several depth
# limits and keep the depth with the highest test R^2.
# NOTE(review): the depth is chosen using the test split, so the reported test
# R^2 is an optimistic estimate; a separate validation split or
# cross-validation would avoid that.
criterions = ["mse", "mae", "poisson", "friedman"]
depths = [2, 3, 4, 5, 6]

# One (criterion, train R^2, test R^2, fit time in seconds, depth) tuple per
# criterion, holding that criterion's best depth.
results = []
for crit in criterions:
    cres = []
    for depth in depths:
        # perf_counter is the highest-resolution clock available for timing
        # short durations; time.time() can be too coarse for sub-millisecond
        # fits. Only the fit itself is timed, not the scoring.
        start = time.perf_counter()
        reg = DecisionTreeRegressor(criterion=crit, max_depth=depth)
        reg.fit(Xtr, ytr)
        tm = time.perf_counter() - start
        # score() is reported as R^2 elsewhere in this notebook, so name and
        # label it as R^2 rather than "accuracy".
        train_r2 = reg.score(Xtr, ytr)
        test_r2 = reg.score(Xte, yte)
        cres.append((train_r2, test_r2, tm, depth))
    # max() returns the first maximal entry, so ties go to the shallower tree.
    results.append((crit, *max(cres, key=lambda row: row[1])))

for crit, train_r2, test_r2, tm, depth in results:
    print(f"Criterion: {crit}, Depth: {depth}")
    print(f"    Time: {tm:.3f}")
    print(f"    Train R^2: {train_r2:.3f}")
    print(f"    Test R^2:  {test_r2:.3f}")
    print()

Criterion: mse, Depth: 2
    Time: 0.200
    Train accuracy: 0.487
    Test accuracy:  0.210

Criterion: mae, Depth: 2
    Time: 0.240
    Train accuracy: 0.472
    Test accuracy:  0.174

Criterion: poisson, Depth: 3
    Time: 0.410
    Train accuracy: 0.552
    Test accuracy:  0.280

Criterion: friedman, Depth: 2
    Time: 0.125
    Train accuracy: 0.487
    Test accuracy:  0.210



For comparison, the next cell shows how a basic scikit-learn regression tree performs on the same diabetes dataset.

In [22]:
# Import the `time` module rather than `from time import time`: rebinding
# `time` to a function would break the earlier `time.time()` calls if those
# cells are re-run after this one.
import time

from sklearn import datasets
from sklearn.model_selection import train_test_split
# Aliased so this import does not rebind `DecisionTreeRegressor`, which the
# cells above use for the arboresque implementation.
from sklearn.tree import DecisionTreeRegressor as SkDecisionTreeRegressor

diabetes = datasets.load_diabetes()
X_reg, y_reg = diabetes.data, diabetes.target

# Same split parameters as the arboresque cells, so the scores are comparable.
Xtr, Xte, ytr, yte = train_test_split(
    X_reg, y_reg, test_size=0.3, random_state=0
)

print("Diabetes sklearn DecisionTreeRegressor baseline (mse, no max_depth)")
reg = SkDecisionTreeRegressor(random_state=0)
reg.fit(Xtr, ytr)
print("Train R^2:", reg.score(Xtr, ytr))
print("Test R^2: ", reg.score(Xte, yte))

# (criterion, max_depth) pairs. The depths are the best ones printed by the
# arboresque comparison cell; the criterion names are presumably the sklearn
# equivalents of "mse", "mae", "poisson" and "friedman".
criterions = [
    ("squared_error", 2),
    ("absolute_error", 2),
    ("poisson", 3),
    ("friedman_mse", 2),
]

print("\nDiabetes criterion + depth comparison (sklearn)")
for crit, depth in criterions:
    # perf_counter is the highest-resolution clock available for timing short
    # durations; only the fit is timed, not the scoring.
    t0 = time.perf_counter()
    reg = SkDecisionTreeRegressor(
        criterion=crit,
        max_depth=depth,
        random_state=0,
    )
    reg.fit(Xtr, ytr)
    t1 = time.perf_counter()
    train_r2 = reg.score(Xtr, ytr)
    test_r2 = reg.score(Xte, yte)
    print(f"\nCriterion: {crit}, Depth: {depth}")
    print("    Time:", f"{t1 - t0:.3f}")
    print("    Train R^2:", f"{train_r2:.3f}")
    print("    Test R^2: ", f"{test_r2:.3f}")


Diabetes sklearn DecisionTreeRegressor baseline (mse, no max_depth)
Train R^2: 1.0
Test R^2:  -0.1331965474132717

Diabetes criterion + depth comparison (sklearn)

Criterion: squared_error, Depth: 2
    Time: 0.008
    Train R^2: 0.487
    Test R^2:  0.210

Criterion: absolute_error, Depth: 2
    Time: 0.013
    Train R^2: 0.472
    Test R^2:  0.174

Criterion: poisson, Depth: 3
    Time: 0.003
    Train R^2: 0.552
    Test R^2:  0.280

Criterion: friedman_mse, Depth: 2
    Time: 0.000
    Train R^2: 0.487
    Test R^2:  0.210
