In [1]:
# Put the parent directory at the front of the import path.
# Presumably this is what makes the local `arboresque` package importable
# without installing it (assumes the notebook runs from a directory one
# level below the project root — TODO confirm).
import sys
from pathlib import Path
project_root = Path("..").resolve()
sys.path.insert(0, str(project_root))

import numpy as np
import time
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
# Aliased so it does not clash with Arboresque's class of the same name.
from sklearn.tree import DecisionTreeClassifier as SklearnDT

from arboresque import DecisionTreeClassifier

# Bundled scikit-learn toy datasets used by the cells below.
iris = datasets.load_iris()
wine = datasets.load_wine()

The iris dataset contains 150 samples, each described by 4 features (sepal and petal measurements), drawn from three iris species: Setosa, Versicolor and Virginica.

In [2]:
# Show a handful of hand-picked rows: feature vector followed by class label.
for row_idx in (12, 13, 86, 140, 75):
    features, label = iris.data[row_idx], iris.target[row_idx]
    print(features, label)

[4.8 3.  1.4 0.1] 0
[4.3 3.  1.1 0.1] 0
[6.7 3.1 4.7 1.5] 1
[6.7 3.1 5.6 2.4] 2
[6.6 3.  4.4 1.4] 1


In [3]:
X, y = iris.data, iris.target

# Stratified 70/30 split with a fixed seed so the class balance is kept.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
    stratify=y,
)

# Constructor defaults are used here (default criterion="gini").
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)

print("Iris classifier")
for split_name, X_part, y_part in (
    ("Train", X_train, y_train),
    ("Test", X_test, y_test),
):
    print(f"{split_name} accuracy:", clf.score(X_part, y_part))

Iris classifier
Train accuracy: 1.0
Test accuracy: 0.9111111111111111


In [4]:
# Per-class metrics and confusion matrix for the tree fitted above.
y_pred = clf.predict(X_test)

report_text = classification_report(y_test, y_pred)
conf_mat = confusion_matrix(y_test, y_pred)

for heading, body in (
    ("Classification report:\n", report_text),
    ("Confusion matrix:\n", conf_mat),
):
    print(heading)
    print(body)

Classification report:

              precision    recall  f1-score   support

           0       1.00      0.87      0.93        15
           1       0.79      1.00      0.88        15
           2       1.00      0.87      0.93        15

    accuracy                           0.91        45
   macro avg       0.93      0.91      0.91        45
weighted avg       0.93      0.91      0.91        45

Confusion matrix:

[[13  2  0]
 [ 0 15  0]
 [ 0  2 13]]


In [5]:
# Compare the split criteria on the iris split created above.
criterions = ["gini", "entropy"]

# One record per criterion, kept so the numbers can be inspected after the
# loop (e.g. with pd.DataFrame(results)).
results = []
for crit in criterions:
    # perf_counter() is the clock intended for measuring short intervals;
    # time.time() can be too coarse for a fit this fast.
    start = time.perf_counter()
    clf = DecisionTreeClassifier(criterion=crit)
    clf.fit(X_train, y_train)
    tm = time.perf_counter() - start

    train_acc = clf.score(X_train, y_train)
    test_acc = clf.score(X_test, y_test)
    depth = clf.get_depth()
    results.append(
        {
            "criterion": crit,
            "train_acc": train_acc,
            "test_acc": test_acc,
            "fit_time_s": tm,
            "depth": depth,
        }
    )

    print(f"Criterion: {crit}")
    print(f"    Train accuracy: {train_acc:.3f}")
    print(f"    Test accuracy:  {test_acc:.3f}")
    print(f"    Time: {tm:.3f}")
    print(f"    Depth: {depth}")
    print()

Criterion: gini
    Train accuracy: 1.000
    Test accuracy:  0.911
    Time: 0.032
    Depth: 4

Criterion: entropy
    Train accuracy: 1.000
    Test accuracy:  0.844
    Time: 0.025
    Depth: 7



In [6]:
# When cells are run in order, `clf` is the last tree fitted by the criterion
# loop, i.e. the criterion="entropy" one.
clf.get_depth()

7

In [7]:
# Inspect class probabilities for the first few test rows.
n_preview = 5
X_sample, y_sample = X_test[:n_preview], y_test[:n_preview]

probs = clf.predict_proba(X_sample)
preds = clf.predict(X_sample)
row_totals = probs.sum(axis=1)  # each row of probabilities should add up to 1

print("Sample true labels: ", y_sample)
print("Predicted labels:   ", preds)
print("Predicted probs (rows):")
print(probs)
print("Row sums (should be 1):", row_totals)

Sample true labels:  [2 2 0 0 1]
Predicted labels:    [2 2 0 1 1]
Predicted probs (rows):
[[0. 0. 1.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 1. 0.]]
Row sums (should be 1): [1. 1. 1. 1. 1.]


In [8]:
def _section(title):
    """Print a test heading with a blank line before and after it."""
    print()
    print(title)
    print()


def _fit_tree(X_tr, y_tr, **params):
    """Build an Arboresque DecisionTreeClassifier with ``params`` and fit it.

    Returns the fitted classifier.
    """
    model = DecisionTreeClassifier(**params)
    model.fit(X_tr, y_tr)
    return model


X, y = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

_section("Test 1: max_features variations")

for max_feat in [None, 2, 0.5, 'sqrt', 'log2']:
    clf = _fit_tree(X_train, y_train, max_features=max_feat, random_state=42)
    acc = clf.score(X_test, y_test)
    print(f"max_features={max_feat}: Accuracy={acc:.3f}, Depth={clf.get_depth()}, Leaves={clf.get_n_leaves()}")

_section("Test 2: min_samples_split & min_samples_leaf")

for min_split, min_leaf in [(2, 1), (10, 5), (20, 10), (0.1, 0.05)]:
    clf = _fit_tree(
        X_train, y_train,
        min_samples_split=min_split, min_samples_leaf=min_leaf, random_state=42,
    )
    acc = clf.score(X_test, y_test)
    print(f"min_split={min_split}, min_leaf={min_leaf}: Acc={acc:.3f}, Depth={clf.get_depth()}, Leaves={clf.get_n_leaves()}")

_section("Test 3: min_impurity_decrease")

for min_imp in [0.0, 0.01, 0.05, 0.1]:
    clf = _fit_tree(X_train, y_train, min_impurity_decrease=min_imp, random_state=42)
    acc = clf.score(X_test, y_test)
    print(f"min_impurity_decrease={min_imp}: Acc={acc:.3f}, Depth={clf.get_depth()}, Leaves={clf.get_n_leaves()}")

_section("Test 4: random_state reproducibility")

clf1 = _fit_tree(X_train, y_train, max_features=2, random_state=42)
pred1 = clf1.predict(X_test)

clf2 = _fit_tree(X_train, y_train, max_features=2, random_state=42)
pred2 = clf2.predict(X_test)

print(f"Same random_state: Predictions identical? {np.array_equal(pred1, pred2)}")

clf3 = _fit_tree(X_train, y_train, max_features=2, random_state=99)
pred3 = clf3.predict(X_test)

# A different seed is not guaranteed to change the predictions, so this line
# is informational rather than a strict check.
print(f"Different random_state: Predictions different? {not np.array_equal(pred1, pred3)}")

_section("Test 5: Categorical features")

# Append a synthetic 3-level categorical column. A seeded generator is used so
# the column (and therefore the reported accuracy) is the same on every run;
# the previous unseeded np.random.randint call was not reproducible.
cat_rng = np.random.default_rng(42)
X_cat = np.column_stack([X, cat_rng.integers(0, 3, size=len(X))])
X_train_cat, X_test_cat, y_train, y_test = train_test_split(X_cat, y, test_size=0.3, random_state=42)

clf = _fit_tree(X_train_cat, y_train, categorical_features=[4], random_state=42)
acc = clf.score(X_test_cat, y_test)
print(f"With categorical feature at index 4: Accuracy={acc:.3f}")
print(f"Features after encoding: {clf.n_features} (was {X_cat.shape[1]})")

_section("Test 6: Comparison with sklearn")

X, y = wine.data, wine.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Both implementations receive exactly the same hyper-parameters.
shared_params = dict(
    max_depth=5,
    min_samples_split=10,
    min_samples_leaf=5,
    max_features='sqrt',
    random_state=42,
)

arbor_clf = _fit_tree(X_train, y_train, **shared_params)
arbor_acc = arbor_clf.score(X_test, y_test)

sklearn_clf = SklearnDT(**shared_params)
sklearn_clf.fit(X_train, y_train)
sklearn_acc = sklearn_clf.score(X_test, y_test)

print(f"Arboresque implementation: Acc={arbor_acc:.3f}, Depth={arbor_clf.get_depth()}, Leaves={arbor_clf.get_n_leaves()}")
print(f"Sklearn: Acc={sklearn_acc:.3f}, Depth={sklearn_clf.get_depth()}, Leaves={sklearn_clf.get_n_leaves()}")
print(f"Accuracy difference: {abs(arbor_acc - sklearn_acc):.3f}")

_section("Test 7: Edge cases")

clf = _fit_tree(X_train, y_train, max_depth=1, min_samples_split=100, random_state=42)
print(f"Very restrictive tree: Depth={clf.get_depth()}, Leaves={clf.get_n_leaves()}")

clf = _fit_tree(X_train, y_train, min_samples_leaf=1, min_samples_split=2, random_state=42)
print(f"Max flexibility tree: Depth={clf.get_depth()}, Leaves={clf.get_n_leaves()}")

_section("Test 8: predict_proba")

clf = _fit_tree(X_train, y_train, max_depth=3, random_state=42)
proba = clf.predict_proba(X_test[:5])
preds = clf.predict(X_test[:5])

print("First 5 samples:")
for pred, row in zip(preds, proba):
    print(f"  Predicted class: {pred}, Probabilities: {row}")
    print(f"  Sum of probabilities: {row.sum():.3f}")

print("All tests completed.")
print()



Test 1: max_features variations



max_features=None: Accuracy=0.956, Depth=6, Leaves=10
max_features=2: Accuracy=1.000, Depth=7, Leaves=9
max_features=0.5: Accuracy=1.000, Depth=7, Leaves=9
max_features=sqrt: Accuracy=1.000, Depth=7, Leaves=9
max_features=log2: Accuracy=1.000, Depth=7, Leaves=9

Test 2: min_samples_split & min_samples_leaf

min_split=2, min_leaf=1: Acc=0.956, Depth=6, Leaves=10
min_split=10, min_leaf=5: Acc=1.000, Depth=4, Leaves=6
min_split=20, min_leaf=10: Acc=0.978, Depth=3, Leaves=5
min_split=0.1, min_leaf=0.05: Acc=1.000, Depth=4, Leaves=6

Test 3: min_impurity_decrease

min_impurity_decrease=0.0: Acc=0.956, Depth=6, Leaves=10
min_impurity_decrease=0.01: Acc=0.956, Depth=6, Leaves=8
min_impurity_decrease=0.05: Acc=0.978, Depth=2, Leaves=3
min_impurity_decrease=0.1: Acc=0.978, Depth=2, Leaves=3

Test 4: random_state reproducibility

Same random_state: Predictions identical? True
Different random_state: Predictions different? True

Test 5: Categorical features

With categorical feature at index 4: A

In [9]:
# scikit-learn baseline on the same stratified iris split used for Arboresque.
X, y = iris.data, iris.target

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0, stratify=y
)

clf = SklearnDT(random_state=0)
clf.fit(X_train, y_train)
print("Iris sklearn DecisionTreeClassifier (default gini)")
print("Train accuracy:", clf.score(X_train, y_train))
print("Test accuracy:", clf.score(X_test, y_test))

y_pred = clf.predict(X_test)
# Print heading and body separately: print(label, obj) inserts a space before
# the body, which shifted the first line of the report and of the matrix.
print("\nClassification report:\n")
print(classification_report(y_test, y_pred))
print("Confusion matrix:\n")
print(confusion_matrix(y_test, y_pred))

# NOTE(review): importing `time` here rebinds the name from the module
# (imported in the first cell) to the function. The later timing cell calls
# time() and relies on that binding, so it is kept; the earlier criterion loop
# (which calls time.time()) will fail if re-run after this cell.
from time import perf_counter, time

for crit in ["gini", "entropy"]:
    # perf_counter() has enough resolution for millisecond-scale fits;
    # time() reported 0.0 for one of these runs.
    t0 = perf_counter()
    clf = SklearnDT(criterion=crit, random_state=0)
    clf.fit(X_train, y_train)
    t1 = perf_counter()
    print(f"\nCriterion: {crit}")
    print(f"  Train accuracy: {clf.score(X_train, y_train):.3f}")
    print(f"  Test accuracy:  {clf.score(X_test, y_test):.3f}")
    print(f"  Fit time (s):   {t1 - t0:.6f}")
    print(f"  Depth: {clf.get_depth()}")


Iris sklearn DecisionTreeClassifier (default gini)
Train accuracy: 1.0
Test accuracy: 0.9777777777777777

Classification report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        15
           1       0.94      1.00      0.97        15
           2       1.00      0.93      0.97        15

    accuracy                           0.98        45
   macro avg       0.98      0.98      0.98        45
weighted avg       0.98      0.98      0.98        45

Confusion matrix:
 [[15  0  0]
 [ 0 15  0]
 [ 0  1 14]]

Criterion: gini
  Train accuracy: 1.0
  Test accuracy:  0.978
  Fit time (s):   0.003558
  Depth: 4

Criterion: entropy
  Train accuracy: 1.0
  Test accuracy:  0.956
  Fit time (s):   0.0
  Depth: 7


The cells above demonstrate the functionality of the DecisionTreeClassifier in Arboresque. The next step is to see the advantage of handling categorical variables natively. For this I used the Adult Income dataset from the UCI Machine Learning Repository, as it has a mix of categorical and numerical features.

In [10]:
from sklearn.datasets import fetch_openml

# Adult Income dataset ("adult", version 2) fetched from OpenML as pandas objects.
adult = fetch_openml("adult", version=2, as_frame=True)

# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
adult.frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   age             48842 non-null  int64   
 1   workclass       46043 non-null  category
 2   fnlwgt          48842 non-null  int64   
 3   education       48842 non-null  category
 4   education-num   48842 non-null  int64   
 5   marital-status  48842 non-null  category
 6   occupation      46033 non-null  category
 7   relationship    48842 non-null  category
 8   race            48842 non-null  category
 9   sex             48842 non-null  category
 10  capital-gain    48842 non-null  int64   
 11  capital-loss    48842 non-null  int64   
 12  hours-per-week  48842 non-null  int64   
 13  native-country  47985 non-null  category
 14  class           48842 non-null  category
dtypes: category(9), int64(6)
memory usage: 2.7 MB
None


In [11]:
# Distinct labels of the target column (the income bracket to predict).
adult.target.unique()

['<=50K', '>50K']
Categories (2, object): ['<=50K', '>50K']

In [12]:
# Preview the first rows of the feature frame (target column excluded).
adult.data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States
4,18,,103497,Some-college,10,Never-married,,Own-child,White,Female,0,0,30,United-States


In [14]:
X, y = adult.data, adult.target

# Positions of the categorical feature columns, derived from the dtypes rather
# than copied by hand from the info() printout. For the frame shown by
# adult.frame.info() this gives [1, 3, 5, 6, 7, 8, 9, 13].
cat_inds = [
    idx for idx, dtype in enumerate(X.dtypes)
    if isinstance(dtype, pd.CategoricalDtype)
]

X_np = X.to_numpy()
y_np = (y == '>50K').astype(int)  # two classes: '>50K' -> 1, '<=50K' -> 0

# Replace missing categorical values with an explicit 'Missing' level so they
# are treated as a category of their own.
for col_idx in cat_inds:
    mask = pd.isna(X_np[:, col_idx])
    X_np[mask, col_idx] = 'Missing'

X_train, X_test, y_train, y_test = train_test_split(X_np, y_np, test_size=0.3, random_state=42)
clf = DecisionTreeClassifier(
    categorical_features=cat_inds,
    max_depth=10,
    min_samples_split=50,
    random_state=42
)

clf.fit(X_train, y_train)
acc = clf.score(X_test, y_test)
print(f"Accuracy: {acc:.3f}")
print(f"Depth: {clf.get_depth()}, Leaves: {clf.get_n_leaves()}")



Accuracy: 0.865
Depth: 10, Leaves: 165


With the categorical indices marked, and the other parameters shown above, this implementation reached an accuracy of 0.865. For comparison, I then tested it without marking the variables as categorical.

In [17]:
from sklearn.preprocessing import LabelEncoder
import warnings
# NOTE(review): this silences every warning for the rest of the kernel
# session, not only for this cell.
warnings.filterwarnings('ignore')

# Hyper-parameters shared by all three models so only the treatment of the
# categorical columns differs.
tree_params = dict(max_depth=10, min_samples_split=50, random_state=42)

print("Arboresque with categorical features marked and treated as such.")
print()
clf1 = DecisionTreeClassifier(categorical_features=cat_inds, **tree_params)
clf1.fit(X_train, y_train)
acc1 = clf1.score(X_test, y_test)
print(f"Accuracy: {acc1:.3f}")
print(f"Depth: {clf1.get_depth()}, Leaves: {clf1.get_n_leaves()}")
print()

print("Scikit-learn implementation")
print()
# Work on copies so the raw string-valued splits stay available.
X_train_sklearn = X_train.copy()
X_test_sklearn = X_test.copy()

# Integer-encode each categorical column. Every encoder is fitted on the
# training split only and stored by column index (the dict was previously
# created but never filled).
label_encoders = {}
for col_idx in cat_inds:
    le = LabelEncoder()
    X_train_sklearn[:, col_idx] = le.fit_transform(X_train_sklearn[:, col_idx])
    # transform() raises ValueError for a category absent from the training split.
    X_test_sklearn[:, col_idx] = le.transform(X_test_sklearn[:, col_idx])
    label_encoders[col_idx] = le

clf2 = SklearnDT(**tree_params)
clf2.fit(X_train_sklearn, y_train)
acc2 = clf2.score(X_test_sklearn, y_test)
print(f"Accuracy: {acc2:.3f}")
print(f"Depth: {clf2.get_depth()}, Leaves: {clf2.get_n_leaves()}")
print()

print("Arboresque without handling categorical variables as such.")
print()
# Same label-encoded data as scikit-learn, with no categorical_features given.
clf3 = DecisionTreeClassifier(**tree_params)
clf3.fit(X_train_sklearn, y_train)
acc3 = clf3.score(X_test_sklearn, y_test)
print(f"Accuracy: {acc3:.3f}")
print(f"Depth: {clf3.get_depth()}, Leaves: {clf3.get_n_leaves()}")

Arboresque with categorical features marked and treated as such.

Accuracy: 0.865
Depth: 10, Leaves: 165

Scikit-learn implementation

Accuracy: 0.861
Depth: 10, Leaves: 178

Arboresque without handling categorical variables as such.

Accuracy: 0.861
Depth: 10, Leaves: 178


In [21]:
# Imported locally so this cell does not depend on an earlier cell having
# rebound the name `time` to the time() function.
from time import perf_counter

# Settings shared by both runs; only the categorical handling differs.
timed_params = dict(
    max_depth=10,
    min_samples_split=50,
    random_state=42,
    max_features=0.8,
)

print("Arboresque with categorical features marked and treated as such.")
print()
# The measured interval covers both fitting and scoring on the test split.
b1 = perf_counter()
clf1 = DecisionTreeClassifier(categorical_features=cat_inds, **timed_params)
clf1.fit(X_train, y_train)
acc1 = clf1.score(X_test, y_test)
e1 = perf_counter()
print(f"Accuracy: {acc1:.3f}")
print(f"Depth: {clf1.get_depth()}, Leaves: {clf1.get_n_leaves()}")
print(f"Time: {e1-b1}")
print()

print("Arboresque without handling categorical variables as such.")
print()
# Uses the label-encoded copies of the splits built in the comparison cell.
b2 = perf_counter()
clf2 = DecisionTreeClassifier(**timed_params)
clf2.fit(X_train_sklearn, y_train)
acc2 = clf2.score(X_test_sklearn, y_test)
e2 = perf_counter()
print(f"Accuracy: {acc2:.3f}")
print(f"Depth: {clf2.get_depth()}, Leaves: {clf2.get_n_leaves()}")
print(f"Time: {e2-b2}")

Arboresque with categorical features marked and treated as such.

Accuracy: 0.865
Depth: 10, Leaves: 174
Time: 177.29684400558472

Arboresque without handling categorical variables as such.

Accuracy: 0.860
Depth: 10, Leaves: 201
Time: 150.39629793167114
