In [62]:
import sys
from pathlib import Path
# Put the parent of the current working directory at the front of the import
# path before `arboresque` is imported below.
# NOTE(review): presumably the local `arboresque` package lives in that parent
# directory — confirm. Path("..") is resolved against the working directory,
# so this depends on where the kernel was started.
project_root = Path("..").resolve()
sys.path.insert(0, str(project_root))

import numpy as np
import time
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

from arboresque import DecisionTreeClassifier

# Load the iris dataset bundle; later cells read `iris.data` and `iris.target`.
iris = datasets.load_iris()

The Iris dataset contains 150 samples, each described by four features (sepal length, sepal width, petal length and petal width), drawn from three iris species: Setosa, Versicolor and Virginica.

In [63]:
# Spot-check a few rows: print each feature vector next to its class label.
sample_indices = [12, 13, 86, 140, 75]
for i in sample_indices:
    features, label = iris.data[i], iris.target[i]
    print(features, label)

[4.8 3.  1.4 0.1] 0
[4.3 3.  1.1 0.1] 0
[6.7 3.1 4.7 1.5] 1
[6.7 3.1 5.6 2.4] 2
[6.6 3.  4.4 1.4] 1


In [None]:
# Feature matrix and class labels from the loaded dataset.
X = iris.data
y = iris.target

# Hold out 30% of the samples for testing; the split is stratified on the
# labels and uses a fixed seed so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=0,
)

# Fit a tree with the library defaults (default criterion="gini").
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)

# Report accuracy on both partitions.
print("Iris classifier")
for label, features, targets in (
    ("Train accuracy:", X_train, y_train),
    ("Test accuracy:", X_test, y_test),
):
    print(label, clf.score(features, targets))

Iris classifier
Train accuracy: 1.0
Test accuracy: 0.9111111111111111


In [65]:
# Predict the held-out samples once, then print each metric under its heading.
y_pred = clf.predict(X_test)

for heading, metric in (
    ("Classification report:\n", classification_report),
    ("Confusion matrix:\n", confusion_matrix),
):
    print(heading)
    print(metric(y_test, y_pred))

Classification report:

              precision    recall  f1-score   support

           0       1.00      0.87      0.93        15
           1       0.79      1.00      0.88        15
           2       1.00      0.87      0.93        15

    accuracy                           0.91        45
   macro avg       0.93      0.91      0.91        45
weighted avg       0.93      0.91      0.91        45

Confusion matrix:

[[13  2  0]
 [ 0 15  0]
 [ 0  2 13]]


In [66]:
# Compare split criteria: fit one tree per criterion and report train/test
# accuracy, fit time and tree depth.
criterions = ["gini", "entropy"]

# One summary record per criterion, kept so the numbers can be inspected or
# tabulated later (e.g. pd.DataFrame(results)).
results = []
for crit in criterions:
    # perf_counter() is a high-resolution clock meant for measuring short
    # intervals; time.time() can be too coarse for fits this fast.
    start = time.perf_counter()
    # NOTE: `clf` is rebound on every iteration, so after the loop it refers to
    # the model fitted with the last entry of `criterions`.
    clf = DecisionTreeClassifier(criterion=crit)
    clf.fit(X_train, y_train)
    end = time.perf_counter()
    tm = end - start

    train_acc = clf.score(X_train, y_train)
    test_acc = clf.score(X_test, y_test)
    depth = clf.get_depth()

    results.append(
        {
            "criterion": crit,
            "train_acc": train_acc,
            "test_acc": test_acc,
            "fit_time_s": tm,
            "depth": depth,
        }
    )

    print(f"Criterion: {crit}")
    print(f"    Train accuracy: {train_acc:.3f}")
    print(f"    Test accuracy:  {test_acc:.3f}")
    print(f"    Time: {tm:.3f}")
    print(f"    Depth: {depth}")
    print()

Criterion: gini
    Train accuracy: 1.000
    Test accuracy:  0.911
    Time: 0.043
    Depth: 5

Criterion: entropy
    Train accuracy: 1.000
    Test accuracy:  0.844
    Time: 0.024
    Depth: 8



In [67]:
# Depth of the most recently fitted tree. `clf` was last rebound inside the
# criterion loop of the previous cell, so this is the "entropy" model.
clf.get_depth()

8

In [68]:
# Take the first few held-out samples and compare the hard predictions with
# the per-class probabilities returned for the same rows.
n_preview = 5
X_sample, y_sample = X_test[:n_preview], y_test[:n_preview]

probs = clf.predict_proba(X_sample)
preds = clf.predict(X_sample)

# Each probability row is summed as a sanity check; the printed sums are
# expected to equal 1.
row_sums = probs.sum(axis=1)

print("Sample true labels: ", y_sample)
print("Predicted labels:   ", preds)
print("Predicted probs (rows):")
print(probs)
print("Row sums (should be 1):", row_sums)

Sample true labels:  [2 2 0 0 1]
Predicted labels:    [2 2 0 1 1]
Predicted probs (rows):
[[0. 0. 1.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 1. 0.]]
Row sums (should be 1): [1. 1. 1. 1. 1.]


In [69]:
# Baseline: fit scikit-learn's decision tree on the same train/test split so
# its numbers can be compared with the arboresque results above.
#
# The imports below deliberately avoid clobbering notebook-level names:
# - the sklearn class is aliased, so `DecisionTreeClassifier` keeps referring
#   to the arboresque implementation when earlier cells are re-run;
# - `perf_counter` is imported instead of `time`, so the `time` module used by
#   the earlier timing cell is not replaced by a function. perf_counter() is
#   also a higher-resolution clock, which matters for fits this fast.
from time import perf_counter

from sklearn.tree import DecisionTreeClassifier as SkDecisionTreeClassifier

# X_train / X_test / y_train / y_test are reused from the split cell above
# (test_size=0.3, random_state=0, stratified), so both libraries are evaluated
# on exactly the same samples.
# NOTE: `clf` and `y_pred` are rebound to the sklearn model and its
# predictions from here on.
clf = SkDecisionTreeClassifier(random_state=0)
clf.fit(X_train, y_train)
print("Iris sklearn DecisionTreeClassifier (default gini)")
print("Train accuracy:", clf.score(X_train, y_train))
print("Test accuracy:", clf.score(X_test, y_test))

y_pred = clf.predict(X_test)
print("\nClassification report:\n", classification_report(y_test, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

# Same criterion comparison as for the arboresque tree.
for crit in ["gini", "entropy"]:
    t0 = perf_counter()
    clf = SkDecisionTreeClassifier(criterion=crit, random_state=0)
    clf.fit(X_train, y_train)
    t1 = perf_counter()
    print(f"\nCriterion: {crit}")
    print("  Train accuracy:", round(clf.score(X_train, y_train), 3))
    print("  Test accuracy: ", round(clf.score(X_test, y_test), 3))
    print("  Fit time (s):  ", round(t1 - t0, 6))
    print(f"  Depth: {clf.get_depth()}")


Iris sklearn DecisionTreeClassifier (default gini)
Train accuracy: 1.0
Test accuracy: 0.9777777777777777

Classification report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        15
           1       0.94      1.00      0.97        15
           2       1.00      0.93      0.97        15

    accuracy                           0.98        45
   macro avg       0.98      0.98      0.98        45
weighted avg       0.98      0.98      0.98        45

Confusion matrix:
 [[15  0  0]
 [ 0 15  0]
 [ 0  1 14]]

Criterion: gini
  Train accuracy: 1.0
  Test accuracy:  0.978
  Fit time (s):   0.0
  Depth: 4

Criterion: entropy
  Train accuracy: 1.0
  Test accuracy:  0.956
  Fit time (s):   0.0
  Depth: 7
