In [1]:
!pip install datasets

%load_ext autoreload
%autoreload 2



In [2]:
from datasets import load_dataset
from sklearn.model_selection import train_test_split

# Resolve the "income" configuration of the Adult dataset once and reuse it for
# both splits, instead of calling load_dataset twice with identical arguments.
adult_dataset = load_dataset("mstz/adult", "income")
adult_dataset_train = adult_dataset["train"].to_pandas()
adult_dataset_test = adult_dataset["test"].to_pandas()

# Preview the first rows of the training split.
adult_dataset_train.head()

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,age,capital_gain,capital_loss,education,final_weight,hours_worked_per_week,marital_status,native_country,occupation,race,relationship,is_male,workclass,over_threshold
0,43,0.0,0.0,10,34278,35,Married-civ-spouse,United-States,Sales,White,Husband,True,Private,0
1,23,0.0,0.0,3,244698,35,Never-married,Mexico,Farming-fishing,White,Other-relative,True,Private,0
2,39,0.0,0.0,10,118286,40,Married-civ-spouse,United-States,Sales,Black,Husband,True,Private,0
3,35,0.0,0.0,9,126675,46,Divorced,?,Craft-repair,White,Not-in-family,True,Private,0
4,66,0.0,0.0,13,28367,99,Married-civ-spouse,United-States,Priv-house-serv,White,Other-relative,True,Private,0


In [3]:
# Preprocessing
# Keep only numeric columns: categorical features are currently not directly
# handled by most tree induction algorithms.
adult_dataset_train = adult_dataset_train.select_dtypes(include="number")
adult_dataset_test = adult_dataset_test.select_dtypes(include="number")
target_feature = "over_threshold"
# Fixed seed so the train/validation split is identical on every run.
split_seed = 42

# Select the label column by name instead of by position, so the
# features/labels separation does not depend on the target being the last column.
train_features = adult_dataset_train.drop(columns=[target_feature]).values
train_labels = adult_dataset_train[target_feature].values.astype(int)
# Hold out 20% of the training data for validation, stratified on the labels.
train_features, validation_features, train_labels, validation_labels = train_test_split(
    train_features, train_labels, test_size=.2, stratify=train_labels, random_state=split_seed)
test_features = adult_dataset_test.drop(columns=[target_feature]).values
test_labels = adult_dataset_test[target_feature].values.astype(int)

## Tree induction

In [4]:
from ohmt.trees.multivariate.omnivariate import OmnivariateDT
from ohmt.trees.splits.evaluation import gini

# Fit an omnivariate decision tree on the training split, using Gini as the node
# fitness function. The value returned by `fit` is rebound to `tree`.
# (The dataset is not reloaded here: the features and labels prepared in the
# preprocessing cell are the only inputs used.)
tree = OmnivariateDT()
tree = tree.fit(train_features, train_labels, max_depth=8, min_eps=0.000000000000001, min_samples=10,
                node_fitness_function=gini)



Decision trees expose several fields for directly accessing the tree structure and parameters:

In [5]:
# nodes inside the tree: dictionary keyed by node id, with nodes named in a breadth-first manner
# In the displayed output, some entries show a hyperplane (coefficients and bound) while
# others show a two-element list (presumably per-class probabilities at leaves; verify).
tree.nodes

{1: coefficients: [ 0. -1.  0.  0.  0.  0.]
 bound: -5119.0),
 2: coefficients: [ 0. -1.  0.  0.  0.  0.]
 bound: -7073.5),
 4: coefficients: [-1.  0.  0.  0.  0.  0.]
 bound: -20.5),
 8: coefficients: [ 0.  0.  0.  0. -1.  0.]
 bound: -25206.0),
 16: coefficients: [-1.  0.  0.  0.  0.  0.]
 bound: -86.5),
 32: [0.3333333333333333, 0.6666666666666666],
 33: coefficients: [ 0.  0.  0.  0.  0. -1.]
 bound: -15.5),
 66: coefficients: [ 0. -1.  0.  0.  0.  0.]
 bound: -7565.5),
 132: [0.00202020202020202, 0.997979797979798],
 133: [0.997979797979798, 0.00202020202020202],
 67: coefficients: [-1.47006705e-03  6.72471409e-05  0.00000000e+00  1.97983156e-01
   1.71258786e-05 -3.10745072e-01]
 bound: [0.]),
 134: [1.0, 0.0],
 135: [0.0, 1.0],
 17: coefficients: [ 0.          0.00234971  0.          0.56637576 -0.00106625  0.        ]
 bound: [0.]),
 34: [1.0, 0.0],
 35: [0.0, 1.0],
 9: [0.7142857142857143, 0.2857142857142857],
 5: [0.3712574850299401, 0.6287425149700598],
 3: coefficients: [ 0

In [6]:
# access parameters of internal nodes: here, the hyperplane (coefficients and bound) of node 2
print(tree.nodes[2].hyperplane)

Hyperplane
	coefficients: [0.0, -1.0, 0.0, 0.0, 0.0, 0.0]
	bound: -7073.5


In [7]:
# access tree structure, e.g., descendants
# `tree.descendants` and `tree.ancestors` are both indexed by node id; node 2 is used as the example
print(f"descendants: {tree.descendants[2]} and ancestors: {tree.ancestors[2]}")  

descendants: [4, 5, 8, 9, 16, 17, 32, 33, 34, 35, 66, 67, 132, 133, 134, 135] and ancestors: [1]


## Validation

In [8]:
from sklearn.metrics import classification_report


# Predict with the fitted tree on each of the three splits.
predicted_train_labels = tree.predict(train_features)
predicted_validation_labels = tree.predict(validation_features)
predicted_test_labels = tree.predict(test_features)

# Per-class precision/recall/F1 for each split.
train_report = classification_report(train_labels, predicted_train_labels)
validation_report = classification_report(validation_labels, predicted_validation_labels)
test_report = classification_report(test_labels, predicted_test_labels)

# Label each report with the split it was computed on (the validation report
# was previously printed under the "Train report" heading).
print(f"Train report: {train_report}")
print(f"Validation report: {validation_report}")
print(f"Test report: {test_report}")

Train report:               precision    recall  f1-score   support

           0       0.76      0.82      0.79     22292
           1       0.22      0.16      0.19      7012

    accuracy                           0.66     29304
   macro avg       0.49      0.49      0.49     29304
weighted avg       0.63      0.66      0.64     29304

Train report:               precision    recall  f1-score   support

           0       0.75      0.82      0.78      5574
           1       0.20      0.15      0.17      1753

    accuracy                           0.66      7327
   macro avg       0.48      0.48      0.48      7327
weighted avg       0.62      0.66      0.64      7327

Test report:               precision    recall  f1-score   support

           0       0.75      0.81      0.78      9289
           1       0.21      0.15      0.18      2922

    accuracy                           0.66     12211
   macro avg       0.48      0.48      0.48     12211
weighted avg       0.62      0.66

## Pruning
Pruning the tree post-training: pruning generates a **new** tree!

In [9]:
from ohmt.pruning import DepthGardener

# Prune the fitted tree at increasing maximum depths and collect, for every
# depth, the (train, validation, test) classification reports.
pruner = DepthGardener()
reports_per_pruning = []
# NOTE(review): the upper bound stops two levels short of the deepest node
# depth recorded in `tree.depth`; confirm that skipping the last depth is intended.
for d in range(2, max(tree.depth.values()) - 1):
    print(f"Pruning at maximum depth {d}")
    # `prune` returns a new tree; the original `tree` is reused on every iteration.
    pruned_tree = pruner.prune(tree, max_depth=d)

    # Train split
    predicted_train_labels = pruned_tree.predict(train_features)
    train_report = classification_report(train_labels, predicted_train_labels)
    # Validation split
    predicted_validation_labels = pruned_tree.predict(validation_features)
    validation_report = classification_report(validation_labels, predicted_validation_labels)
    # Test split
    predicted_test_labels = pruned_tree.predict(test_features)
    test_report = classification_report(test_labels, predicted_test_labels)

    reports_per_pruning.append((train_report, validation_report, test_report))

Pruning at maximum depth 2
Pruning at maximum depth 3
Pruning at maximum depth 4
Pruning at maximum depth 5
Pruning at maximum depth 6


In [10]:
import numpy
from ohmt.pruning import GreedyBottomUpGardener

# Prune the fitted tree with the greedy bottom-up gardener, passing it the
# validation split, the two class labels, and Gini as node fitness function.
pruner = GreedyBottomUpGardener()
pruned_tree = pruner.prune(
    tree,
    validation_data=validation_features,
    validation_labels=validation_labels,
    classes=numpy.array([0, 1]),
    node_fitness_function=gini,
)

# Evaluate the pruned tree on each split: predict, then build the report.
predicted_train_labels = pruned_tree.predict(train_features)
train_report = classification_report(train_labels, predicted_train_labels)

predicted_validation_labels = pruned_tree.predict(validation_features)
validation_report = classification_report(validation_labels, predicted_validation_labels)

predicted_test_labels = pruned_tree.predict(test_features)
test_report = classification_report(test_labels, predicted_test_labels)



In [11]:
# show the test-set classification report most recently stored in `test_report`
print(test_report)

              precision    recall  f1-score   support

           0       0.80      1.00      0.88      9289
           1       0.95      0.19      0.31      2922

    accuracy                           0.80     12211
   macro avg       0.87      0.59      0.60     12211
weighted avg       0.83      0.80      0.75     12211

