# Running Parallel OSDT

In [None]:
# third-party imports
from time import time

# local imports
from lib.models.parallel_osdt_classifier import ParallelOSDTClassifier
from lib.data_structures.dataset import read_dataframe

# Using COMPAS as an example
dataset = read_dataframe('data/preprocessed/census.csv') 
(n, m) = dataset.shape
X = dataset.values[:n,:m-1]
y = dataset.values[:n,-1]

hyperparameters = {
    'regularization': 0.1, # Regularization coefficient which effects the penalty on model complexity

    'max_depth': float('Inf'), # User-specified limit on the model
    'max_time': float('Inf'), # User-specified limit on the runtime 

    'workers': 1, # Parameter that varies based on how much computational resource is available

    'visualize': True, # Toggle whether a rule-list visualization is rendered
    'verbose': False, # Toggle whether event messages are printed
    'log': False, # Toggle whether client processes log to logs/work_<id>.log files
    'profile': False, # Toggle Snapshots for Profiling Memory Usage
    
    'configuration': { # More configurations around toggling optimizations and prioritization options
        'priority_metric': 'uniform', # Decides how tasks are prioritized
        'deprioritization': 0.01, # Decides how much to push back a task if it has pending dependencies

        # Note that Leaf Permutation Bound (Theorem 6) is 
        # Toggles the assumption about objective independence when composing subtrees (Theorem 1)
        # Disabling this actually breaks convergence due to information loss
        'hierarchical_lowerbound': True, 
        # Toggles whether problems are pruned based on insufficient accuracy (compared to other results) (Lemma 2)
        'look_ahead': True,
        # Toggles whether a split is avoided based on insufficient support (proxy for accuracy gain) (Theorem 3)
        'support_lowerbound': True,
        # Toggles whether a split is avoided based on insufficient potential accuracy gain (Theorem 4)
        'incremental_accuracy_lowerbound': True,
        # Toggles whether a problem is pruned based on insufficient accuracy (in general) (Theorem 5)
        'accuracy_lowerbound': True,
        # Toggles whether problem equivalence is based solely on the capture set (Similar to Corollary 6)
        'capture_equivalence': True,
        # Hamming distance used to propagate bounding information of similar problems (Theorem 7 + some more...)
        "similarity_threshold": 0,
        # Toggles whether equivalent points contribute to the lowerbound (Proposition 8 and Theorem 9)
        'equivalent_point_lowerbound': True,

        # Toggles compression of dataset based on equivalent point aggregation
        'equivalent_point_compression': False,
        # Toggles whether asynchronous tasks can be cancelled after being issued
        'task_cancellation': True,
        # Toggles whether look_ahead prunes using objective upperbounds (This builds on top of look_ahead)
        'interval_look_ahead': True,
        # Cooldown timer (seconds) on synchornization operations
        'synchronization_cooldown': 0.01,
        # Cache Limit
        'cache_limit': float('Inf')
    }
}

start = time()
model = ParallelOSDTClassifier(**hyperparameters)
model.fit(X, y)
prediction = model.predict(X)
prediction = prediction.reshape(1, n)
print('Runtime: {} Seconds'.format(time() - start))
print('Prediction: \n{}'.format(prediction))
print('Training Accuracy: {}'.format(model.score(X, y)))
print('Visualization: \n{}'.format(model.model.visualization))

Row Compression Factor: 1.504


# Running Sequential OSDT

In [8]:
# All dependencies of this notebook

# third-party imports
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier

# local imports
from lib.models.osdt_classifier import OSDTClassifier
from lib.experiments.analysis import train_cross_validate
from lib.data_structures.dataset import read_dataframe

# Using COMPAS as an example
dataset = read_dataframe('data/preprocessed/census.csv') 
(n, m) = dataset.shape
X = dataset.values[:,:-1]
y = dataset.values[:,-1]

hyperparameters = {
    'regularization': 0.1, # Regularization coefficient which effects the penalty on model complexity
    'max_depth': float('Inf'), # User-specified limit on the model
    'max_time': float('Inf'), # User-specified limit on the runtime 
    
    'configuration': { # More configurations around toggling optimizations and prioritization options
        'priority_metric': 'curiosity',
        'look_ahead': True,
        'support_lowerbound': True,
        'incremental_accuracy_lowerbound': True,
        'accuracy_lowerbound': True,
        'equivalent_point_lowerbound': True,
    }
}

model = OSDTClassifier(**hyperparameters)
model.fit(X, y)
prediction = model.predict(X)
prediction = prediction.reshape(1, n)
print('Runtime: {} Seconds'.format(time() - start))
print('Prediction: \n{}'.format(prediction))
print('Training Accuracy: {}'.format(model.score(X, y)))

nrule: 331
ndata: 2500
gr: [2.34867144e-02 2.10133877e-03 3.25128590e-05 3.48267745e-03
 2.35034402e-03 4.60710225e-03 6.71334445e-02 4.75370524e-02
 1.23818180e-02 1.48757206e-02 1.83997212e-02 2.19774440e-02
 1.27934857e-03 1.84304866e-03 2.34867144e-02 3.69941921e-03
 3.20981155e-03 6.48703954e-04 3.12005194e-03 3.67152148e-03
 2.80167030e-03 3.09330444e-03 1.54061248e-03 2.68882733e-03
 9.93566346e-03 9.52988220e-03 2.16075381e-03 2.71545243e-04
 4.91306064e-06 3.09330444e-03 1.83746938e-04 4.35665932e-05
 8.18568411e-04 2.96152799e-04 1.32943974e-04 1.42407573e-04
 1.51414184e-04 9.44496191e-05 4.65667488e-04 4.44274023e-04
 0.00000000e+00 9.55634029e-07 0.00000000e+00 4.07815385e-05
 3.10571366e-04 4.21871282e-04 5.41274704e-05 1.71317353e-03
 4.49528142e-04 5.05269005e-04 1.36920602e-03 3.24949052e-04
 6.50158271e-04 0.00000000e+00 1.06227816e-04 3.16276748e-04
 4.28724906e-04 3.68640025e-03 2.21870127e-04 1.30135846e-03
 3.69625405e-03 5.91234027e-04 1.20367762e-03 1.72691404e-

>>> log: False
>>> support bound: True
>>> accu_support: True
>>> accurate support bound: True
>>> equiv points bound: True
>>> lookahead bound: True
prior_metric= curiosity
COUNT_UNIQLEAVES: 15516
COUNT_LEAFLOOKUPS: 146
total time:  5.536438703536987
lambda:  0.1
leaves:  [()]
num_captured:  [2500]
num_captured_incorrect:  [907]
prediction:  [1]
Objective:  0.4628
Accuracy:  0.6372
COUNT of the best tree:  0
time when the best tree is achieved:  1563743936.418797
TOTAL COUNT:  3299
best_is_cart False
Runtime: 37.541176080703735 Seconds
Prediction: 
[[1 1 1 ... 1 1 1]]
Training Accuracy: 0.6372


# Sample Experiment