# Running Parallel OSDT

In [1]:
# standard-library imports
from time import time

# local imports
from lib.models.parallel_osdt_classifier import ParallelOSDTClassifier
from lib.data_structures.dataset import read_dataframe

# Load the COMPAS example dataset (preprocessed binary CSV).
dataset = read_dataframe('data/preprocessed/compas-binary.csv')
n, m = dataset.shape  # n rows, m columns
print(n, m)

# All columns except the last are the model inputs; the last column is the target.
X, y = dataset.values[:, :-1], dataset.values[:, -1]

# Keyword arguments for the classifier constructor (unpacked with ** when the model is built).
hyperparameters = dict(
    # Regularization coefficient, which affects the penalty on model complexity
    regularization=0.005,

    # User-specified limit on the model
    max_depth=float('Inf'),
    # User-specified limit on the runtime
    max_time=float('Inf'),

    # Varies with how much computational resource is available
    workers=1,

    # Whether a rule-list visualization is rendered
    visualize_model=True,
    # Whether a dependency graph is streamed at runtime
    visualize_training=False,
    # Whether event messages are printed
    verbose=False,
    # Whether client processes log to logs/work_<id>.log files
    log=False,
    # Whether snapshots are taken for profiling memory usage
    profile=False,

    # Further options toggling individual optimizations and prioritization behaviour
    configuration=dict(
        # Decides how tasks are prioritized
        priority_metric='uniform',
        # Decides how much to push back a task if it has pending dependencies
        deprioritization=0.01,

        # NOTE(review): the original note about the Leaf Permutation Bound (Theorem 6)
        # was cut off mid-sentence; no toggle for it appears in this configuration.
        # Assumption about objective independence when composing subtrees (Theorem 1).
        # Disabling this actually breaks convergence due to information loss.
        hierarchical_lowerbound=True,
        # Prune problems based on insufficient accuracy compared to other results (Lemma 2)
        look_ahead=True,
        # Avoid a split based on insufficient support, a proxy for accuracy gain (Theorem 3)
        support_lowerbound=True,
        # Avoid a split based on insufficient potential accuracy gain (Theorem 4)
        incremental_accuracy_lowerbound=True,
        # Prune a problem based on insufficient accuracy in general (Theorem 5)
        accuracy_lowerbound=True,
        # Base problem equivalence solely on the capture set (similar to Corollary 6)
        capture_equivalence=True,
        # Hamming distance used to propagate bounding information between similar
        # problems (Theorem 7 + some more...)
        similarity_threshold=5,
        # Let equivalent points contribute to the lower bound (Proposition 8 and Theorem 9)
        equivalent_point_lowerbound=True,

        # Compress the dataset based on equivalent point aggregation
        equivalent_point_compression=True,
        # Allow asynchronous tasks to be cancelled after being issued
        task_cancellation=True,
        # Let look_ahead prune using objective upper bounds (builds on top of look_ahead)
        interval_look_ahead=True,
        # Cooldown timer (seconds) on synchronization operations
        synchronization_cooldown=0.01,
        # Cache limit
        cache_limit=float('Inf'),
    ),
)

# distance = 75
# none = 4581
# low = 4581
# medium = 4581
# high = 4577

# Time the whole fit-and-predict pass; the reported runtime covers both steps.
start = time()
model = ParallelOSDTClassifier(**hyperparameters)
model.fit(X, y)

# Predict on the training inputs and lay the result out as a single row of n values.
prediction = model.predict(X).reshape(1, n)

# Report elapsed time, the predictions, the training-set score and the model visualization.
print('Runtime: {} Seconds'.format(time() - start))
print('Prediction: \n{}'.format(prediction))
print('Training Accuracy: {}'.format(model.score(X, y)))
print('Visualization: \n{}'.format(model.model.visualization))

6907 13
tables = 5041469194453614854144
Initializing Similarity Index


Process Process-2:
  File "/Users/Jimmy/Dropbox/NSS/repo/lib/parallel/actor.py", line 73, in __run__
    result = task(actor_id, services)
Traceback (most recent call last):
  File "/usr/local/Cellar/python3/3.6.4_2/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/local/Cellar/python3/3.6.4_2/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/Jimmy/Dropbox/NSS/repo/lib/models/parallel_osdt.py", line 231, in task
    result = self.find_or_create_result(task.capture, task.path)
  File "/Users/Jimmy/Dropbox/NSS/repo/lib/models/parallel_osdt.py", line 313, in find_or_create_result
    self.update(capture, path, result)
  File "/Users/Jimmy/Dropbox/NSS/repo/lib/models/parallel_osdt.py", line 528, in update
    self.put(capture, path, result)
  File "/Users/Jimmy/Dropbox/NSS/repo/lib/models/paralle

KeyboardInterrupt: 

# Running Sequential OSDT

In [None]:
# All dependencies of this notebook

# third-party imports
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier

# local imports
from lib.models.osdt_classifier import OSDTClassifier
from lib.experiments.analysis import train_cross_validate
from lib.data_structures.dataset import read_dataframe

# Load the example dataset.
# NOTE(review): the original comment says COMPAS, but the path below is
# 'data/compression/1.csv' - confirm which dataset this file actually contains.
dataset = read_dataframe('data/compression/1.csv')
n, m = dataset.shape  # n rows, m columns

# All columns except the last are the model inputs; the last column is the target.
X, y = dataset.values[:, :-1], dataset.values[:, -1]

# Keyword arguments for the classifier constructor (unpacked with ** when the model is built).
hyperparameters = dict(
    # Regularization coefficient, which affects the penalty on model complexity
    regularization=0.005,
    # User-specified limit on the model
    max_depth=float('Inf'),
    # User-specified limit on the runtime
    max_time=float('Inf'),

    # Further options toggling individual optimizations and prioritization behaviour
    configuration=dict(
        priority_metric='curiosity',
        look_ahead=True,
        support_lowerbound=True,
        incremental_accuracy_lowerbound=True,
        accuracy_lowerbound=True,
        equivalent_point_lowerbound=True,
    ),
)

# `time` is needed for the runtime measurement below but is not among this cell's
# imports; import it here so the cell also runs in a fresh kernel instead of
# relying on the "Running Parallel OSDT" cell having been executed first.
from time import time

# Time the whole fit-and-predict pass; the reported runtime covers both steps.
start = time()
model = OSDTClassifier(**hyperparameters)
model.fit(X, y)

# Predict on the training inputs and lay the result out as a single row of n values.
prediction = model.predict(X)
prediction = prediction.reshape(1, n)

# Report elapsed time, the predictions and the training-set score.
print('Runtime: {} Seconds'.format(time() - start))
print('Prediction: \n{}'.format(prediction))
print('Training Accuracy: {}'.format(model.score(X, y)))

# Sample Experiment