In [35]:
from menelaus.drift_detector import BatchDetector
from menelaus.data_drift import CDBD, HDDDM, KdqTreeBatch
from menelaus.datasets import make_example_batch_data
import pandas as pd
import numpy as np

In [83]:
def _print_detectors(x):
    """Placeholder evaluation scheme: print whatever it is given.

    Returns None, so an ensemble using this scheme ends up with a
    ``drift_state`` of None after every evaluation.
    """
    print(x)


# Registry of evaluation schemes, looked up by name when an Ensembler is built.
evaluators = {'eval_scheme_1': _print_detectors}

class Ensembler():
    """Fan input data out to several drift detectors and combine their results.

    Detectors are stored under string keys. An optional ``constraints``
    mapping, keyed the same way, selects the part of the input each detector
    receives (e.g. a list of DataFrame columns). After every ``update`` the
    chosen evaluator is called with the detectors and its return value is
    stored in ``drift_state``.
    """

    def __init__(self, detectors: dict, evaluator: str, constraints: dict = None):
        """Store the detectors, resolve the evaluator, and record the constraints.

        Args:
            detectors: mapping of key -> detector object. The mapping is
                shallow-copied; the detector objects themselves are shared
                with the caller.
            evaluator: key into the module-level ``evaluators`` registry.
            constraints: optional mapping of detector key -> selector that is
                applied as ``data[selector]``. Detectors without an entry
                receive the full data.

        Raises:
            KeyError: if ``evaluator`` is not a key of ``evaluators``.
        """
        self.detectors = detectors.copy()
        self.evaluator = evaluators[evaluator]
        self.constraints = constraints
        # Defined up front so that reading it before the first update()
        # does not raise AttributeError.
        self.drift_state = None

    def set_reference(self, X, y_true=None, y_pred=None):
        """Pass the (constrained) reference data to every detector.

        Args:
            X: reference data; narrowed per detector via ``constrain``.
            y_true: forwarded unchanged to each detector.
            y_pred: forwarded unchanged to each detector.
        """
        for det_key, detector in self.detectors.items():
            # Keep X itself untouched: rebinding it here would hand the
            # already-narrowed data to the next detector in the loop.
            X_constrained = self.constrain(X, det_key)
            detector.set_reference(X=X_constrained, y_true=y_true, y_pred=y_pred)

    def update(self, X, y_true=None, y_pred=None):
        """Pass the (constrained) batch to every detector, then re-evaluate.

        Args:
            X: new data; narrowed per detector via ``constrain``.
            y_true: forwarded unchanged to each detector.
            y_pred: forwarded unchanged to each detector.
        """
        for det_key, detector in self.detectors.items():
            # Keep X itself untouched: rebinding it here would hand the
            # already-narrowed data to the next detector in the loop.
            X_constrained = self.constrain(X, det_key)
            detector.update(X=X_constrained, y_true=y_true, y_pred=y_pred)
        self.evaluate()

    def constrain(self, data, det_key: str):
        """Return an independent copy of the part of ``data`` meant for ``det_key``.

        Args:
            data: object supporting ``data[selector]`` and ``.copy()``.
            det_key: key of the detector the data is being prepared for.

        Returns:
            ``data[selector].copy()`` when a constraint is registered for
            ``det_key``; otherwise ``data.copy()``.
        """
        # TODO - can y_true, y_pred be supported in this pattern?
        # TODO - this allows for list manipulation of PD columns
        #           will need to think about cases where numpy arrays
        #           are mixed in
        constraint = None
        if self.constraints:
            # .get(): a detector with no constraint entry sees the full data
            # instead of triggering a KeyError.
            constraint = self.constraints.get(det_key)
        if constraint is None:
            return data.copy()
        # Copy the selection so the caller's data is not tied to whatever
        # a detector later does with its input.
        return data[constraint].copy()

    def evaluate(self):
        """Run the evaluator over the detectors and store the result in ``drift_state``."""
        self.drift_state = self.evaluator(self.detectors.values())

    def reset(self):
        """Call ``reset()`` on every detector."""
        for detector in self.detectors.values():
            detector.reset()


class BatchEnsembler(BatchDetector, Ensembler):
    """Ensemble of detectors that also exposes the ``BatchDetector`` interface.

    Both parent classes are invoked explicitly by name rather than through
    ``super()``, so the call order in each method is exactly what is written.
    """

    def __init__(self, detectors: dict, evaluator: str, constraints: dict = None):
        """Initialise the ``BatchDetector`` part first, then the ``Ensembler`` part."""
        BatchDetector.__init__(self)
        Ensembler.__init__(
            self,
            detectors,
            evaluator,
            constraints,
        )

    def update(self, X, y_true=None, y_pred=None):
        """Run ``Ensembler.update`` and then ``BatchDetector.update`` with the same arguments."""
        for parent in (Ensembler, BatchDetector):
            parent.update(self, X=X, y_true=y_true, y_pred=y_pred)

    def reset(self):
        """Run ``Ensembler.reset`` and then ``BatchDetector.reset``."""
        for parent in (Ensembler, BatchDetector):
            parent.reset(self)

    def set_reference(self, X, y_true=None, y_pred=None):
        """Delegate to ``Ensembler.set_reference``; ``BatchDetector.set_reference`` is not called."""
        Ensembler.set_reference(self, X=X, y_true=y_true, y_pred=y_pred)

In [74]:
# Example data from menelaus; its `year` column is used to split it into batches.
data = make_example_batch_data()

# 2007 is kept aside as its own frame; 2008 and 2009 form the test batches.
first_batch = pd.DataFrame(data[data.year == 2007])
# Vectorised membership test instead of a row-wise apply with a redundant ternary.
test_batches = data[data.year.isin([2008, 2009])]

# One detector per key; the same keys index `constraints` below.
detectors = {
    'cdbd': CDBD(),
    'kdq': KdqTreeBatch(bootstrap_samples=2),
    'hdddm': HDDDM(),
}

# Column selection handed to each detector.
constraints = {
    'cdbd': ['confidence'],
    'kdq': list('abcdefghij'),  # single-letter column names 'a' through 'j'
    'hdddm': data.columns[1:-2],
}

In [84]:
# Build the ensemble from the detectors and constraints defined earlier,
# using the 'eval_scheme_1' evaluator.
ensemble = BatchEnsembler(
    detectors=detectors,
    evaluator='eval_scheme_1',
    constraints=constraints,
)
ensemble.set_reference(X=first_batch)

# Feed one batch per year and report the ensemble's state after each update.
for year, yearly_batch in test_batches.groupby('year'):
    ensemble.update(X=yearly_batch)
    print(f"For {year}: {ensemble.drift_state}")

dict_values([<menelaus.data_drift.cdbd.CDBD object at 0x000001DBA4E03400>, <menelaus.data_drift.kdq_tree.KdqTreeBatch object at 0x000001DBA4E03940>, <menelaus.data_drift.hdddm.HDDDM object at 0x000001DBA4E02E60>])
For 2008: None
dict_values([<menelaus.data_drift.cdbd.CDBD object at 0x000001DBA4E03400>, <menelaus.data_drift.kdq_tree.KdqTreeBatch object at 0x000001DBA4E03940>, <menelaus.data_drift.hdddm.HDDDM object at 0x000001DBA4E02E60>])
For 2009: None
