In [34]:
from menelaus.drift_detector import BatchDetector
from menelaus.data_drift import CDBD, HDDDM, KdqTreeBatch
from menelaus.datasets import make_example_batch_data
import pandas as pd
import numpy as np

In [2]:
# Registry of evaluation schemes, looked up by name when an ensembler is built.
# The only scheme so far is a placeholder that prints whatever it is handed.
evaluators = dict(
    eval_scheme_1=lambda x: print(x),
)

class Ensembler():
    """Fan data out to several named detectors and combine their results.

    Each detector is addressed by its key in ``detectors``. An optional
    ``constraints`` mapping (same keys) gives the selector used to subset the
    input for that detector before it is passed on.
    """

    def __init__(self, detectors: dict, evaluator: str, constraints: dict = None):
        """Store the detectors, resolve the evaluator, and keep the constraints.

        Args:
            detectors: mapping of detector key to detector object.
            evaluator: name of an entry in the module-level ``evaluators``
                registry; a ``KeyError`` is raised for an unknown name.
            constraints: optional mapping of detector key to the selector
                applied to the data for that detector (see ``constrain``).
        """
        self.detectors = detectors
        self.evaluator = evaluators[evaluator]
        self.constraints = constraints

    def set_reference(self, X, y_true=None, y_pred=None):
        """Pass the reference batch to every detector.

        Args:
            X: reference data; subset per detector via ``constrain``.
            y_true: forwarded unchanged to each detector.
            y_pred: forwarded unchanged to each detector.
        """
        for det_key, detector in self.detectors.items():
            # Use a per-detector local: rebinding X here would hand the next
            # detector data already narrowed by this detector's constraint.
            X_constrained = self.constrain(X, det_key)
            detector.set_reference(X=X_constrained, y_true=y_true, y_pred=y_pred)

    def update(self, X, y_true=None, y_pred=None):
        """Pass a new batch to every detector, then re-evaluate the ensemble.

        Args:
            X: new data; subset per detector via ``constrain``.
            y_true: forwarded unchanged to each detector.
            y_pred: forwarded unchanged to each detector.
        """
        for det_key, detector in self.detectors.items():
            # Same reasoning as in set_reference: never overwrite X itself.
            X_constrained = self.constrain(X, det_key)
            detector.update(X=X_constrained, y_true=y_true, y_pred=y_pred)
        self.evaluate()

    def constrain(self, data, det_key: str):
        """Return the part of ``data`` selected for the detector ``det_key``.

        The data is returned untouched when no constraints were configured or
        when there is no constraint entry for this detector.

        Args:
            data: object supporting ``data[selector]`` indexing.
            det_key: key of the detector the data is being prepared for.

        Returns:
            ``data[constraint]`` if a constraint exists for ``det_key``,
            otherwise ``data`` itself.
        """
        # TODO - can y_true, y_pred be supported in this pattern?
        # TODO - this allows for list manipulation of PD columns
        #           will need to think about cases where numpy arrays
        #           are mixed in
        if not self.constraints:
            return data
        constraint = self.constraints.get(det_key)
        if constraint is None:
            return data
        return data[constraint]

    def evaluate(self):
        """Run the evaluator over the detectors and store it as ``drift_state``."""
        self.drift_state = self.evaluator(self.detectors.values())

    def reset(self):
        """Reset every detector in the ensemble."""
        for detector in self.detectors.values():
            detector.reset()


class BatchEnsembler(BatchDetector, Ensembler):
    """Batch-detector flavoured ensemble.

    Combines ``Ensembler`` (which drives the member detectors) with
    ``BatchDetector``. Each operation is delegated explicitly to both bases,
    ``Ensembler`` first and ``BatchDetector`` second.
    """

    def __init__(self, detectors: dict, evaluator: str, constraints: dict = None):
        """Initialise both bases.

        Args:
            detectors: mapping of detector key to detector object.
            evaluator: name of the evaluation scheme to use.
            constraints: optional mapping of detector key to the selector
                applied to the data for that detector.
        """
        BatchDetector.__init__(self)
        Ensembler.__init__(self, detectors, evaluator, constraints)

    def update(self, X, y_true=None, y_pred=None):
        """Update every member detector, then run ``BatchDetector.update``.

        ``y_true`` and ``y_pred`` default to ``None`` so the signature matches
        ``set_reference`` and the method can be called with ``X`` alone.
        """
        Ensembler.update(self, X=X, y_true=y_true, y_pred=y_pred)
        BatchDetector.update(self, X=X, y_true=y_true, y_pred=y_pred)

    def reset(self):
        """Reset every member detector, then run ``BatchDetector.reset``."""
        Ensembler.reset(self)
        BatchDetector.reset(self)

    def set_reference(self, X, y_true=None, y_pred=None):
        """Set the reference batch on every member detector, then on the base."""
        Ensembler.set_reference(self, X=X, y_true=y_true, y_pred=y_pred)
        # NOTE(review): confirm BatchDetector.set_reference is callable on the
        # base class (i.e. it is not an abstract stub that raises).
        BatchDetector.set_reference(self, X=X, y_true=y_true, y_pred=y_pred)

In [10]:
# Load the example batch dataset.
data = make_example_batch_data()
# Reference batch: the 2007 rows, keeping only `confidence` as a one-column
# DataFrame (the list selector keeps the result two-dimensional).
first_batch = data.loc[data.year == 2007, ['confidence']]
# Test batches: every row from 2008 or 2009, using a vectorized membership
# test instead of a per-row Python lambda.
test_batches = data[data.year.isin([2008, 2009])]

Unnamed: 0,confidence
0,0.591243
1,0.19609
2,0.504019
3,0.487307
4,0.211284


In [8]:
# Member detectors for the ensemble, keyed by a short name. The same keys are
# used to look up each detector's column constraint.
detectors = dict(
    cdbd=CDBD(),
    kdq=KdqTreeBatch(bootstrap_samples=2),
    hdddm=HDDDM(),
)

In [None]:
# Column selectors per detector key. Open question from experimentation:
# pandas accepts both plain label lists and Index slices as selectors, but it
# is unclear whether numpy inputs can be mixed in.
constraints = dict(
    cdbd=['confidence'],
    kdq=list('abcdefghij'),
    hdddm=data.columns[1:-2],
)

In [9]:
ensemble = BatchEnsembler(detectors, 'eval_scheme_1', constraints)
# The reference batch must contain every column named in `constraints`.
# A frame holding only `confidence` cannot satisfy the `kdq` / `hdddm`
# selectors and would raise KeyError during per-detector column selection.
reference_batch = data[data.year == 2007]
ensemble.set_reference(X=reference_batch)
# Disabled for now: feed each later year to the ensemble as one batch.
# for year, subset in test_batches.groupby('year'):
#     ensemble.update(X=subset)


In [25]:
# Scratch cell: look at the dataset as a raw numpy array.
data  # bare expression that is not last in the cell; presumably not displayed
datanp = data.to_numpy()
# Last expression of the cell, so this is what the notebook renders.
datanp

array([[2007, 5036.441154105885, 198.93668499437987, ..., 2,
        0.5912428959771991, False],
       [2007, 10773.027106856454, 179.26065206202287, ..., 2,
        0.19608980559200695, False],
       [2007, 6173.596855550637, 199.77582944399998, ..., 1,
        0.5040190812111394, False],
       ...,
       [2021, 6928.908423747664, 210.00794306993708, ..., 2,
        0.7877263587429961, True],
       [2021, 6410.2554195579005, 205.09216738311736, ..., 0,
        0.6895377228784094, True],
       [2021, 10751.384620144796, 207.42807587969665, ..., 0,
        0.7453958978487201, True]], dtype=object)

In [31]:
# Scratch cell: compare positional slicing with the column labels.
data.iloc[:, 1:-2]  # positional slice; result is discarded (not the last expression)
# Last expression of the cell: the column labels of the dataset.
data.columns

Index(['year', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'cat',
       'confidence', 'drift'],
      dtype='object')

In [33]:
# Scratch cell: select columns by passing a slice of the column Index as the
# selector (the same form used for the `hdddm` constraint).
data[data.columns[1:-2]]

Unnamed: 0,a,b,c,d,e,f,g,h,i,j,cat
0,5036.441154,198.936685,6995.029198,133187.697950,0.402750,-1.339692,90453.457231,8819.761759,5398.861764,1059.835492,2
1,10773.027107,179.260652,3467.970499,52174.100900,1.019491,-1.145258,72629.507052,10137.233887,7009.344647,1552.888540,2
2,6173.596856,199.775829,4921.472906,101848.950792,-2.248839,1.903668,54839.516587,9603.709193,10579.518133,856.266082,1
3,13208.612929,182.222072,9638.632797,92442.165726,-0.045599,1.106212,84447.758390,12511.305427,5618.433039,451.926312,2
4,2719.272009,213.028454,5501.439179,79155.846985,-1.602719,-1.368130,168769.099013,16034.259622,6214.909756,853.582074,1
...,...,...,...,...,...,...,...,...,...,...,...
299995,8434.682418,203.931217,8674.083920,104317.953759,-1.588765,-1.313684,165132.657078,284.979440,7065.994173,83.803525,2
299996,9701.660570,179.925659,8021.128673,63749.091322,0.412063,0.408188,124630.300901,323.602163,10960.392248,96.796410,1
299997,6928.908424,210.007943,3312.751778,126116.364855,0.113971,2.432577,107318.127090,214.452933,9491.057615,169.644626,2
299998,6410.255420,205.092167,6914.861179,158221.742954,0.060710,0.928108,71610.603448,570.911317,8577.498702,61.404365,0
