In [1]:
import burdock
# Placeholder name for our proposed implementation 
# of OpenDP(in design)

# Executing burdock modules

In [2]:
import json

from burdock.client import get_execution_client
execution_client = get_execution_client()

In [4]:
params = {"dataset_name": "example",
          "column_name": "a",
          "budget": .2}
uri = "modules/psi-count-module"

burdock.validate(params=params, uri=uri)  # ADDED analyst machine
response = execution_client.submit(params=params, uri=uri)
report = json.loads(response.result)
# answer is 1, to avoid privacy loss the interval 
# and noise applied is high

{'release': 0.1329770995690811,
 'variable': 'a',
 'accuracy': 29.957322735539908,
 'epsilon': 0.2,
 'interval': [-28.957322735539908, 30.957322735539908]}

In [None]:
# Internally validates
report = burdock.count(dataset_name="example",
                       column_name="a",
                       budget=.2)

# Example top level module code
## The first cell is custom logic for the module

In [18]:
from abc import ABCMeta, abstractmethod
import math
import logging

from burdock.mechanisms.laplace import evaluate

logger = logging.getLogger(__name__)


class Statistic(object):

    __metaclass__ = ABCMeta

    def __init__(self, release, variable, accuracy, epsilon, interval):
        self._release = release
        self._variable = variable
        self._accuracy = accuracy
        self._epsilon = epsilon
        self._interval = interval

    @property
    def release(self):
        return self._release

    @property
    def variable(self):
        return self._variable

    @property
    def accuracy(self):
        return self._accuracy

    @property
    def epsilon(self):
        return self._epsilon

    @property
    def interval(self):
        return self._interval

    def as_dict(self):
        return {"release": self._release,
                "variable": self._variable,
                "accuracy": self._accuracy,
                "epsilon": self._epsilon,
                "interval": self._interval}


class Computer(object):

    __metaclass__ = ABCMeta

    @abstractmethod
    def release(self, dataset):
        pass

    @staticmethod
    def get_subclasses():
        subclasses = Computer.__subclasses__()
        # Return dictionary with key value pairs from subclass name to subclass
        return {subclass.__name__.lower(): subclass for subclass in subclasses}


class CountResult(Statistic):
    def __init__(self, count_release, variable, accuracy, epsilon, interval):
        super(CountResult, self).__init__(count_release, variable, accuracy, epsilon, interval)
        self._count_release = count_release


class Count(Computer):
    def __init__(self, column, epsilon):
        self._column = column
        self._epsilon = epsilon

    def _compute_accuracy(self, epsilon, stability=None, delta=10^-6, alpha=0.05):
        if stability:
            return 2 * math.log(2 / (alpha * delta)) / epsilon
        else:
            return 2 * math.log(1 / alpha) / epsilon

    def release(self, dataset):
        # get the column count
        num_obs = dataset.shape[0]
        # obfuscate the count
        sens = 2
        noise = evaluate(sens, self._epsilon)
        count_release = num_obs + noise
        # calculate accuracy from epsilon
        accuracy = self._compute_accuracy(self._epsilon)
        accuracy_bound = accuracy * num_obs
        mci = [num_obs - accuracy_bound, num_obs + accuracy_bound]
        return CountResult(count_release, self._column, accuracy, self._epsilon, mci)

## The second cell is the entrypoint script

In [19]:
# Module code ran through ES 

import mlflow
import json
import sys

#  from statistic import Count # static was copied above to run in the notebook
from burdock.client import get_dataset_client
from burdock.data.adapters import load_dataset


dataset_name = "example"
column_name = "a"
budget = 1


with mlflow.start_run():
    df = load_dataset(get_dataset_client().read(dataset_name, budget))
    statistic = Count(column_name, budget).release(df)

    with open("result.json", "w") as stream:
        json.dump(statistic.as_dict(), stream)
    mlflow.log_artifact("result.json")
    
# Added to showcase what is stored in result.json
statistic.as_dict()

{'release': 1.432349772599651,
 'variable': 'a',
 'accuracy': 5.991464547107982,
 'epsilon': 1,
 'interval': [-4.991464547107982, 6.991464547107982]}