In [4]:
from opendp.trans import *
from opendp.meas import *
from opendp.core import *

# Opt in to OpenDP's "floating-point" feature flag before building any chains.
enable_features("floating-point")

# Names handed to the dataframe splitter as column keys
# (presumably in the same order as the fields in the CSV -- verify against the file).
var_names = [
    "age",
    "sex",
    "educ",
    "race",
    "income",
    "married",
]

# Location of the PUMS sample, relative to the notebook's working directory.
data_path = os.path.join(
    '.',
    'data',
    'PUMS_california_demographics_1000',
    'data.csv',
)

# The measurement consumes the whole file as one raw string, so no parsing here.
with open(data_path) as input_data:
    data = input_data.read()

In [5]:
def make_dp_mean(col_names, index, size, bounds, dataset_distance, epsilon):
    """
    Draft of a function to be used on the backend for DPCreator.

    Builds a chain that splits raw CSV text into columns, selects one column,
    casts it to float, imputes/clamps/resizes it, takes the bounded mean and
    finally adds Laplace noise. The Laplace scale is chosen by
    ``binary_search_chain`` from ``dataset_distance`` and ``epsilon``.

    :param col_names: Column names assigned to the fields of the split dataframe
    :param index: Key of the column to select (one of ``col_names``)
    :param size: Estimated number of values in data
    :param bounds: ``(lower, upper)`` bounds for clamp
    :param dataset_distance: Max distance between neighboring datasets
    :param epsilon: Privacy budget
    :return: The object returned by ``binary_search_chain``; it is invoked
        directly on the raw CSV string to produce the noisy mean
    """
    lower, upper = bounds
    # The resize padding value has to lie inside `bounds`; a hard-coded 0.
    # breaks for bounds that exclude zero (e.g. (18., 90.)). Clamping 0 into
    # the bounds keeps the previous value whenever 0 is already in range, and
    # equals what an imputed 0. becomes after the clamp step below.
    pad_value = min(max(0., lower), upper)

    preprocessor = (
        # Convert data into Vec<Vec<String>>
        make_split_dataframe(separator=",", col_names=col_names) >>
        # Selects a column of df, Vec<str>
        make_select_column(key=index, TOA=str) >>
        # Cast the column as Vec<Optional<Float>>
        make_cast(TIA=str, TOA=float) >>
        # Impute missing values to 0 Vec<Float>
        make_impute_constant(0.) >>
        # Clamp the selected column's values to the supplied bounds
        make_clamp(bounds) >>
        # Pad/truncate to exactly `size` rows, padding with an in-bounds value
        make_bounded_resize(size, bounds, pad_value) >>
        # Mean of a dataset with known size and bounds
        make_sized_bounded_mean(size, bounds)
    )
    # Search over the Laplace scale `s` for the chain relating
    # `dataset_distance` to `epsilon`.
    return binary_search_chain(
        lambda s: preprocessor >> make_base_laplace(s),
        dataset_distance, epsilon)



In [6]:
# Privacy budget and the column whose mean is released.
epsilon = 1.
column = "age"

# Build the measurement: 1000 expected rows, values clamped to [0, 200],
# neighboring datasets at distance 1.
dp_mean_meas = make_dp_mean(
    col_names=var_names,
    index=column,
    size=1000,
    bounds=(0., 200.),
    dataset_distance=1,
    epsilon=epsilon,
)

# Invoke the measurement on the raw CSV text to get one noisy release.
res = dp_mean_meas(data)

print(f"Epsilon: {epsilon}\nColumn: {column}\nDP Mean: {res}")


Epsilon: 1.0
Column: age
DP Mean: 44.6411189291801
