In [28]:
from opendp.trans import *
from opendp.meas import *
from opendp.core import *

# No explicit `import os` is visible in this cell, so import it here instead of
# depending on the name arriving through the star imports. Importing after them
# also guarantees `os` refers to the standard-library module.
import os

# Establish data information: location of the CSV file and its column names.
data_path = os.path.join('.', 'data', 'PUMS_california_demographics_1000', 'data.csv')
var_names = ["age", "sex", "educ", "race", "income", "married", "pid"]

# TODO: Remove column headers
# Read the whole file into a single string; `make_dp_mean` consumes this
# module-level `data` value directly.
with open(data_path) as input_data:
    data = input_data.read()

In [29]:
def make_dp_mean(index, lower, upper, n, epsilon):
    """
    Draft of a function to be used on the backend for DPCreator.

    Releases a noisy mean of one column of the module-level ``data`` string
    (the raw CSV text): the column is selected, cast to float, imputed,
    clamped to ``[lower, upper]``, averaged, and perturbed with Laplace noise.

    :param index: Column index to select data from
    :param lower: Lower bound for clamp
    :param upper: Upper bound for clamp; must not be less than ``lower``
    :param n: Estimated number of values in data; must be positive
    :param epsilon: Privacy budget; must be positive
    :return: The value produced by the bounded-mean >> Laplace measurement
    :raises ValueError: If ``upper < lower``, or ``n`` or ``epsilon`` is not positive
    """
    # Validate up front so bad arguments fail with a clear message instead of a
    # ZeroDivisionError (or a library error) after the data has been processed.
    # The `not (...)` form also rejects NaN.
    if not (lower <= upper):
        raise ValueError(f"upper ({upper}) must not be less than lower ({lower})")
    if not (n > 0):
        raise ValueError(f"n must be positive, got {n}")
    if not (epsilon > 0):
        raise ValueError(f"epsilon must be positive, got {epsilon}")

    # NOTE(review): col_names lists only the keys 0-4, so presumably an `index`
    # outside that range cannot be selected -- confirm against the data file.
    preprocessor = (
        # Convert data into Vec<Vec<String>>
        make_split_dataframe(separator=",", col_names=[0, 1, 2, 3, 4]) >>
        # Select one column of the dataframe, Vec<str>
        make_select_column(key=index, T=str) >>
        # Cast the column to Vec<float>
        make_cast(TI=str, TO=float) >>
        # Impute missing values to 0
        make_impute_constant(0.) >>
        # Clamp values to [lower, upper]
        make_clamp(lower, upper)
    )

    # TODO: chain these into one process. Currently getting domain mismatch error
    clamped = preprocessor(data)

    # Laplace scale = sensitivity / epsilon, taking the sensitivity of a mean
    # over n values bounded in [lower, upper] to be (upper - lower) / n.
    scale = (upper - lower) / (n * epsilon)
    dp_mean = make_bounded_mean(lower, upper, n=n, T=float) >> make_base_laplace(scale)
    return dp_mean(clamped)

In [35]:
# Privacy budget for this release.
epsilon = 1.0

# DP mean of column 0, clamped to [20, 50], with an estimated 1000 records.
res = make_dp_mean(index=0, lower=20.0, upper=50.0, n=1000, epsilon=epsilon)

print(f"Epsilon: {epsilon}\nDP Mean: {res}")

Epsilon: 1.0
DP Mean: 39.71689548355615
