In [None]:
pip install opendp



## OpenDP Programming Framework Demo

Import the OpenDP library and enable the "honest-but-curious" and "contrib" flags:
- **Honest-but-Curious**: We will require a looser trust model, as we cannot verify any privacy or stability properties of user-defined functions (exercise 2).
- **Contrib**: Include mechanisms that have not yet been fully vetted.

In [None]:
import opendp.prelude as dp
import pandas as pd
import numpy as np
# Opt in to the two feature flags described above.
dp.enable_features("honest-but-curious", "contrib")

# --- Public information (known without looking at the data) ---

# column names of the dataset
col_names = ["age", "sex", "educ", "race", "income", "married"]

# age and income are numeric, and plausible bounds for both
# can be stated up front
age_bounds = (0, 100)
income_bounds = (0, 150_000)

# upper bound on the number of records a single individual can influence
max_influence = 1

# --- Data ---

# Income data from the California PUMS dataset, loaded as raw text
data = dp.examples.get_california_pums_path().read_text()


### Exercise 1: Computing a private variance

In this exercise, you will compute a DP variance over the age column. See [`then_variance()`](https://docs.opendp.org/en/stable/api/python/opendp.transformations.html#opendp.transformations.then_variance) and [`make_variance()`](https://docs.opendp.org/en/stable/api/python/opendp.transformations.html#opendp.transformations.make_variance) in the OpenDP documentation. We will give you the code for releasing a private count since the variance transformation requires an input domain with a known (bounded) dataset size.

In [None]:
# Shared preprocessing chain: raw CSV text -> clamped float vector of ages.
age_preprocessor = (
    # parse the text into a dataframe whose columns are of type Vec<str>
    dp.t.make_split_dataframe(separator=",", col_names=col_names)
    # pick out the "age" column (still Vec<str>)
    >> dp.t.make_select_column(key="age", TOA=str)
    # cast each entry to float
    >> dp.t.then_cast_default(TOA=float)
    # clamp the age values to the public age bounds
    >> dp.t.then_clamp(bounds=tuple(float(bound) for bound in age_bounds))
)

# Private count of the age column: count, then add Laplace noise with scale 1.
dp_count_measurement = (
    age_preprocessor
    >> dp.t.then_count()
    >> dp.m.then_laplace(scale=1.0)
)
count_release = dp_count_measurement(data)
print(count_release)

999


  dp.t.make_split_dataframe(separator=",", col_names=col_names) >>
  dp.t.make_select_column(key="age", TOA=str) >>


Use the DP count above as the dataset size for a DP variance measurement. You can use `dp.binary_search_chain` to find the noise scale for which the variance release satisfies $\varepsilon = 1$ differential privacy.

In [None]:
# Resize the clamped ages to the released DP count so that the variance
# transformation receives a dataset of known size.
variance_transformation = (
    age_preprocessor
    >> dp.t.then_resize(size=count_release, constant=0.0)
    >> dp.t.then_variance()
)

# Search for a Laplace scale such that an input distance of `max_influence`
# maps to an output distance (epsilon) of 1.
dp_variance = dp.binary_search_chain(
    lambda scale: variance_transformation >> dp.m.then_laplace(scale=scale),
    d_in=max_influence,
    d_out=1.0,
)

print(dp_variance(data))

287.5584437694537


### Exercise 2: Create a user-defined transformation

Create a user-defined transformation `make_trimmed` that removes the smallest $\alpha$ fraction of elements and the largest $\alpha$ fraction of elements from the dataset, based on their positions after sorting.

In [None]:
def make_trimmed(alpha, n):
    """Construct a Transformation that trims both tails of a dataset.

    The data is sorted, then the smallest `alpha` fraction and the largest
    `alpha` fraction of the records are dropped, based on their positions
    after sorting.

    :param alpha: fraction of records to drop from EACH end; must lie in [0, 0.5]
    :param n: number of records in the input dataset (size of the input domain)
    :return: a user-defined Transformation from a size-`n` float vector to the
        trimmed float vector, under the symmetric distance
    :raises ValueError: if `alpha` is outside [0, 0.5]
    """
    if not 0 <= alpha <= 0.5:
        raise ValueError(f"alpha must be in [0, 0.5], got {alpha}")

    def trim_indices(size: int) -> tuple[int, int]:
        """Half-open index range [lower, upper) kept out of `size` sorted records."""
        # np.round rounds exact halves to the nearest even integer
        return int(np.round(alpha * size)), int(np.round((1 - alpha) * size))

    def function(arg: list[float]) -> list[float]:
        ordered = np.sort(arg)
        l_idx, u_idx = trim_indices(len(ordered))
        # .tolist() returns plain Python scalars instead of numpy scalars
        return ordered[l_idx:u_idx].tolist()

    def stability_map(d_in: int) -> int:
        # We showed in section that this is a 1-stable transformation.
        # OpenDP cannot verify this claim for a user-defined function, which
        # is why the "honest-but-curious" flag must be enabled.
        return d_in

    # Declare the output size with the same rounding the function applies, so
    # the output domain matches the number of records kept for n inputs.
    l_n, u_n = trim_indices(n)

    return dp.t.make_user_transformation(
        input_domain=dp.vector_domain(dp.atom_domain(T=float), size=n),
        input_metric=dp.symmetric_distance(),
        output_domain=dp.vector_domain(dp.atom_domain(T=float), size=u_n - l_n),
        output_metric=dp.symmetric_distance(),
        function=function,
        stability_map=stability_map,
    )

# Preprocess first: `data` is the raw CSV text, so len(data) would count
# characters. The domain must be sized by the number of records instead.
age_data = age_preprocessor(data)

trim_transformation = (
    (dp.vector_domain(dp.atom_domain(T=float), size=len(age_data)), dp.symmetric_distance())
    >> dp.t.then_cast_default(TOA=float)
    >> make_trimmed(alpha=0.05, n=len(age_data))
)

# NOTE: this is a non-private demonstration -- no noise is added and the
# trimmed records themselves are printed.
trimmed_data = trim_transformation(age_data)
print(trimmed_data[:10])

[20.0, 20.0, 20.0, 20.0, 21.0, 21.0, 21.0, 21.0, 21.0, 21.0]


### Exercise 3: Create a DP Trimmed Mean Measurement

Using your `make_trimmed` transformation, compute a DP trimmed mean.

In [None]:
alpha = 0.05              # fraction trimmed from each end
epsilon = 1.0             # privacy-loss target used to set the Laplace scale
n = count_release         # DP estimate of the dataset size (from Exercise 1)
lower, upper = (0.0, 100.0)

# Number of records kept after trimming n records, using the same rounding
# that make_trimmed applies to the sorted data.
trimmed_size = int(np.round((1 - alpha) * n)) - int(np.round(alpha * n))

# Changing one record moves a mean over `trimmed_size` values clamped to
# [lower, upper] by at most (upper - lower) / trimmed_size.
scale = (upper - lower) / (trimmed_size * epsilon)
dp_trimmed_mean = (
    (dp.vector_domain(dp.atom_domain(T=float), size=n), dp.symmetric_distance())
    >> dp.t.then_cast_default(TOA=float)
    >> make_trimmed(alpha=alpha, n=n)
    >> dp.t.then_clamp((lower, upper))
    >> dp.t.then_mean()
    >> dp.m.then_laplace(scale=scale)
)

# Resize the preprocessed ages to exactly n records so the input matches the
# sized domain declared above (same resize as in the variance exercise).
age_data = (age_preprocessor >> dp.t.then_resize(size=n, constant=0.0))(data)
print(dp_trimmed_mean(age_data))

44.2446493266399
