In [2]:
pip install "opendp[polars]"


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [30]:
# Step 1 (unchanged from the original walkthrough): opt in to the "contrib"
# feature flag and define the privacy unit with contributions=1.
import opendp.prelude as dp

dp.enable_features("contrib")

privacy_unit = dp.unit_of(contributions=1)
# The privacy unit unpacks into two parts, named here as the input metric
# and the input distance bound.
input_metric, d_in = privacy_unit

# Step 2 (unchanged): define the privacy loss with epsilon = 1.
privacy_loss = dp.loss_of(epsilon=1.0)
# The privacy loss unpacks the same way: a measure and an output bound.
privacy_measure, d_out = privacy_loss

# Step 3 (unchanged): column names that are assigned to the survey data
# when it is loaded.
col_names = [
    "name",
    "sex",
    "age",
    "maritalStatus",
    "hasChildren",
    "highestEducationLevel",
    "sourceOfStress",
    "smoker",
    "optimism",
    "lifeSatisfaction",
    "selfEsteem",
]

## Step 4

In [32]:
# Step 4: the way the data is read in has changed.
# Scanning the CSV straight from the URL was not working, so the file is read
# eagerly and converted to a LazyFrame afterwards.
# TODO: debug scanning from the URL so the data can be loaded lazily from the start.
# This import could move up to the first code cell; it is kept here to mirror
# the layout of the original page.
import polars as pl

data_url = "https://raw.githubusercontent.com/opendp/opendp/sydney/teacher_survey.csv"

# NOTE(review): pl.read_csv treats the first row as a header by default, and
# the column names are overwritten right after. Confirm the file really has a
# header row; if it does not, the first record is silently dropped and
# has_header=False should be passed.
survey_df = pl.read_csv(data_url)
survey_df.columns = col_names

# fill_nan only works on floats, so every Int64 column is cast to Float64
# before the frame is handed to OpenDP.
data = survey_df.lazy().cast({pl.Int64: pl.Float64})

# Build the analysis context. The privacy loss is split evenly over three
# queries, and margins are declared for the "age" grouping and for the
# ungrouped (whole-frame) case.
context = dp.Context.compositor(
    data=data,
    privacy_unit=privacy_unit,
    privacy_loss=privacy_loss,
    split_evenly_over=3,
    margins={
        ("age",): dp.Margin(
            public_info="lengths",
            max_partition_length=8000,
            max_num_partitions=1,
        ),
        (): dp.Margin(
            public_info="lengths",
            max_partition_length=8000,
            max_num_partitions=1,
        ),
    },
)

We set `max_partition_length` to 8000 because we know our data has fewer than 8000 entries. We also specify `max_num_partitions` because the methods OpenDP uses behind the scenes require it to be set; it is 1 because one of our queries requires only a single partition.

## Step 5

Based on previous examples, a natural first attempt at finding the count of `age` directly might be:

```
context.query().select(pl.col("age").len().dp.noise()
).release().collect() 
```

However, this approach does not work yet: it uses `expr.len` (as in `pl.col("age").len()`), which OpenDP does not support yet and which is different from `pl.len` in Polars. Instead, we get the length of the dataframe directly with `pl.len().dp.noise()`. We also need to impose some type of bound, which we can do by filtering on the variable of interest.

In [31]:
# DP count: keep the rows where age > 0, then release a noisy row count
# of what remains.
count_query = (
    context.query()
    .filter(pl.col("age") > 0)
    .select(pl.len().dp.noise())
)

# A LazyFrame query has no .param() attribute, so the noise scale cannot be
# read back from the query and a placeholder value is used instead.
# NOTE(review): presumably 3 corresponds to epsilon = 1 split evenly over three
# queries with sensitivity 1. Confirm, or replace with the newer accuracy
# method once it is available.
fake_scale = 3.0
accuracy = dp.discrete_laplacian_scale_to_accuracy(scale=fake_scale, alpha=0.05)
print("Accuracy: ", accuracy)

# Pull the single value out of the "len" column of the released frame so a
# plain scalar is returned rather than a DataFrame.
dp_count = count_query.release().collect().get_column("len").item()
print("DP Count: ", dp_count)
interval = (dp_count - accuracy, dp_count + accuracy)
print("Interval: ", interval)

# DP mean of age: null and NaN values are replaced with 40.0 first, and the
# mean is computed with bounds (0.0, 100.0).
mean_query = context.query().select(
    pl.col("age")
    .fill_null(40.0)
    .fill_nan(40.0)
    .dp.mean((0.0, 100.0))
)
dp_mean = mean_query.release().collect().get_column("age").item()
print(dp_mean)


Accuracy:  9.445721638273582
DP Count:  6999
Interval:  (6989.554278361727, 7008.445721638273)
37.43891984554948
