In [1]:
import opendp.prelude as dp
import polars as pl 
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np 

# Opt in to OpenDP's "contrib" feature set before any query is built.
dp.enable_features("contrib")

# Lazily scan the sample CSV; rows/values that fail to parse are ignored.
df = pl.scan_csv(
    "sample_FR_LFS.csv",
    ignore_errors=True,
    infer_schema_length=1000,
)

# Same compositor as the one defined earlier in part 1:
# a total epsilon of 1.0 shared evenly across 10 queries, with each
# privacy unit allowed up to 36 contributions.
context = dp.Context.compositor(
    data=df,
    privacy_unit=dp.unit_of(contributions=36),
    privacy_loss=dp.loss_of(epsilon=1.0),
    split_evenly_over=10,
    margins={
        # Single-column groupings with public keys.
        ("SEX",): dp.Margin(
            public_info="keys",
            max_partition_length=200_000,
        ),
        ("AGE",): dp.Margin(
            public_info="keys",
            max_partition_length=200_000,
        ),
        ("ILOSTAT",): dp.Margin(
            public_info="keys",
            max_partition_length=200_000,
        ),
        ("HWUSUAL",): dp.Margin(
            public_info="keys",
            max_partition_length=200_000,
        ),
        # Time-based groupings additionally bound contributions per partition.
        ("YEAR",): dp.Margin(
            public_info="keys",
            max_partition_length=200_000,
            max_partition_contributions=4,
        ),
        ("QUARTER",): dp.Margin(
            public_info="keys",
            max_partition_length=200_000,
            max_partition_contributions=13,
        ),
        ("YEAR", "QUARTER"): dp.Margin(
            public_info="keys",
            max_partition_length=200_000,
            max_partition_contributions=1,
        ),
        # Empty key: bound on the size of the whole (ungrouped) dataset.
        (): dp.Margin(max_partition_length=60_000_000),
    },
)

OpenDPException: 
  FFI("No match for concrete type i64. You've got a debug binary! Debug binaries support fewer types. Consult https://docs.opendp.org/en/stable/contributing/development-environment.html#build-opendp")

This is the same example we used in the section on grouping by multiple variables. When we started out, we compared the actual and DP values, and the numbers seemed fairly close, but how close are they really?

In [None]:
# Differentially private row count for every (YEAR, QUARTER) group.
q = (
    context.query()
    .group_by(["YEAR", "QUARTER"])
    .agg(pl.len().dp.noise().alias("DP"))
).sort(["YEAR", "QUARTER"])
mgb_dp_values = q.release().collect()

# Exact (non-private) row count for the same groups.
mgb_tr_values = (
    df.group_by(["YEAR", "QUARTER"])
    .agg(pl.len().alias("Actual"))
    .sort(["YEAR", "QUARTER"])
    .collect()
)

# Merge both results into one frame with columns YEAR, QUARTER, DP, Actual.
# DataFrame.sort returns a new frame rather than sorting in place, so the
# sorted result has to be assigned; otherwise the chart order depends on
# whatever row order the join happens to produce.
yq_df = mgb_dp_values.join(mgb_tr_values, on=['YEAR', 'QUARTER']).sort(by=['YEAR', 'QUARTER'])
print(yq_df)

# One tick per (year, quarter) pair, in chronological order.
labels = [f"{year} Q{quarter}" for year, quarter in zip(yq_df['YEAR'], yq_df['QUARTER'])]
x = np.arange(len(labels))
bar_width = 0.3  # width of each bar; the two series sit side by side around each tick

fig, ax = plt.subplots(figsize=(10, 6))
bars1 = ax.bar(x - bar_width / 2, yq_df['DP'], bar_width, label='DP')
bars2 = ax.bar(x + bar_width / 2, yq_df['Actual'], bar_width, label='Actual')

ax.set_xlabel('Year and Quarter')
ax.set_ylabel('Length')
ax.set_title('Comparison of Counts by Year and Quarter')
ax.set_xticks(x)
ax.set_xticklabels(labels, rotation=45, ha='right')
ax.legend()

plt.tight_layout()
plt.show()

For many of the year-quarter groups, the DP values are visibly higher or lower than the actual counts. One tool we can use to quantify this gap is the `accuracy` method of the differentially private Polars query.

In [None]:
# Call `accuracy` with no arguments on `q`, the grouped DP count query built
# from `context.query()` in the previous code cell; the result is displayed as
# the cell output.
q.accuracy()

In [None]:
# Accuracy of the grouped DP count query at significance level alpha = 0.05.
# `lfq` is only assigned in a later cell, so referencing it here raises a
# NameError on a fresh kernel; use `q`, which is already in scope.
q.accuracy(alpha=0.05)

In [None]:
# from opendp.accuracy import describe_polars_measurement_accuracy
from accuracy import describe_polars_measurement_accuracy

In [None]:
# NOTE(review): bare name with no call — this only displays the repr of the
# imported function. Looks like a debugging leftover; consider removing.
describe_polars_measurement_accuracy

In [None]:
from npolars import LazyFrameQuery
# from opendp.accuracy import describe_polars_measurement_accuracy

# lfq.accuracy(0.05)

# Toy dataset: one Int32 column "convicted" holding the pattern
# 0, 1, 1, 0, 1 repeated 50 times.
data = pl.LazyFrame(
    [pl.Series("convicted", [0, 1, 1, 0, 1] * 50, dtype=pl.Int32)]
)

# NOTE: this rebinds `context`, replacing the compositor built over the CSV
# data earlier in the notebook.
context = dp.Context.compositor(
    data=data,
    privacy_unit=dp.unit_of(contributions=1),
    privacy_loss=dp.loss_of(epsilon=1.0),
    split_evenly_over=1,
    margins={(): dp.Margin(max_partition_length=1000)},
)

# Two DP statistics in one query: a noisy row count and a noisy sum of
# "convicted" (nulls replaced by 0, bounds (0, 1) passed to dp.sum).
query = context.query().select(
    pl.len().dp.noise(),
    pl.col("convicted").fill_null(0).dp.sum((0, 1)),
)

# Wrap the query in the locally imported LazyFrameQuery and request its
# accuracy at alpha = 0.05.
lfq = LazyFrameQuery(query.__getattribute__, query)
lfq.accuracy(alpha=0.05)  # type: ignore[union-attr]

In [None]:
# NOTE(review): displays the bound `__getattribute__` method of `query` and
# does nothing else. Looks like a debugging leftover; consider removing.
query.__getattribute__

In [None]:
def interpret_accuracy(distribution, scale, accuracy, alpha):
    """Build a one-sentence, human-readable reading of an accuracy estimate.

    Args:
        distribution: Name of the noise distribution, interpolated as-is.
        scale: Noise scale, interpolated as-is.
        accuracy: Accuracy bound, interpolated as-is.
        alpha: Significance level; the confidence percentage is computed
            as ``(1 - alpha) * 100``.

    Returns:
        The formatted sentence as a single string.
    """
    confidence_pct = (1 - alpha) * 100
    sentence_parts = [
        f"When the {distribution} scale is {scale}, ",
        f"the DP estimate differs from the true value by no more than {accuracy} ",
        f"at a statistical significance level alpha of {alpha}, ",
        f"or with (1 - {alpha})100% = {confidence_pct}% confidence.",
    ]
    return "".join(sentence_parts)


interpret_accuracy("Integer Laplace", 2.0, 6.429605, alpha=.05) # doctest:+SKIP

In [None]:
# NOTE(review): displays the bound `__getattribute__` method of `q` and does
# nothing else. Looks like a debugging leftover; consider removing.
q.__getattribute__