# Differentially Private (DP) Aggregate Seeded Synthesizer


> Example based on [Synthetic Data Showcase - _pac-synth_](https://github.com/microsoft/synthetic-data-showcase/blob/main/packages/lib-pacsynth/samples/dp_aggregate_seeded_short_example.ipynb).


In [1]:
from snsynth.aggregate_seeded import (
    AggregateSeededSynthesizer,
    AccuracyMode,
    FabricationMode,
    AggregateSeededDataset,
)
from snsynth.transform.table import NoTransformer

from utils import gen_data_frame


## Generating an example data frame with random data


In [2]:
# Number of records requested from the example-data generator.
number_of_records_to_generate = 6000

# gen_data_frame is imported from the local `utils` module (not shown in this
# notebook). Later cells treat its result as the "sensitive" input data frame.
# NOTE(review): no random seed is set in this notebook — confirm whether
# gen_data_frame seeds itself if reproducible output is required.
sensitive_df = gen_data_frame(number_of_records_to_generate)


## Generating the synthetic data


In [3]:
# Also reused by the aggregate-generation cell further down, so the synthesizer
# and the synthetic-data aggregates are computed with the same reporting length.
reporting_length = 4

# Build the mode objects up front so the constructor call stays compact.
# NOTE(review): the precise effect of each mode is defined by
# snsynth.aggregate_seeded — consult its documentation before changing them.
accuracy_mode = AccuracyMode.prioritize_long_combinations()
fabrication_mode = FabricationMode.uncontrolled()

# NOTE(review): epsilon is presumably the differential-privacy budget — confirm
# against the snsynth documentation before tuning it.
synth = AggregateSeededSynthesizer(
    reporting_length=reporting_length,
    epsilon=4.0,
    accuracy_mode=accuracy_mode,
    fabrication_mode=fabrication_mode,
    use_synthetic_counts=True,
)

# NOTE(review): NoTransformer presumably means the frame is handed to the
# synthesizer without snsynth's usual preprocessing — confirm.
synth.fit(sensitive_df, transformer=NoTransformer())

# Sample exactly as many rows as the synthesizer reports via
# get_dp_number_of_records(). In the recorded outputs below this was 6099 rows
# versus 6000 sensitive rows, so it need not equal the input size.
dp_record_count = synth.get_dp_number_of_records()
synthetic_df = synth.sample(dp_record_count)


## Generating/exporting aggregate data

This illustrates how to generate aggregates directly from the sensitive and synthetic data, as well as how to access the DP aggregates.


In [4]:
# Every aggregate call in this cell receives ";" as its string argument.
# NOTE(review): presumably the delimiter used when combining attributes into an
# aggregate key — confirm against the snsynth documentation.

# Aggregates held by the fitted synthesizer: computed from the sensitive data,
# and their DP counterparts.
sensitive_aggregates = synth.get_sensitive_aggregates(";")
dp_aggregates = synth.get_dp_aggregates(";")

# Aggregates recomputed directly from the sampled synthetic frame, using the
# same reporting length that was given to the synthesizer.
synthetic_dataset = AggregateSeededDataset.from_data_frame(synthetic_df)
synthetic_aggregates = synthetic_dataset.get_aggregates(reporting_length, ";")


## Evaluating


In [5]:
# Summary statistics of the sensitive data. Empty-string cells are replaced
# with "0" so that every column can be cast to int before describing.
# NOTE(review): presumably "" encodes a zero/absent value in the generated
# data — confirm against gen_data_frame.
(
    sensitive_df
    .replace("", "0")
    .astype("int")
    .describe()
)


Unnamed: 0,H1,H2,H3,H4,H5,H6,H7,H8,H9,H10
count,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0
mean,1.001333,2.645167,4.596,0.5005,0.493833,0.503333,0.505,0.506167,0.506667,0.4925
std,0.819417,2.103568,3.322994,0.500041,0.500004,0.500031,0.500017,0.500004,0.499997,0.499985
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,3.0,5.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0
75%,2.0,4.0,8.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
max,2.0,6.0,10.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [6]:
# Summary statistics of the synthetic data, prepared the same way as the
# sensitive-data summary (empty strings -> "0", then cast to int) so the two
# tables can be compared column by column.
(
    synthetic_df
    .replace("", "0")
    .astype("int")
    .describe()
)


Unnamed: 0,H1,H2,H3,H4,H5,H6,H7,H8,H9,H10
count,6099.0,6099.0,6099.0,6099.0,6099.0,6099.0,6099.0,6099.0,6099.0,6099.0
mean,0.934415,2.418593,4.207903,0.475652,0.464502,0.48467,0.481226,0.480571,0.482866,0.461223
std,0.823738,2.152071,3.444694,0.499448,0.498779,0.499806,0.499688,0.499663,0.499747,0.498535
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,2.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,2.0,4.0,7.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
max,2.0,6.0,10.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
