# Differentially Private (DP) Aggregate Seeded Synthesizer

In [1]:
import pandas as pd

from pacsynth import init_logger, set_number_of_threads
from pacsynth import Dataset
from pacsynth import DpAggregateSeededParametersBuilder, AccuracyMode, FabricationMode
from pacsynth import DpAggregateSeededSynthesizer, Dataset

from utils import gen_data_frame

## Generating an example data frame with random data

In [2]:
# Number of rows requested from the example-data generator.
number_of_records_to_generate = 6_000

# Random example frame; it plays the role of the "sensitive" input
# for the rest of the notebook.
sensitive_df = gen_data_frame(number_of_records_to_generate)

## Creating the sensitive dataset

In [3]:
# Wrap the sensitive data frame in a pacsynth Dataset; this object is what the
# synthesizer is fitted on and what the sensitive aggregates are computed from.
sensitive_dataset = Dataset.from_data_frame(sensitive_df)

## Generating the synthetic data

In [4]:
# Shared setting: used for the synthesizer parameters below and again when
# aggregates are computed later in the notebook.
reporting_length = 4

# Assemble the synthesizer parameters with a fluent builder chain.
builder = (
    DpAggregateSeededParametersBuilder()
    .reporting_length(reporting_length)
    .epsilon(4.0)
    .accuracy_mode(AccuracyMode.prioritize_long_combinations())
    .fabrication_mode(FabricationMode.uncontrolled())
    .use_synthetic_counts(True)
)

# Create the synthesizer from the built parameters and fit it on the
# sensitive dataset.
synth = DpAggregateSeededSynthesizer(builder.build())
synth.fit(sensitive_dataset)

# Sample as many records as the fitted synthesizer reports through
# get_dp_number_of_records().
synthetic_raw_data = synth.sample(synth.get_dp_number_of_records())

# Keep the sampled records in two forms: a Dataset (used for aggregates) and
# a pandas data frame (used for evaluation).
synthetic_dataset = Dataset(synthetic_raw_data)
synthetic_df = Dataset.raw_data_to_data_frame(synthetic_raw_data)

## Generating/exporting aggregate data

This illustrates how to generate aggregates directly from the sensitive and synthetic data, as well as how to access the DP aggregates.

In [5]:
# The same delimiter string is passed to every aggregate call below.
aggregates_delimiter = ';'

# Aggregates computed directly from the sensitive dataset.
sensitive_aggregates = sensitive_dataset.get_aggregates(
    reporting_length, aggregates_delimiter
)

# DP aggregates exposed by the fitted synthesizer.
dp_aggregates = synth.get_dp_aggregates(aggregates_delimiter)

# Aggregates computed directly from the synthetic dataset.
synthetic_aggregates = synthetic_dataset.get_aggregates(
    reporting_length, aggregates_delimiter
)

## Evaluating

In [6]:
# Summary statistics of the sensitive data: empty-string cells are replaced
# with '0' so that every column can be cast to integers before describing.
(
    sensitive_df
    .replace('', '0')
    .astype('int')
    .describe()
)

Unnamed: 0,H1,H2,H3,H4,H5,H6,H7,H8,H9,H10
count,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0
mean,0.9795,2.635167,4.551,0.49,0.497167,0.480333,0.495833,0.4915,0.509,0.492667
std,0.809643,2.132249,3.306088,0.499942,0.500034,0.499655,0.500024,0.499969,0.499961,0.499988
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,3.0,4.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,2.0,5.0,7.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
max,2.0,6.0,10.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [7]:
# Summary statistics of the synthetic data: empty-string cells are replaced
# with '0' so that every column can be cast to integers before describing.
(
    synthetic_df
    .replace('', '0')
    .astype('int')
    .describe()
)

Unnamed: 0,H1,H2,H3,H4,H5,H6,H7,H8,H9,H10
count,6007.0,6007.0,6007.0,6007.0,6007.0,6007.0,6007.0,6007.0,6007.0,6007.0
mean,0.934576,2.435159,4.19444,0.468121,0.481938,0.465956,0.480439,0.464125,0.480772,0.470285
std,0.814382,2.174861,3.411331,0.499024,0.499715,0.498881,0.499659,0.498753,0.499672,0.499158
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,2.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,2.0,4.0,7.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
max,2.0,6.0,10.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
