# DP Aggregate Seeded Synthesizer

In [15]:
import pandas as pd

from sdssynth import init_logger, set_number_of_threads
from sdssynth import Dataset
from sdssynth import DpAggregateSeededParametersBuilder, AccuracyMode, FabricationMode
from sdssynth import DpAggregatedSeededSynthesizer, Dataset

from utils import gen_dataset

## Generating an example data frame with random data

In [16]:
# Number of example records requested from the generator helper.
number_of_records_to_generate = 6_000

# Hand the requested count to gen_dataset to obtain the example data frame.
sensitive_df = gen_dataset(number_of_records_to_generate)

## Creating the sensitive dataset

In [17]:
# Convert the example data frame into a Dataset object; this is the object
# the synthesizer is fitted on and that aggregates are computed from.
sensitive_dataset = Dataset.from_data_frame(sensitive_df)

## Generating the synthetic data

In [18]:
# Reporting length handed to the parameters builder; the same value is
# reused when aggregates are computed from the datasets.
reporting_length = 4

# Configure the synthesizer parameters in one parenthesized builder chain
# (no backslash line continuations needed).
builder = (
    DpAggregateSeededParametersBuilder()
    .reporting_length(reporting_length)
    .epsilon(0.9)
    .accuracy_mode(AccuracyMode.prioritize_large_counts())
    .fabrication_mode(FabricationMode.uncontrolled())
    .use_synthetic_counts(True)
)

# Build the parameters, create the synthesizer and fit it on the sensitive dataset.
synth = DpAggregatedSeededSynthesizer(builder.build())
synth.fit(sensitive_dataset)

# Draw as many synthetic records as the sensitive data frame has rows.
synthetic_raw_data = synth.sample(len(sensitive_df))

# Keep the sampled records both as a Dataset and as a data frame.
synthetic_dataset = Dataset(synthetic_raw_data)
synthetic_df = Dataset.raw_data_to_data_frame(synthetic_raw_data)

## Generating/exporting aggregate data

In [19]:
# Aggregates computed from the sensitive dataset at the configured reporting length.
# NOTE(review): the ";" argument is presumably the combination delimiter — confirm against the library docs.
sensitive_aggregates = sensitive_dataset.get_aggregates(reporting_length, ";")

# Aggregates obtained from the fitted synthesizer.
dp_aggregates = synth.get_dp_aggregates(";")

# Aggregates recomputed from the sampled synthetic dataset, for comparison.
synthetic_aggregates = synthetic_dataset.get_aggregates(reporting_length, ";")

## Evaluating

In [20]:
# Summary statistics for the sensitive data: empty strings are replaced
# with '0' so that every column can be cast to an integer type first.
(
    sensitive_df
    .replace("", "0")
    .astype("int")
    .describe()
)

Unnamed: 0,H1,H2,H3,H4,H5,H6,H7,H8,H9,H10
count,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0
mean,0.993167,2.616,4.578833,0.492833,0.504167,0.497333,0.499333,0.51,0.494667,0.494833
std,0.815821,2.111624,3.327656,0.49999,0.500024,0.500035,0.500041,0.499942,0.500013,0.500015
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,1.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,2.5,5.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
75%,2.0,4.0,7.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
max,2.0,6.0,10.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [21]:
# Summary statistics for the synthetic data: empty strings are replaced
# with '0' so that every column can be cast to an integer type first.
(
    synthetic_df
    .replace("", "0")
    .astype("int")
    .describe()
)

Unnamed: 0,H1,H2,H3,H4,H5,H6,H7,H8,H9,H10
count,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0
mean,0.911667,2.323667,3.896333,0.475833,0.494667,0.478333,0.464333,0.485833,0.481333,0.483333
std,0.832654,2.144296,3.487398,0.499457,0.500013,0.499572,0.498768,0.499841,0.499693,0.499764
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,2.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,2.0,4.0,7.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
max,2.0,6.0,10.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
