In [1]:
from itertools import product

import numpy as np
import opendp.prelude as dp
import pandas as pd
from mbi import LinearMeasurement, Domain as MBIDomain, estimation

dp.enable_features("contrib", "rust-stack-trace")

In [13]:
# --- Ground-truth data ------------------------------------------------------
np.random.seed(42)
n_rows = 10000
cardinalities = [3, 4, 5, 7, 11]

# Column j holds uniform integers in [0, cardinalities[j]).
# Columns are filled one at a time, in order, so the sequence of draws from the
# seeded global NumPy state is the same as filling each column by hand.
data = np.zeros((n_rows, len(cardinalities)), dtype=int)
for col, cardinality in enumerate(cardinalities):
    data[:, col] = np.random.randint(0, cardinality, size=n_rows)

# --- Workload ---------------------------------------------------------------
# Each query is a list of column indices (a marginal); weights[k] belongs to queries[k].
queries = [
        [0, 1, 4],
        [2, 3, 4],
        [0, 1, 2],
        [1, 2],
        [1, 3]
    ]
weights = [0.7, 0.7, 0.3, 0.2, 0.1]

# One-way histograms handed to AIM as prior releases.
# NOTE(review): these are exact counts (no noise is added here) — confirm that
# passing them as `releases` is intended.
releases = [
    LinearMeasurement(
        noisy_measurement=np.bincount(data[:, i], minlength=cardinalities[i]),
        clique=(i,),
    )
    for i in range(len(cardinalities))
]

# --- AIM measurement --------------------------------------------------------
m_aim = dp.synthetic.make_ordinal_aim(
    dp.numpy.array2_domain(cardinalities=cardinalities, T=int),
    dp.symmetric_distance(),
    dp.zero_concentrated_divergence(),
    d_in=1,
    d_out=0.5,
    releases=releases,
    queries=queries,
    weights=weights
)

noisy_releases = m_aim(data)

# --- Model fit and sampling -------------------------------------------------
mbi_domain = MBIDomain(
    attributes=tuple(range(len(cardinalities))),
    shape=cardinalities
)

model = estimation.mirror_descent(
    mbi_domain,
    noisy_releases,
    iters=100,
    callback_fn=lambda *_: None  # no-op callback (presumably to silence per-iteration reporting)
)

# Sample as many synthetic rows as there are real rows.
synthetic_data = model.synthetic_data(n_rows)

# --- Per-query error --------------------------------------------------------
synthetic_df = synthetic_data.df
original_df = pd.DataFrame(data, columns=range(data.shape[1]))

for query, weight in zip(queries, weights):
    # Empirical probability of every observed value combination over the query columns.
    true_counts = original_df.groupby(query).size().div(len(original_df))
    aim_counts = synthetic_df.groupby(query).size().div(len(synthetic_df))

    # Reindex onto the full cross product of category values so combinations
    # that never occur are counted as probability 0 instead of being dropped.
    all_keys = list(product(*[range(cardinalities[i]) for i in query]))
    orig = true_counts.reindex(all_keys, fill_value=0)
    synth = aim_counts.reindex(all_keys, fill_value=0)

    l1_distance = (orig - synth).abs().sum()
    l1_per_bin = l1_distance / np.prod([cardinalities[i] for i in query])
    print(f"Query {query} with weight {weight}: L1 distance per bin = {l1_per_bin:.4f}")

iterating
Privacy used so far: rho = 0.0062499999999999995
iterating
Privacy used so far: rho = 0.03125
iterating
Privacy used so far: rho = 0.13125
iterating
Privacy used so far: rho = 0.23125
iterating
Privacy used so far: rho = 0.33125000000000004
iterating
Privacy budget nearly exhausted (up to rounding error). Exiting the loop.
Total privacy budget expended: rho = 0.42125000000000007
Query [0, 1, 4] with weight 0.7: L1 distance per bin = 0.0007
Query [2, 3, 4] with weight 0.7: L1 distance per bin = 0.0003
Query [0, 1, 2] with weight 0.3: L1 distance per bin = 0.0005
Query [1, 2] with weight 0.2: L1 distance per bin = 0.0009
Query [1, 3] with weight 0.1: L1 distance per bin = 0.0016


In [None]:

# AIMEval wraps the steps above into a single class; it only needs queries, weights, and cardinalities.

class AIMEval:
    """Generate random ordinal data, run AIM on it, and score the synthetic data.

    The pipeline is: ``make_data`` -> ``make_releases`` -> ``make_noisy_releases``
    -> ``make_synthetic_data`` -> ``get_error_per_query``; ``evaluate`` chains
    all of them.

    Args:
        queries: Sequence of queries, each a sequence of column indices.
        weights: One weight per query (same length as ``queries``).
        cardinalities: Number of categories per column; column ``j`` takes
            integer values in ``[0, cardinalities[j])``.
        d_in: Passed as ``d_in`` to ``make_ordinal_aim`` (symmetric distance).
        d_out: Passed as ``d_out`` to ``make_ordinal_aim`` (zero-concentrated
            divergence).
        iters: Iteration count passed to ``estimation.mirror_descent``.
    """

    def __init__(self, queries, weights, cardinalities, d_in=1.0, d_out=0.5, iters=100):
        assert len(queries) == len(weights), "queries and weights must have the same length"
        self.queries = queries
        self.weights = weights
        self.cardinalities = cardinalities
        self.d_in = d_in
        self.d_out = d_out
        self.iters = iters

    def make_data(self, num_rows):
        """Return a ``(num_rows, n_columns)`` int array of uniform random categories.

        Draws come from NumPy's global random state, so call ``np.random.seed``
        beforehand for reproducible data.
        """
        data = np.zeros((num_rows, len(self.cardinalities)), dtype=int)
        for col, cardinality in enumerate(self.cardinalities):
            data[:, col] = np.random.randint(0, cardinality, size=num_rows)
        return data

    def make_releases(self, data):
        """Return one ``LinearMeasurement`` per column holding its one-way histogram.

        NOTE(review): the histograms are exact counts of ``data`` (no noise is
        added here) — confirm that using them as prior releases is intended.
        """
        releases = [
            LinearMeasurement(
                noisy_measurement=np.bincount(data[:, i], minlength=self.cardinalities[i]),
                clique=(i,)
            )
            for i in range(len(self.cardinalities))
        ]
        return releases

    def make_noisy_releases(self, data, releases):
        """Build the ordinal AIM measurement and invoke it on ``data``.

        Returns whatever the measurement returns; it is later handed to
        ``estimation.mirror_descent`` as the list of measurements.
        """
        m_aim = dp.synthetic.make_ordinal_aim(
            dp.numpy.array2_domain(cardinalities=self.cardinalities, T=int),
            dp.symmetric_distance(),
            dp.zero_concentrated_divergence(),
            d_in=self.d_in,
            d_out=self.d_out,
            releases=releases,
            queries=self.queries,
            weights=self.weights
        )
        return m_aim(data)

    def make_synthetic_data(self, noisy_releases, num_rows):
        """Fit a model to ``noisy_releases`` and sample ``num_rows`` synthetic rows.

        Returns the object produced by ``model.synthetic_data``; callers read
        its ``.df`` attribute to get a DataFrame.
        """
        domain = MBIDomain(
            attributes=tuple(range(len(self.cardinalities))),
            shape=self.cardinalities
        )
        model = estimation.mirror_descent(
            domain,
            noisy_releases,
            iters=self.iters,
            callback_fn=lambda *_: None  # no-op callback
        )
        return model.synthetic_data(num_rows)

    def get_error_per_query(self, synth_df, true_df):
        """Compute and print the per-bin L1 distance between marginals, per query.

        For each query, the empirical distribution over the query's columns is
        computed for both frames, expanded to every possible value combination
        (missing combinations count as 0), and the summed absolute difference
        is divided by the number of bins.

        Args:
            synth_df: Synthetic data; columns must be labelled by column index.
            true_df: Original data with the same column labels.

        Returns:
            dict mapping ``tuple(query)`` to its L1 distance per bin.
        """
        num_rows_true = len(true_df)
        num_rows_synth = len(synth_df)
        l1_distances_per_bin = {}

        for query, weight in zip(self.queries, self.weights):
            columns = list(query)
            true_counts = true_df.groupby(columns).size().div(num_rows_true)
            synth_counts = synth_df.groupby(columns).size().div(num_rows_synth)

            ranges = [range(self.cardinalities[i]) for i in columns]
            if len(ranges) == 1:
                # Grouping by a single column yields a flat (non-tuple) index,
                # so the target keys must be scalars rather than 1-tuples.
                all_keys = list(ranges[0])
            else:
                all_keys = list(product(*ranges))
            true_counts = true_counts.reindex(all_keys, fill_value=0)
            synth_counts = synth_counts.reindex(all_keys, fill_value=0)

            l1_distance = (true_counts - synth_counts).abs().sum()
            l1_per_bin = l1_distance / np.prod([len(r) for r in ranges])
            print(f"Query {query} with weight {weight}: L1 distance per bin = {l1_per_bin:.4f}")
            l1_distances_per_bin[tuple(query)] = l1_per_bin

        return l1_distances_per_bin

    def evaluate(self, num_rows):
        """Run the full pipeline on freshly generated data.

        Args:
            num_rows: Number of rows to generate and to sample synthetically.

        Returns:
            ``(synth_df, errors)``: the synthetic DataFrame and the dict from
            ``get_error_per_query``.
        """
        data = self.make_data(num_rows)
        releases = self.make_releases(data)
        noisy_releases = self.make_noisy_releases(data, releases)
        synth_data = self.make_synthetic_data(noisy_releases, num_rows)

        synth_df = synth_data.df
        # `data` is a NumPy array (it has no `.df`); wrap it in a DataFrame whose
        # columns are labelled by column index so the queries can group on them.
        true_df = pd.DataFrame(data, columns=range(data.shape[1]))
        errors = self.get_error_per_query(synth_df, true_df)
        return synth_df, errors
    

In [None]:
# TODO: define the inputs for this run: queries, weights (possibly None — confirm), and cardinalities.

# Seed NumPy's global random state so this cell's random draws are repeatable.
np.random.seed(13)

# Planned query workloads (not yet implemented):
#   - one query covering all columns
#   - each column queried separately
#   - random combinations of columns

