In [1]:
import polars

from qif_micro import model
from qif_micro import qif

# Set polars' engine affinity to "streaming" for the rest of the notebook.
# The call's return value is bound to `_` so this cell does not end in a bare
# expression (presumably to keep the cell output empty).
_ = polars.Config.set_engine_affinity("streaming")

Consider the following dataset as an example:

In [2]:
# Toy microdata: each key is a customer uid, each value is that customer's
# list of transactions.
transactions_by_uid = {
    0: [
        {"payee": "Red Rooster", "description": "Chicken burger", "amount": 7.00},
        {"payee": "Clinic", "description": "Fertility treatment", "amount": 150.00}
    ],
    1: [{"payee": "Aldi", "description": "Groceries", "amount": 20.00}],
    2: [
        {"payee": "Uber", "description": "23 minutes trip", "amount": 20.00},
        {"payee": "Lakeside", "description": "One night", "amount": 50.00},
        {"payee": "Clinic", "description": "Skin cancer treatment", "amount": 150.00}
    ],
    3: [{"payee": "Uber", "description": "13 minutes trip", "amount": 15.00}],
    4: [{"payee": "Uber", "description": "15 minutes trip", "amount": 15.00}]
}

# Flatten to one row per transaction, tagging every row with its owner's uid.
# Building the rows in a single pass keeps each uid aligned with its
# transaction by construction, instead of relying on four parallel lists that
# must be generated in the same order.
transaction_rows = [
    {"uid": uid, **transaction}
    for uid, transactions in transactions_by_uid.items()
    for transaction in transactions
]

# Column order follows the row dicts: uid, payee, description, amount.
dataset = polars.DataFrame(transaction_rows)

dataset

uid,payee,description,amount
i64,str,str,f64
0,"""Red Rooster""","""Chicken burger""",7.0
0,"""Clinic""","""Fertility treatment""",150.0
1,"""Aldi""","""Groceries""",20.0
2,"""Uber""","""23 minutes trip""",20.0
2,"""Lakeside""","""One night""",50.0
2,"""Clinic""","""Skin cancer treatment""",150.0
3,"""Uber""","""13 minutes trip""",15.0
4,"""Uber""","""15 minutes trip""",15.0


Instead of sharing the raw microdata, 
we are interested in sharing the total amount spent and the number of transactions made by each customer:

In [3]:
# Per-customer release: total amount spent and number of transactions.
sum_expr = polars.col("amount").sum().alias("sum")
count_expr = polars.col("amount").len().alias("count")

dataset_agg = (
    dataset
    .group_by("uid")
    .agg(sum_expr, count_expr)
    .sort("uid")
)
dataset_agg

uid,sum,count
i64,f64,u32
0,157.0,2
1,20.0,1
2,220.0,3
3,15.0,1
4,15.0,1


We then assume that the adversary knows the exact amount of one of a target's transactions,
and they want to infer the aggregated statistics of this target.

In [4]:
# Column names of the aggregated release, reused by the cells below.
owner_attr, count_attr, sum_attr = "uid", "count", "sum"

# Build the adversary's prior and channel from the aggregated data.
prior, ch = model.agg_count_sum.build(
    dataset_agg,
    owner_attr,
    count_attr,
    sum_attr,
)

The adversary's knowledge about the target's total amount spent and number of transactions,
upon observing the aggregated data, is

In [5]:
# Order the prior's outcomes by (count, sum) before materialising it for display.
prior_sorted = prior.dist.sort(by=[count_attr, sum_attr])
prior_sorted.collect()

count,sum,p
u32,f64,f64
1,15.0,0.4
1,20.0,0.2
2,157.0,0.2
3,220.0,0.2


And the channel that models the correlation between each aggregated record and the possible auxiliary information is

In [6]:
def as_matrix(
    dist: polars.LazyFrame,
    count_attr: str,
    sum_attr: str,
    qid_attr: str,
) -> polars.DataFrame:
    """Materialise a long-format distribution as a wide matrix for display.

    Each distinct ``(count_attr, sum_attr)`` pair becomes one row and each
    distinct value of ``qid_attr`` becomes one column. Combinations that do
    not occur in ``dist`` are shown as ``0.0``.

    Args:
        dist: Lazy frame holding the distribution in long format. It must
            contain the columns ``count_attr``, ``sum_attr`` and ``qid_attr``;
            the remaining column(s) supply the cell values.
        count_attr: Name of the transaction-count column (part of the row key).
        sum_attr: Name of the total-amount column (part of the row key).
        qid_attr: Name of the column whose values become the matrix columns.

    Returns:
        An eager ``polars.DataFrame`` (the lazy input is collected here),
        sorted by ``(count_attr, sum_attr)`` with ``count_attr`` cast to an
        integer column.
    """
    return (
        dist
        .collect()
        # Sort by the pivot key first so the generated columns come out in
        # ascending order of that key.
        .sort(qid_attr)
        .pivot(on=qid_attr, index=[count_attr, sum_attr])
        # Pairs that never co-occur are absent from the long format; show
        # them as zero rather than null.
        .fill_null(0.0)
        .with_columns(**{count_attr: polars.col(count_attr).cast(int)})
        .sort(by=[count_attr, sum_attr])
    )

# Show the channel with one column per value of the "qid" column.
channel_matrix = as_matrix(ch.dist, count_attr, sum_attr, qid_attr="qid")
channel_matrix

count,sum,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0,12.0,13.0,14.0,15.0,16.0,17.0,18.0,19.0,20.0,21.0,22.0,23.0,24.0,25.0,26.0,27.0,28.0,29.0,30.0,31.0,32.0,33.0,34.0,…,184.0,185.0,186.0,187.0,188.0,189.0,190.0,191.0,192.0,193.0,194.0,195.0,196.0,197.0,198.0,199.0,200.0,201.0,202.0,203.0,204.0,205.0,206.0,207.0,208.0,209.0,210.0,211.0,212.0,213.0,214.0,215.0,216.0,217.0,218.0,219.0,220.0
i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1,15.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,157.0,0.006329,0.006329,0.006329,0.006329,0.006329,0.006329,0.006329,0.006329,0.006329,0.006329,0.006329,0.006329,0.006329,0.006329,0.006329,0.006329,0.006329,0.006329,0.006329,0.006329,0.006329,0.006329,0.006329,0.006329,0.006329,0.006329,0.006329,0.006329,0.006329,0.006329,0.006329,0.006329,0.006329,0.006329,0.006329,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,220.0,0.009009,0.008968,0.008927,0.008887,0.008846,0.008805,0.008764,0.008724,0.008683,0.008642,0.008601,0.008561,0.00852,0.008479,0.008438,0.008398,0.008357,0.008316,0.008275,0.008234,0.008194,0.008153,0.008112,0.008071,0.008031,0.00799,0.007949,0.007908,0.007868,0.007827,0.007786,0.007745,0.007705,0.007664,0.007623,…,0.001508,0.001468,0.001427,0.001386,0.001345,0.001304,0.001264,0.001223,0.001182,0.001141,0.001101,0.00106,0.001019,0.000978,0.000938,0.000897,0.000856,0.000815,0.000775,0.000734,0.000693,0.000652,0.000611,0.000571,0.00053,0.000489,0.000448,0.000408,0.000367,0.000326,0.000285,0.000245,0.000204,0.000163,0.000122,8.2e-05,4.1e-05


From the adversary's knowledge and the model above, 
we can get a joint distribution as follows:

In [7]:
# Combine the prior with the channel to obtain the adversary's joint distribution.
joint = qif.push(prior, ch)
as_matrix(joint.dist, count_attr, sum_attr, qid_attr="qid")

count,sum,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0,12.0,13.0,14.0,15.0,16.0,17.0,18.0,19.0,20.0,21.0,22.0,23.0,24.0,25.0,26.0,27.0,28.0,29.0,30.0,31.0,32.0,33.0,34.0,…,184.0,185.0,186.0,187.0,188.0,189.0,190.0,191.0,192.0,193.0,194.0,195.0,196.0,197.0,198.0,199.0,200.0,201.0,202.0,203.0,204.0,205.0,206.0,207.0,208.0,209.0,210.0,211.0,212.0,213.0,214.0,215.0,216.0,217.0,218.0,219.0,220.0
i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1,15.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,157.0,0.001266,0.001266,0.001266,0.001266,0.001266,0.001266,0.001266,0.001266,0.001266,0.001266,0.001266,0.001266,0.001266,0.001266,0.001266,0.001266,0.001266,0.001266,0.001266,0.001266,0.001266,0.001266,0.001266,0.001266,0.001266,0.001266,0.001266,0.001266,0.001266,0.001266,0.001266,0.001266,0.001266,0.001266,0.001266,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,220.0,0.001802,0.001794,0.001785,0.001777,0.001769,0.001761,0.001753,0.001745,0.001737,0.001728,0.00172,0.001712,0.001704,0.001696,0.001688,0.00168,0.001671,0.001663,0.001655,0.001647,0.001639,0.001631,0.001622,0.001614,0.001606,0.001598,0.00159,0.001582,0.001574,0.001565,0.001557,0.001549,0.001541,0.001533,0.001525,…,0.000302,0.000294,0.000285,0.000277,0.000269,0.000261,0.000253,0.000245,0.000236,0.000228,0.00022,0.000212,0.000204,0.000196,0.000188,0.000179,0.000171,0.000163,0.000155,0.000147,0.000139,0.00013,0.000122,0.000114,0.000106,9.8e-05,9e-05,8.2e-05,7.3e-05,6.5e-05,5.7e-05,4.9e-05,4.1e-05,3.3e-05,2.4e-05,1.6e-05,8e-06


Now, the above is the joint distribution from the adversary's point of view,
but we have the actual data,
so we can construct a more accurate joint distribution,
which we will then use to evaluate how well the adversary performs.

In [8]:
# Baseline built from the raw microdata rather than from the aggregated data;
# here the auxiliary information is the "amount" column itself.
amount_attr = "amount"
prior_baseline, ch_baseline = model.agg_count_sum.baseline(
    dataset,
    owner_attr,
    count_attr,
    sum_attr,
    amount_attr,
)
baseline = qif.push(prior_baseline, ch_baseline)
as_matrix(baseline.dist, count_attr, sum_attr, qid_attr=amount_attr)

count,sum,7.0,15.0,20.0,50.0,150.0
i64,f64,f64,f64,f64,f64,f64
1,15.0,0.0,0.4,0.0,0.0,0.0
1,20.0,0.0,0.0,0.2,0.0,0.0
2,157.0,0.1,0.0,0.0,0.0,0.1
3,220.0,0.0,0.0,0.066667,0.066667,0.066667


Notice that, despite the fact that from the adversary's point of view lots of amount values seemed possible,
most of them in practice would never be part of the adversary's auxiliary information.
With this we can compute the adversary's expected chance of success as

In [9]:
# Adversary's expected chance of success, evaluated against the baseline joint.
adversary_success = qif.posterior(prior, ch, baseline)
f"{adversary_success * 100}%"

'76.66666666666666%'

Had we decided to share the raw microdata, the adversary's chance of success would be

In [10]:
# Same measure when the adversary works from the baseline prior and channel.
baseline_success = qif.posterior(prior_baseline, ch_baseline, baseline)
f"{baseline_success * 100}%"

'86.66666666666666%'

Notice that what lowers the adversary's expected chance of success, compared with sharing the raw microdata, is the case when the auxiliary information is 7.00,
since, instead of inferring that the target's record is (count = 2, sum = 157.00),
the adversary is tricked into guessing (count = 3, sum = 220.00).