In [1]:
import pandas as pd
from pyspark.sql import SparkSession
from tmlt.analytics.session import Session
from tmlt.analytics.keyset import KeySet
from tmlt.analytics.query_builder import QueryBuilder
from tmlt.analytics.privacy_budget import ApproxDPBudget, PureDPBudget
from tmlt.analytics.protected_change import AddOneRow
import random
import numpy as np
from pyspark.sql.functions import rand, floor, col



In [2]:
# Reuse the active Spark session if there is one; otherwise start a new one.
spark = SparkSession.builder.getOrCreate()

# Seed the random generators so that re-running the notebook draws from the
# same random streams (values can still differ if Spark partitions the range
# differently). Each column gets its own seed so the columns are not derived
# from one identical stream.
SEED = 42

# Synthetic table of 1,000,000 rows with four integer columns:
#   A in 1..5000, B in 1..100, C in 1..10, D in 1..10
df = (
    spark.range(1_000_000)
    .withColumn("A", (floor(rand(SEED) * 5_000) + 1).cast("int"))
    .withColumn("B", (floor(rand(SEED + 1) * 100) + 1).cast("int"))
    .withColumn("C", (floor(rand(SEED + 2) * 10) + 1).cast("int"))
    .withColumn("D", (floor(rand(SEED + 3) * 10) + 1).cast("int"))
    .drop("id")  # drop the default range column
)


25/10/04 11:48:46 WARN Utils: Your hostname, Bayards-Macbook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.1.153 instead (on interface en0)
25/10/04 11:48:46 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/10/04 11:48:47 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Approx DP Bug
#
# Step 1: open a session over `df` with an approximate-DP budget of
# epsilon = infinity and delta = 1, protecting the addition/removal of one row.
approx_sess = Session.from_dataframe(
    source_id="simpledata",
    dataframe=df,
    protected_change=AddOneRow(),
    privacy_budget=ApproxDPBudget(epsilon=float("inf"), delta=1),
)

# Step 2: ask the session for the (A, B) groups, spending (epsilon=5, delta=1e-6).
keyset_query = QueryBuilder("simpledata").get_groups(["A", "B"])
groups = approx_sess.evaluate(
    keyset_query,
    ApproxDPBudget(epsilon=5, delta=1e-6),
)

# Step 3: combine the returned (A, B) groups with the fixed C domain 1..10
# to obtain the keyset used for the group-by below.
keyset = (
    KeySet.from_dataframe(groups)
    * KeySet.from_dict({"C": range(1, 11)})
)


25/10/04 11:49:01 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
                                                                                

In [4]:
# Query Execution
# Count rows per key of `keyset`, evaluated with an approximate-DP budget
# of epsilon = 1 and delta = 0.1, then print the first rows of the result.
query = (
    QueryBuilder("simpledata")
    .groupby(keyset)
    .count()
)
result = approx_sess.evaluate(
    query,
    ApproxDPBudget(epsilon=1, delta=0.1),
)
result.show()



+---+---+---+-----+
|  A|  B|  C|count|
+---+---+---+-----+
|  1|  6|  8|    0|
|  1| 11|  7|    1|
|  1| 15|  3|    0|
|  1| 31|  5|    1|
|  1| 31|  8|   -1|
|  1| 36|  2|    3|
|  1| 36| 10|   -1|
|  1| 38|  2|    1|
|  1| 38|  9|    2|
|  1| 51|  2|   -3|
|  1| 59|  6|   -1|
|  1| 59|  9|   -1|
|  1| 92|  6|    1|
|  1| 99|  3|   -1|
|  1| 99|  7|   -1|
|  1| 99|  8|    0|
|  2|  3|  6|    0|
|  2|  8|  2|   -2|
|  2|  8|  9|    1|
|  2| 13|  6|    0|
+---+---+---+-----+
only showing top 20 rows



                                                                                

In [5]:
# Rebuild `keyset` directly from `df` (not from a query result): every distinct
# A value and every distinct B value present in the data, plus the C domain 1..10.
a_keys = KeySet.from_dataframe(df.select(["A"]).distinct())
b_keys = KeySet.from_dataframe(df.select(["B"]).distinct())
c_keys = KeySet.from_dict({"C": range(1, 11)})

# Parenthesised as A * (B * C).
keyset = a_keys * (b_keys * c_keys)

In [6]:
# Query Execution
# Count rows per key of `keyset` on the same session, this time evaluated
# with a pure-DP budget of epsilon = 1, then print the first rows of the result.
query = (
    QueryBuilder("simpledata")
    .groupby(keyset)
    .count()
)
result = approx_sess.evaluate(
    query,
    PureDPBudget(epsilon=1),
)
result.show()

25/10/04 11:49:18 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/10/04 11:49:18 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/10/04 11:49:18 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/10/04 11:49:18 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/10/04 11:49:20 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/10/04 11:49:20 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/10/04 11:50:18 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
25/10/04 11:50:18 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 84.44% fo

+---+---+---+-----+
|  A|  B|  C|count|
+---+---+---+-----+
|  1|  2|  9|    2|
|  1|  3|  7|    0|
|  1|  4|  3|    1|
|  1|  4|  8|   -1|
|  1|  4|  9|    0|
|  1|  5|  2|   -2|
|  1|  7|  7|    2|
|  1| 10|  1|    1|
|  1| 11|  6|   -2|
|  1| 11|  9|    2|
|  1| 15|  8|   -1|
|  1| 15| 10|   -2|
|  1| 18|  4|    1|
|  1| 19|  2|    0|
|  1| 19|  6|    0|
|  1| 19| 10|    0|
|  1| 20|  6|    0|
|  1| 20|  7|    0|
|  1| 21|  8|    0|
|  1| 23|  5|    1|
+---+---+---+-----+
only showing top 20 rows



25/10/04 11:50:18 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
                                                                                

In [7]:
# Print the first rows of `groups`, the (A, B) groups returned by the
# get_groups query evaluated earlier in this notebook.
# NOTE(review): the recorded output lists B values above 100 (e.g. 132, 953),
# while B is generated as floor(rand() * 100) + 1, i.e. in 1..100. This looks
# like either stale output from an earlier dataset or the behaviour this
# notebook is meant to report — confirm by re-running from a fresh kernel.
groups.show()

+---+---+
|  A|  B|
+---+---+
|  1| 56|
|  1| 64|
|  1| 80|
|  1|132|
|  1|175|
|  1|180|
|  1|255|
|  1|269|
|  1|271|
|  1|352|
|  1|365|
|  1|442|
|  1|622|
|  1|685|
|  1|772|
|  1|780|
|  1|845|
|  1|878|
|  1|881|
|  1|953|
+---+---+
only showing top 20 rows

