In [1]:
import pyspark
import numpy as np
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
# Reuse the active Spark session if one exists; otherwise start a new one.
spark = SparkSession.builder.getOrCreate()

In [2]:
from sample_agg_blb import SampleAggregate

In [3]:
from pyspark.sql.types import FloatType, BooleanType

# PUMS California demographics CSV; the path is relative to the notebook's working directory.
filepath = "../../../dp-test-datasets/data/PUMS_california_demographics/data.csv"
# header="true" takes column names from the first row; inferSchema="true" lets Spark guess column types.
pums = spark.read.load(filepath, format="csv", sep=",",inferSchema="true", header="true")

# "_c0" is the name Spark assigns to a column without a header; presumably the
# CSV's first column is an unnamed row index — confirm against the data file.
pums = pums.withColumnRenamed("_c0", "PersonID")

# Cast explicitly instead of relying on the inferred types: income becomes a
# float, and the four demographic flags become booleans.
pums = pums.withColumn("income", col("income").cast(FloatType()))
pums = pums.withColumn("latino", col("latino").cast(BooleanType()))
pums = pums.withColumn("black", col("black").cast(BooleanType()))
pums = pums.withColumn("asian", col("asian").cast(BooleanType()))
pums = pums.withColumn("married", col("married").cast(BooleanType()))

# Wrap the DataFrame for the sample-and-aggregate workflow. The second argument
# (1000) is echoed back by sa.parts below; presumably it is the number of
# partitions — confirm in sample_agg_blb.
sa = SampleAggregate(pums, 1000)
print(sa.parts)

1000


In [4]:
# Columns handed to sa.sample(); must contain every grouping key listed in `keys`.
cols = ["sex", "married", "income", "age"] # needs to be a superset of keys
# Columns that define a group.
keys = ["sex", "married"]
# Every (sex, married) combination to report, with values in the same order as `keys`.
groups = [(0, True),(0, False), (1, True), (1, False)]

In [5]:
# Run the sampling step over the selected columns.
# NOTE(review): the sampling scheme itself lives in sample_agg_blb and is not visible here.
sa.sample(cols)

In [6]:
# Aggregate the income column per (sex, married) group.
# Per the original author's note, this is the step that performs the bootstrap;
# presumably that is why mean/count below receive (value, weight) pairs —
# confirm in sample_agg_blb.
sa.aggregate(["income"], keys, groups) # the value columns here need not include the keys; this step bootstraps

In [7]:
def mean(data):
    """Return the weighted mean of an iterable of ``(value, weight)`` pairs.

    Parameters
    ----------
    data : iterable of 2-tuples
        Each item is ``(value, weight)``. The iterable may be a one-shot
        iterator or generator; it is materialised exactly once.

    Returns
    -------
    float
        ``sum(value * weight) / sum(weight)``, or ``nan`` when ``data`` is
        empty or the weights sum to zero. ``nan`` is used rather than raising
        because ``mean_estimator`` combines these results with ``np.nanmean``.
    """
    # Materialise once: iterating `data` a second time would yield nothing
    # if the caller passed an iterator, silently producing a mean of 0.
    pairs = list(data)
    if not pairs:
        return float("nan")

    total_weight = np.sum([w for _, w in pairs])
    if total_weight == 0:
        return float("nan")

    weighted_sum = np.sum([v * w for v, w in pairs])
    return float(weighted_sum / total_weight)

def count(data):
    """Return the total weight of an iterable of ``(value, weight)`` rows.

    Parameters
    ----------
    data : iterable of sequences
        Each row carries its weight at index 1; the value at index 0 is
        ignored.

    Returns
    -------
    int
        The sum of the weights, truncated to ``int``. Empty input gives 0.
    """
    # Sum the weight field directly; no need to transpose the rows first,
    # which is what made empty input fail with IndexError.
    return int(np.sum([row[1] for row in data]))

In [8]:
sa.apply([mean, count])
sa.applied.toDF().show(6, False)

+----------+----------------------------+
|group     |val                         |
+----------+----------------------------+
|[0, true] |[47855.65197935947, 331193] |
|[0, false]|[26887.162495183664, 249152]|
|[1, true] |[17646.977309873768, 338341]|
|[1, false]|[24184.752956050652, 305306]|
|[0, true] |[49830.71719000795, 320855] |
|[0, false]|[31794.825233509087, 234038]|
+----------+----------------------------+
only showing top 6 rows



In [51]:
import math

def mean_estimator(data, eps, delta, lam, parts):
    """Return the mean of ``data`` clamped to ``[-lam, lam]`` plus Gaussian noise.

    Parameters
    ----------
    data : iterable of float
        Values to combine. NaN entries are ignored by the mean.
    eps, delta : float
        Privacy parameters used to scale the noise.
    lam : float
        Clamping bound. Values outside ``[-lam, lam]`` are clipped, and the
        noise scale is proportional to ``lam``.
    parts : int
        Divisor in the noise scale; presumably the number of partitions that
        produced ``data`` — confirm against the caller.

    Returns
    -------
    float
        Clamped mean plus a draw from ``Normal(0, sd)`` where
        ``sd = 2 * lam * sqrt(2 * ln(1.25 / delta)) / (parts * eps)``.

    Raises
    ------
    ValueError
        If ``lam`` is None. The noise scale cannot be computed without a
        bound, so fail clearly instead of with a TypeError on ``2 * lam``.
    """
    if lam is None:
        raise ValueError(
            "mean_estimator requires a clamping bound `lam` to calibrate its noise"
        )

    # list() first so that generators are accepted, as the original loop did.
    clipped = np.clip(np.asarray(list(data), dtype=float), -lam, lam)

    sd = (2 * lam * math.sqrt(2 * math.log(1.25 / delta))) / (parts * eps)

    # Reseed from fresh entropy before drawing. NOTE(review): presumably this
    # stops worker processes from sharing an inherited RNG state — confirm.
    np.random.seed()
    noise = np.random.normal(0, sd)

    theta = float(np.nanmean(clipped))
    return theta + noise
    
def median_estimator(data, eps, delta, lam, parts):
    """Return the plain median of ``data`` as a Python float.

    No clamping is applied and no noise is added, so this estimator is NOT
    private. ``eps``, ``delta``, ``lam`` and ``parts`` are accepted only so the
    signature matches ``mean_estimator``; they are ignored.
    """
    midpoint = np.median(data)
    return float(midpoint)

In [53]:
# Privacy parameters (epsilon, delta) forwarded to the estimator.
eps = 1.0
delta = 1E-9
# One bound per applied statistic, in the order [mean, count].
# NOTE(review): presumably sa.estimate passes each entry to the estimator as
# `lam` — confirm in SampleAggregate.estimate.
sens = [100000, 350000]

est = sa.estimate(mean_estimator, eps, delta, sens)

# Non-private alternative (plain median), kept for comparison:
#est = sa.estimate(median_estimator, eps, delta, sens)


# Show up to 5 rows without truncating cell contents.
est.toDF().show(5, False)

+----------+---------------------------------------+
|group     |val                                    |
+----------+---------------------------------------+
|[0, true] |[49732.92159408669, 342798.41428865964]|
|[0, false]|[27071.639451164112, 260483.5959718357]|
|[1, true] |[19739.93315391425, 330146.53035164456]|
|[1, false]|[23307.541272480736, 290003.6046781233]|
+----------+---------------------------------------+

