In [1]:
import pyspark
import numpy as np
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
# Obtain the SparkSession used by every later cell: getOrCreate() returns the
# already-active session if there is one, otherwise it builds a new one with
# default settings.
spark = SparkSession.builder.getOrCreate()

In [2]:
from sample_agg_blb import SampleAggregate

In [3]:
from pyspark.sql.types import FloatType, BooleanType

# Location of the PUMS California demographics CSV, relative to this notebook.
filepath = "../../../dp-test-datasets/data/PUMS_california_demographics/data.csv"

# Read the CSV (header row, inferred schema), name the unnamed first column,
# then pin the column types the rest of the notebook relies on: income as a
# float and the four demographic indicator columns as booleans.
pums = (
    spark.read.load(filepath, format="csv", sep=",", inferSchema="true", header="true")
    .withColumnRenamed("_c0", "PersonID")
    .withColumn("income", col("income").cast(FloatType()))
    .withColumn("latino", col("latino").cast(BooleanType()))
    .withColumn("black", col("black").cast(BooleanType()))
    .withColumn("asian", col("asian").cast(BooleanType()))
    .withColumn("married", col("married").cast(BooleanType()))
)

# Wrap the frame in the sample-and-aggregate helper and report its `parts`
# attribute (presumably the number of partitions it will use — TODO confirm
# against sample_agg_blb).
sa = SampleAggregate(pums)
print(sa.parts)

12239


In [4]:
# Grouping keys, and every (sex, married) combination that is reported on.
keys = ["sex", "married"]
groups = [(sex, married) for sex in (0, 1) for married in (True, False)]

# Columns handed to the sampling step: needs to be a superset of keys, so it
# is built from the keys plus the value columns.
cols = keys + ["income", "age"]

In [5]:
# Run the SampleAggregate sampling step over the selected columns.
# The column list must be a superset of the grouping keys used when aggregating.
sa.sample(cols)

In [11]:
# Aggregate the value columns per (sex, married) group. The value-column list
# passed here does not need to include the grouping keys themselves.
# Per the original author's note this is the step that performs the bootstrap
# — TODO confirm against sample_agg_blb.
sa.aggregate(["income", "age"], keys, groups)

In [12]:
def mean_income(data):
    """Return the weighted mean of the income field.

    Args:
        data: iterable of records. The first element of each record is the
            income and the last element is the record's weight (the records
            used in this notebook are ``(income, age, weight)``). Any
            iterable is accepted, including one-shot iterators/generators.

    Returns:
        float: ``sum(income * weight) / sum(weight)``.

    Raises:
        IndexError: if ``data`` yields no records.
    """
    # Materialize exactly once: the records are needed both column-wise and
    # pair-wise, and a one-shot iterator would be empty on a second pass
    # (which silently produced a weighted sum of 0).
    rows = list(data)
    fields = list(zip(*rows))
    income = fields[0]
    weights = fields[-1]
    weighted_sum = np.sum([value * weight for value, weight in zip(income, weights)])
    return float(weighted_sum / np.sum(weights))

def count(data):
    """Return the total weight of the records, truncated to an int.

    Each record's last element is taken as its weight; the weights are summed
    and the sum is converted with ``int``.

    NOTE(review): this definition shadows the ``count`` function that
    ``from pyspark.sql.functions import *`` brings into the notebook namespace.
    """
    transposed = tuple(zip(*data))
    weight_column = transposed[-1]
    total = np.sum(weight_column)
    return int(total)

def mean_age(data):
    """Return the weighted mean of the age field.

    Args:
        data: iterable of records. The second element of each record is the
            age and the last element is the record's weight (the records used
            in this notebook are ``(income, age, weight)``). Any iterable is
            accepted, including one-shot iterators/generators.

    Returns:
        float: ``sum(age * weight) / sum(weight)``.

    Raises:
        IndexError: if ``data`` yields no records.
    """
    # Materialize exactly once: the records are needed both column-wise and
    # pair-wise, and a one-shot iterator would be empty on a second pass
    # (which silently produced a weighted sum of 0).
    rows = list(data)
    fields = list(zip(*rows))
    age = fields[1]
    weights = fields[-1]
    weighted_sum = np.sum([value * weight for value, weight in zip(age, weights)])
    return float(weighted_sum / np.sum(weights))



In [14]:
# Evaluate the three statistics through SampleAggregate.apply. The order of
# this list is the order of the values in each output row's `val` array.
sa.apply([mean_income, mean_age, count])
# Preview the first 6 result rows with column truncation disabled.
sa.applied.toDF().show(6, False)

+----------+------------------------------------------------+
|group     |val                                             |
+----------+------------------------------------------------+
|[0, true] |[51058.70303378147, 52.18651987639935, 344982]  |
|[0, false]|[39287.49521914742, 37.935139052061984, 233222] |
|[1, true] |[17472.706624496237, 49.002191397463996, 345898]|
|[1, false]|[18887.60595551702, 47.816666110907335, 299890] |
|[0, true] |[41068.753245853964, 48.65728576032215, 375479] |
|[0, false]|[22265.36096159232, 47.719686477320245, 260651] |
+----------+------------------------------------------------+
only showing top 6 rows



In [15]:
import math

def mean_estimator(data, eps, delta, lam, parts):
    """Return a noisy mean of ``data`` with Gaussian noise added.

    Every value is first clamped to ``[-lam, lam]``; the NaN-ignoring mean of
    the clamped values is then perturbed with zero-mean Gaussian noise of
    standard deviation ``2 * lam * sqrt(2 * ln(1.25 / delta)) / (parts * eps)``.
    That is the Gaussian-mechanism calibration for a mean whose sensitivity is
    ``2 * lam / parts`` — which presumes ``parts`` is the number of values
    being averaged (TODO confirm against the caller).

    Args:
        data: iterable of numeric values; NaN entries are skipped by the mean.
        eps: privacy parameter epsilon (divides the noise scale).
        delta: privacy parameter delta.
        lam: clamping bound; required, because the noise scale is proportional
            to it.
        parts: divisor applied to the noise scale together with ``eps``.

    Returns:
        float: clamped mean plus one Gaussian noise draw.

    Raises:
        ValueError: if ``lam`` is None. Without a bound there is no finite
            sensitivity and the noise cannot be calibrated (previously this
            surfaced as an opaque TypeError from ``2 * None``).
    """
    if lam is None:
        raise ValueError(
            "mean_estimator requires a clamping bound `lam`; got None"
        )
    # Chained conditional (rather than min/max) so NaN values pass through
    # untouched and are later skipped by nanmean.
    data = [-lam if d < -lam else lam if d > lam else d for d in data]
    sd = (2 * lam * math.sqrt(2 * math.log(1.25 / delta))) / (parts * eps)
    # Re-seed NumPy's global RNG from fresh OS entropy on every call —
    # presumably so parallel workers never share RNG state; TODO confirm.
    np.random.seed()
    noise = np.random.normal(0, sd)
    theta = float(np.nanmean(data))
    return theta + noise
    
def median_estimator(data, eps, delta, lam, parts):
    """Return the plain median of ``data`` as a float.

    Not private: no clamping or noise is applied, and ``eps``, ``delta``,
    ``lam`` and ``parts`` are accepted but unused, so the signature matches
    ``mean_estimator``.
    """
    center = np.median(data)
    return float(center)

In [None]:
# Privacy parameters forwarded to the estimator (both enter the Gaussian
# noise scale used by mean_estimator).
eps = 1.0
delta = 1E-9
# One bound per statistic — presumably in the same order the statistics were
# applied (mean income, mean age, count) and handed to the estimator as its
# `lam` clamping bound; TODO confirm against SampleAggregate.estimate.
sens = [100000, 65, 350000]

# Combine the per-partition statistics into one noisy estimate per group.
est = sa.estimate(mean_estimator, eps, delta, sens)

# Non-private alternative, kept for comparison:
#est = sa.estimate(median_estimator, eps, delta, sens)


# Preview the first 5 estimate rows with column truncation disabled.
est.toDF().show(5, False)