In [11]:
import pyspark
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
# Obtain the SparkSession used by every cell below (reuses an existing session if there is one).
spark = SparkSession.builder.getOrCreate()

 Load the CSV file and randomly assign each row to a partition

In [12]:
# Input file: comma-separated values with a header row.
filepath = "data/PUMS_california_demographics_1000/data.csv"
num_partitions = 25      # how many partitions the rows are spread over
partition_seed = 12345   # fixed seed so the random partition assignment can be repeated

# Read the CSV and let Spark infer the column types from the data.
df5 = spark.read.load(filepath, format="csv", sep=",",
                      inferSchema="true", header="true")
# Attach a random key to every row, then range-partition on that key so rows
# land in partitions at random rather than in file order.
# NOTE(review): a seeded rand() is repeatable only while the input is read with
# the same layout; confirm if the source file or cluster configuration changes.
df5 = df5.withColumn("random", rand(seed=partition_seed))
df5 = df5.repartitionByRange(num_partitions, "random")

In [13]:
# Preview the loaded rows together with their `random` partitioning key.
df5.show()

+---+---+----+----+------+-------+--------------------+
|age|sex|educ|race|income|married|              random|
+---+---+----+----+------+-------+--------------------+
| 58|  0|  13|   1| 60900|      1| 0.02260264856177685|
| 46|  1|  13|   1| 47000|      1| 0.03187375684198912|
| 40|  0|  13|   1|149000|      0|0.003939459108912358|
| 23|  0|  11|   1| 66000|      0|0.023179592655977355|
| 82|  1|   9|   3|   690|      0| 0.03316615433891279|
| 48|  1|  12|   1|  6800|      0|0.023148339591709144|
| 66|  0|  16|   1|120000|      1| 0.01246869426324404|
| 28|  1|   2|   3|     0|      1| 0.03519417148590831|
| 59|  0|  13|   1| 30000|      1|0.005068340765637114|
| 47|  1|  13|   1| 75800|      0|0.004785087357865314|
| 29|  0|   9|   1| 40000|      1| 0.02093951166607999|
| 46|  0|   9|   1|  9500|      0|0.034263355225832015|
| 78|  1|  13|   4| 29300|      1|0.019628224499870894|
| 38|  0|  12|   3| 40000|      1|1.402972422714921E-4|
| 30|  0|   8|   1| 50000|      1|0.009609894149

For each partition, run the estimator, and collect the results

In [20]:

def mean(partition):
    """Yield the arithmetic mean of element ``[0]`` of every row in a partition.

    This is a generator taking a single iterable of rows. It is not called in
    the visible cells; presumably it is meant for ``RDD.mapPartitions`` like
    ``sample_and_aggregate_BLB``.

    NOTE(review): defining ``mean`` here rebinds the ``mean`` name that
    ``from pyspark.sql.functions import *`` brings into the notebook.

    Args:
        partition: iterable of rows; only ``row[0]`` is read.

    Yields:
        One value, ``total / count``. Nothing is yielded for an empty
        partition, because the mean of zero rows is undefined and the
        unconditional division would raise ZeroDivisionError.
    """
    total = 0
    count = 0
    for row in partition:
        total = total + row[0]
        count = count + 1
    if count == 0:
        # Empty partition: contribute no estimate instead of dividing by zero.
        return
    yield total / count
    
    
def weightedMean(values, weights):
    """Return the weighted average of ``values``.

    Computes ``sum(values[k] * weights[k]) / sum(weights[k])`` for every index
    ``k`` of ``values``; ``weights`` is indexed in lockstep, so only its first
    ``len(values)`` entries are consulted.
    """
    # Explicit accumulation on purpose: the notebook's wildcard import from
    # pyspark.sql.functions may rebind the builtin `sum`.
    numerator = 0
    denominator = 0
    for idx, value in enumerate(values):
        weight = weights[idx]
        numerator = numerator + (value * weight)
        denominator = denominator + weight
    return numerator / denominator
    
    
# Bootstrap parameters: module-level settings read by sample_and_aggregate_BLB.
N = df5.count()  # row count of the whole DataFrame; number of multinomial draws per resample
r = 50           # number of bootstrap resamples drawn within each partition
weightedEstimator = weightedMean  # estimator(values, weights) computing the quantity of interest (QOI)

def sample_and_aggregate_BLB(partition, n_total=None, n_runs=None, estimator=None):
    """Bootstrap estimate for a single partition.

    Takes one partition (as passed by ``RDD.mapPartitions``) and yields at most
    one number: the average of the estimator over ``n_runs`` weighted
    resamples of that partition.

    Args:
        partition: iterable of rows; only element ``[0]`` of each row is used.
        n_total: number of multinomial draws per resample. Defaults to the
            module-level ``N``.
        n_runs: number of resamples per partition. Defaults to the
            module-level ``r``.
        estimator: callable ``estimator(values, weights)`` returning the
            quantity of interest. Defaults to the module-level
            ``weightedEstimator``.

    Yields:
        ``np.mean`` of the ``n_runs`` estimator results. Nothing is yielded
        for an empty partition, where computing ``1 / len`` would otherwise
        raise ZeroDivisionError.
    """
    # Pull the first field out of every row in the partition.
    vals = [row[0] for row in partition]
    count = len(vals)
    if count == 0:
        # An empty partition has nothing to resample; emit no estimate.
        return

    # Fall back to the notebook-level bootstrap settings when not overridden.
    if n_total is None:
        n_total = N
    if n_runs is None:
        n_runs = r
    if estimator is None:
        estimator = weightedEstimator

    # Each row of `weights` is one resample: how many of the n_total draws
    # landed on each value, every value being equally likely. The explicit
    # 1.0 keeps the probability a float.
    weights = np.random.multinomial(n_total, np.full(count, 1.0 / count), size=n_runs)

    # Evaluate the quantity of interest once per resample, then average.
    results = [estimator(vals, weights[run]) for run in range(n_runs)]
    yield np.mean(results)
   
    
# Apply the bootstrap estimator to each partition of the `age` column and
# collect the per-partition results into a local list.
means = df5.select("age").rdd.mapPartitions(sample_and_aggregate_BLB).collect()


Compare the mean of the per-partition bootstrap means with the mean of the full data set

In [21]:
# Average of the per-partition bootstrap estimates collected in `means` ...
print(np.mean(means))
# ... next to the mean of `age` computed directly over the whole DataFrame.
print(df5.select(avg("age")).collect())


44.7960488
[Row(avg(age)=44.797)]
