# Spark aggregation

In [1]:
from pyspark.sql  import SparkSession
spark=SparkSession.builder  \
    .appName("spark-agg") \
    .master("local")    \
    .getOrCreate()

In [2]:
tuples = [('A', 7), ('A', 8), ('A', -4),
          ('B', 3), ('B', 9), ('B', -1),
          ('C', 1), ('C', 5)]
rdd = spark.sparkContext.parallelize(tuples)

In [3]:
# drop negative values
positives = rdd.filter(lambda x: x[1] > 0)
positives.collect()

[('A', 7), ('A', 8), ('B', 3), ('B', 9), ('C', 1), ('C', 5)]

The groupByKey() transformation can cause out of disk problems as data is sent over the network of Spark servers and collected on the reduce workers. 

When the number values per key are in the thousands or millions, there might be an OOM (running out of memory) error.


In [7]:
# find sum and avergae per key using groupByKey()
sum_and_avg = positives.groupByKey() \
    .mapValues(lambda v: (sum(v), float(sum(v))/len(v)))
print(sum_and_avg)

PythonRDD[19] at RDD at PythonRDD.scala:53


In [6]:
# find sum and average per key using reduceByKey()
# 1. create (sum, count) per key
sum_count = positives.mapValues(lambda v: (v, 1))

# 2. aggregate (sum, count) per key
sum_count_agg = sum_count.reduceByKey(lambda x, y:
     (x[0]+y[0], x[1]+y[1]))
     
# 3. finalize sum and average per key
sum_and_avg = sum_count_agg.mapValues(
    lambda v: (v[0], float(v[0])/v[1]))