#   Creating Spark Session entry point


In [1]:
from pyspark.sql import SparkSession
# Build the session (or reuse an already-running one) that serves as the
# entry point for the rest of the notebook.
spark = (
    SparkSession.builder
    .appName("spark-rdd-collection")
    .master("local")
    .getOrCreate()
)


# Create RDD from Collection

In [2]:
# Sample (key, value) pairs; keys repeat so they can be merged per key later.
data = [
    ("fox", 6),
    ("dog", 5),
    ("fox", 3),
    ("dog", 8),
    ("cat", 1),
    ("cat", 2),
    ("cat", 3),
    ("cat", 4),
]

# Use the SparkContext (exposed as `sc` in the PySpark shell; here `spark.sparkContext`) to create an RDD named `rdd`

In [3]:
# Create an RDD from the in-memory `data` list via the session's SparkContext.
rdd = spark.sparkContext.parallelize(data)

In [4]:
# Pull every element back as a Python list to inspect the RDD's contents.
rdd.collect()

[('fox', 6),
 ('dog', 5),
 ('fox', 3),
 ('dog', 8),
 ('cat', 1),
 ('cat', 2),
 ('cat', 3),
 ('cat', 4)]

In [5]:
# Number of elements in the RDD.
rdd.count()

8

# Aggregate and Merge Values of Keys
The reduceByKey() transformation is used to merge and aggregate values.

In [6]:
# Merge the values that share a key by adding them together, two at a time.
sum_per_key = rdd.reduceByKey(lambda left, right: left + right)
sum_per_key.collect()

[('fox', 9), ('dog', 13), ('cat', 10)]

# Filter RDD's Elements

In [9]:
# Keep only the (key, total) pairs whose total is greater than 9.
sum_filtered = sum_per_key.filter(lambda pair: pair[1] > 9)
sum_filtered.collect()

[('dog', 13), ('cat', 10)]

# Group Similar Keys

In [10]:
# Gather all values under their key. Each group is returned as a
# ResultIterable rather than a plain list, so the raw collect() output
# shows object reprs instead of the grouped values.
grouped = rdd.groupByKey()
grouped.collect() 

[('fox', <pyspark.resultiterable.ResultIterable at 0x24150a9abe0>),
 ('dog', <pyspark.resultiterable.ResultIterable at 0x24150a9aca0>),
 ('cat', <pyspark.resultiterable.ResultIterable at 0x24150a9ad00>)]

In [11]:
# Turn each key's ResultIterable into a list so the grouped values are
# readable. mapValues transforms only the value of each pair and leaves the
# key untouched, so the (key, value) tuple does not need rebuilding by hand.
grouped.mapValues(list).collect()

[('fox', [6, 3]), ('dog', [5, 8]), ('cat', [1, 2, 3, 4])]

# Aggregate Values for Similar Keys
To aggregate and sum up the values for each key, we may apply the mapValues() transformation to the grouped RDD together with the sum() function.

In [12]:
# Total the grouped values for each key; the built-in sum is passed directly
# as the function applied to every value group.
aggregated = grouped.mapValues(sum)
aggregated.collect()

[('fox', 9), ('dog', 13), ('cat', 10)]