In [1]:
from pyspark.sql import SparkSession

spark = (SparkSession.builder.appName("Transformations")
         .master("local[*]")
         .config("spark.executor.memory", "1g")
         .getOrCreate()
         )

sc = spark.sparkContext

# 1. Map & FlatMap

- map(func): Applies a function to each element and returns a new RDD

In [2]:
rdd = sc.parallelize([1, 2, 3, 4])
rdd_map = rdd.map(lambda x: x * 2)
print(rdd_map.collect())  # [2, 4, 6, 8]

[2, 4, 6, 8]


- flatMap(func): Similar to map, but flattens the result

In [3]:
rdd = sc.parallelize(["hello world", "spark rdd"])
rdd_flatmap = rdd.flatMap(lambda x: x.split(" "))
print(rdd_flatmap.collect())  # ['hello', 'world', 'spark', 'rdd']

['hello', 'world', 'spark', 'rdd']


# 2. Filtering

- filter(func): Returns an RDD with elements that satisfy a condition

In [4]:
rdd = sc.parallelize([1, 2, 3, 4, 5, 6])
rdd_filter = rdd.filter(lambda x: x % 2 == 0)
print(rdd_filter.collect())  # [2, 4, 6]

[2, 4, 6]


# 3. Distinct

- distinct(): Removes duplicate elements

In [5]:
rdd = sc.parallelize([1, 2, 2, 3, 3, 4])
rdd_distinct = rdd.distinct()
print(rdd_distinct.collect())  # [1, 2, 3, 4]

[1, 2, 3, 4]


# 4. Set Operations

- union(otherRDD): Merges two RDDs

In [6]:
rdd1 = sc.parallelize([1, 2, 3])
rdd2 = sc.parallelize([3, 4, 5])
rdd_union = rdd1.union(rdd2)
print(rdd_union.collect())  # [1, 2, 3, 3, 4, 5]

[1, 2, 3, 3, 4, 5]


- intersection(otherRDD): Returns common elements

In [7]:
rdd1 = sc.parallelize([1, 2, 3])
rdd2 = sc.parallelize([3, 4, 5])
rdd_intersection = rdd1.intersection(rdd2)
print(rdd_intersection.collect())  # [3]

[3]


- subtract(otherRDD): Returns elements in RDD1 but not in RDD2

In [8]:
rdd1 = sc.parallelize([1, 2, 3])
rdd2 = sc.parallelize([3, 4, 5])
rdd_subtract = rdd1.subtract(rdd2)
print(rdd_subtract.collect())  # [1, 2]

[1, 2]


# 5. Grouping

- groupByKey(func): Groups elements based on a function

In [9]:
rdd = sc.parallelize([("a", 1), ("b", 2), ("a", 3)])
rdd_group = rdd.groupByKey().mapValues(list)
print(rdd_group.collect())  # [('a', [1, 3]), ('b', [2])]

[('b', [2]), ('a', [1, 3])]


- reduceByKey(func): Merges values for each key

In [10]:
rdd = sc.parallelize([("a", 1), ("b", 2), ("a", 3)])
rdd_reduce = rdd.reduceByKey(lambda x, y: x + y)
print(rdd_reduce.collect())  # [('a', 4), ('b', 2)]

[('b', 2), ('a', 4)]


- aggregateByKey(zeroValue, seqFunc, combFunc): More customizable reduction

In [11]:
rdd = sc.parallelize([("a", 1), ("b", 2), ("a", 3), ("b", 4)])

result = rdd.aggregateByKey(
    (0, 0),  # (Sum, Count)
    lambda acc, value: (acc[0] + value, acc[1] + 1),  # In the same partition
    lambda acc1, acc2: (acc1[0] + acc2[0], acc1[1] + acc2[1])  # Across partitions
)

print(result.collect())

[('b', (6, 2)), ('a', (4, 2))]


# 6. Sorting

- sortBy(func, ascending=True): Sorts elements based on a function

In [12]:
rdd = sc.parallelize([5, 1, 3, 2, 4])
sorted_rdd = rdd.sortBy(lambda x: x, ascending=False)
print(sorted_rdd.collect())  # [5, 4, 3, 2, 1]

[5, 4, 3, 2, 1]
