## Setting up Spark Session / Context

In [1]:
from pyspark.sql import SparkSession

# New API: SparkSession is the entry point; getOrCreate() reuses an
# already-running session if there is one.
spark_session = (
    SparkSession.builder
    .master("spark://192.168.2.251:7077")
    .appName("Lecture1_Example3_RDD_function_examples")
    # Dynamic allocation with shuffle tracking; the external shuffle
    # service is explicitly switched off.
    .config("spark.dynamicAllocation.enabled", True)
    .config("spark.dynamicAllocation.shuffleTracking.enabled", True)
    .config("spark.shuffle.service.enabled", False)
    .config("spark.dynamicAllocation.executorIdleTimeout", "30s")
    .config("spark.executor.cores", 8)
    # Driver and block-manager ports are pinned to fixed values
    # (presumably so the cluster can reach the driver — confirm with the
    # cluster's network setup).
    .config("spark.driver.port", 9999)
    .config("spark.blockManager.port", 10005)
    .getOrCreate()
)

# Old API (RDD): the SparkContext is obtained from the session.
spark_context = spark_session.sparkContext
# Restrict log output to ERROR-level messages from here on.
spark_context.setLogLevel("ERROR")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/02/09 16:52:03 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


---
## 1. Putting data on spark cluster

In [2]:
# Distribute the integers 0 .. 99,999,999 (one hundred million values)
# across 16 partitions.
rdd = spark_context.parallelize(range(100_000_000), numSlices=16)

---
## 2. Using getNumPartitions

In [3]:
# getNumPartitions: report how many partitions the RDD is split into
# (16 were requested when the RDD was created).
rdd.getNumPartitions()

16

---
## 3. Using map & mapValues:

In [4]:
# map: the function is applied to every element of the RDD.
# Each element here is a (key, list-of-values) tuple.
x = spark_context.parallelize([("a", ["apple", "banana", "lemon"]), ("b", ["grapes"])])
def f(x):
    """Return the number of items contained in ``x``."""
    item_count = len(x)
    return item_count
# map() passes the whole (key, values) tuple to f, so every result is 2
# (the length of the tuple), not the length of the value list.
x.map(f).collect()

                                                                                

[2, 2]

In [5]:
# mapValues: f is applied to the value of each (key, value) pair only;
# the key is carried through unchanged.
x.mapValues(f).collect()

[('a', 3), ('b', 1)]

---
## 4. Using flatMap & flatMapValues:

In [6]:
# flatMap: each input element n produces three outputs (n, n^2, n^3),
# and all outputs are flattened into a single RDD.
rdd_2 = rdd.flatMap(lambda n: (n, n ** 2, n ** 3))
rdd_2.take(20)

[0, 0, 0, 1, 1, 1, 2, 4, 8, 3, 9, 27, 4, 16, 64, 5, 25, 125, 6, 36]

In [7]:
# Key/value RDD whose values are lists of single-character strings.
x = spark_context.parallelize([("a", ["x", "y", "z"]), ("b", ["p", "r"])])
def f(x):
    """Concatenate the strings in ``x`` into one string, with no separator."""
    joined = "".join(x)
    return joined
# mapValues keeps one record per key: each value list is joined into a
# single string.
x.mapValues(f).collect()

[('a', 'xyz'), ('b', 'pr')]

In [8]:
# flatMapValues: f returns a string for each value, and every character of
# that string becomes its own (key, character) record.
x.flatMapValues(f).collect()

[('a', 'x'), ('a', 'y'), ('a', 'z'), ('b', 'p'), ('b', 'r')]

---
## 5. Using mapPartitions & mapPartitionsWithIndex:

In [9]:
# mapPartitions: the function is called once per partition and receives
# an iterator over that partition's elements.
def f(iterator):
    """Yield a single value: the sum of all elements in the partition."""
    partition_total = sum(iterator)
    yield partition_total

# Replace each partition with the single value produced by f.
rdd_2 = rdd.mapPartitions(f)
print(rdd_2.glom().collect())  # glom() groups the elements of each partition into one list

[[19531246875000], [58593746875000], [97656246875000], [136718746875000], [175781246875000], [214843746875000], [253906246875000], [292968746875000], [332031246875000], [371093746875000], [410156246875000], [449218746875000], [488281246875000], [527343746875000], [566406246875000], [605468746875000]]


In [10]:
# mapPartitionsWithIndex: like mapPartitions, but the function also
# receives the index of the partition it is processing.
def f(partitionIndex, iterator):
    """Yield one ``(partitionIndex, sum of the partition's elements)`` pair."""
    partition_total = sum(iterator)
    yield (partitionIndex, partition_total)
# Replace each partition with the (index, value) pair produced by f.
rdd_2 = rdd.mapPartitionsWithIndex(f)
print(rdd_2.glom().collect())  # glom() groups the elements of each partition into one list

[[(0, 19531246875000)], [(1, 58593746875000)], [(2, 97656246875000)], [(3, 136718746875000)], [(4, 175781246875000)], [(5, 214843746875000)], [(6, 253906246875000)], [(7, 292968746875000)], [(8, 332031246875000)], [(9, 371093746875000)], [(10, 410156246875000)], [(11, 449218746875000)], [(12, 488281246875000)], [(13, 527343746875000)], [(14, 566406246875000)], [(15, 605468746875000)]]


---
## 6. Using filter:

In [11]:
# filter: keep only the elements for which the predicate is true —
# here, the odd numbers below 20.
rdd_2 = rdd.filter(lambda n: n % 2 == 1 and n < 20)
rdd_2.collect()

                                                                                

[1, 3, 5, 7, 9, 11, 13, 15, 17, 19]

---
## 7. Using distinct:

In [12]:
# distinct: build a new RDD with duplicate elements removed.
x = spark_context.parallelize(["A","A","B"])
y = x.distinct()
# Show the original RDD followed by the de-duplicated one.
# NOTE: the order of the distinct() result is not guaranteed — the
# recorded run returned 'B' before 'A'.
print(x.collect())
print(y.collect())

['A', 'A', 'B']
['B', 'A']


In [13]:
# Stop the Spark session now that all examples have run.
spark_session.stop()