In [1]:
from pyspark import SparkConf
from pyspark import SparkContext 
from pyspark.sql import SparkSession

# Run Spark in local mode on this machine; per the Spark docs, "local[*]"
# requests as many worker threads as there are logical cores.
master = "local[*]"
# Application name passed to the Spark configuration below.
app_name = "Data Partition and Parallel Search Demo"
spark_conf = SparkConf().setMaster(master).setAppName(app_name)

# Build the session from the configuration above (getOrCreate() hands back an
# existing session when one is already running in this kernel).
spark = SparkSession.builder.config(conf=spark_conf).getOrCreate()
# The SparkContext is the entry point for the RDD API used in the cells below.
sc = spark.sparkContext
# Restrict Spark logging to errors so the cell output stays readable.
sc.setLogLevel('ERROR')

In [2]:
# Sample data: (key, name) pairs. The integer in position 0 is the key that
# the partitioning and search cells below operate on.
list_students = [(8,"Adele"), (22, "Bob"), (16, "Clement"), (23, "Dave"), 
                 (11, "Ed"), (25, "Fung"), (3, "Goel"), (17, "Harry"), 
                 (14, "Irene"), (2, "Joanna"), (6, "Kelly"), (20, "Lim"), 
                 (1, "Meng"), (5, "Noor"), (19, "Omar")]
# Number of partitions used for every RDD created in this notebook.
num_of_partitions = 3

def udf(splitIndex, iterator):
    """Summarise one partition as a single printable line.

    Intended to be passed to ``RDD.mapPartitionsWithIndex``.

    Args:
        splitIndex: Index of the partition being processed.
        iterator: Iterator over the records held in that partition.

    Yields:
        Exactly one string of the form ``"Partition No: <i>, Data: [...]"``
        listing every record in the partition.
    """
    # Materialise the partition so its full contents can be shown at once.
    records = list(iterator)
    yield f"Partition No: {splitIndex}, Data: {records}"

# https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.RDD.mapPartitionsWithIndex.html
def print_partitions(rdd):
    """Print one line per partition of ``rdd`` showing the records it holds.

    Each partition is turned into a summary string by ``udf`` via
    ``mapPartitionsWithIndex``; the summaries are collected to the driver
    and printed in the order returned.

    Args:
        rdd: The RDD whose partition layout should be displayed.
    """
    partition_summaries = rdd.mapPartitionsWithIndex(udf).collect()
    for summary in partition_summaries:
        print(summary)

In [3]:
# Random Equal Partitioning
# Distribute the student list over `num_of_partitions` partitions without
# looking at the keys. As the printed output shows, the 15 records end up in
# three equally sized partitions, each holding a contiguous run of the list.

random_equal_paritioned_rdd = sc.parallelize(list_students, num_of_partitions)
print_partitions(random_equal_paritioned_rdd)

Partition No: 0, Data: [(8, 'Adele'), (22, 'Bob'), (16, 'Clement'), (23, 'Dave'), (11, 'Ed')]
Partition No: 1, Data: [(25, 'Fung'), (3, 'Goel'), (17, 'Harry'), (14, 'Irene'), (2, 'Joanna')]
Partition No: 2, Data: [(6, 'Kelly'), (20, 'Lim'), (1, 'Meng'), (5, 'Noor'), (19, 'Omar')]


In [4]:
# Hash Data Partitioning

# Sum of the decimal digits of the key (works for any number of digits)
# Example: hash_function(12) produces 3, i.e. 1 + 2
def hash_function(key):
    """Return the sum of the decimal digits of ``key``.

    The key is converted with ``str()`` and each character is parsed with
    ``int()``, so e.g. ``hash_function(25)`` is ``2 + 5 == 7``.

    Args:
        key: A value whose string form consists only of digits.

    Returns:
        The integer digit sum.

    Raises:
        ValueError: If any character of ``str(key)`` is not a digit
            (for example the minus sign of a negative number).
    """
    return sum(int(character) for character in str(key))

# Redistribute the (key, name) pairs using the custom hash function on the key.
# The printed layout is consistent with each record landing in partition
# hash_function(key) % num_of_partitions, e.g. key 22 -> 2 + 2 = 4 -> 4 % 3 = 1.
# Note that the resulting partitions are no longer equally sized.
hash_partitioned_rdd = random_equal_paritioned_rdd.partitionBy(num_of_partitions, hash_function)
print_partitions(hash_partitioned_rdd)

Partition No: 0, Data: [(3, 'Goel'), (6, 'Kelly')]
Partition No: 1, Data: [(22, 'Bob'), (16, 'Clement'), (25, 'Fung'), (1, 'Meng'), (19, 'Omar')]
Partition No: 2, Data: [(8, 'Adele'), (23, 'Dave'), (11, 'Ed'), (17, 'Harry'), (14, 'Irene'), (2, 'Joanna'), (20, 'Lim'), (5, 'Noor')]


In [5]:
# Range data partitioning is covered in the lab.

In [6]:
# Parallel Search
# Search the key-agnostic partitioning for the student with key 6. The
# predicate compares element 0 (the key) of each record; printing per partition
# shows which partition held the match and that the others come back empty.
filtered_rdd = random_equal_paritioned_rdd.filter(lambda x: x[0] == 6)
print_partitions(filtered_rdd)

Partition No: 0, Data: []
Partition No: 1, Data: []
Partition No: 2, Data: [(6, 'Kelly')]


In [7]:
# Same search for key 6, this time on the hash-partitioned RDD. The match is
# found in partition 0, which agrees with hash_function(6) % 3 == 0.
# NOTE(review): filter() is a generic predicate here, so presumably every
# partition is still scanned rather than only the one the hash points to - confirm.
# This rebinds `filtered_rdd` from the previous search cell.
filtered_rdd = hash_partitioned_rdd.filter(lambda x: x[0] == 6)
print_partitions(filtered_rdd)

Partition No: 0, Data: [(6, 'Kelly')]
Partition No: 1, Data: []
Partition No: 2, Data: []
