# Partitioning Strategies:  RDDs and DataFrames


Taken from https://medium.com/parrot-prediction/partitioning-in-apache-spark-8134ad840b0

In [84]:
nums = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

rdd = sc.parallelize(nums)
    
print("Number of partitions: {}".format(rdd.getNumPartitions()))
print("Partitioner: {}".format(rdd.partitioner))
print("Partitions structure: {}".format(rdd.glom().collect()))

Number of partitions: 8
Partitioner: None
Partitions structure: [[0], [1], [2], [3, 4], [5], [6], [7], [8, 9]]


In [85]:
sc.getConf().getAll()

[('spark.driver.port', '57143'),
 ('spark.sql.catalogImplementation', 'hive'),
 ('spark.rdd.compress', 'True'),
 ('spark.app.id', 'local-1575928739377'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.master', 'local[*]'),
 ('spark.executor.id', 'driver'),
 ('spark.submit.deployMode', 'client'),
 ('spark.driver.host', '10.20.0.195'),
 ('spark.app.name', 'PySparkShell'),
 ('spark.ui.showConsoleProgress', 'true')]

In [87]:
rdd = sc.parallelize(nums, 2)
    
print("Default parallelism: {}".format(sc.defaultParallelism))
print("Number of partitions: {}".format(rdd.getNumPartitions()))
print("Partitioner: {}".format(rdd.partitioner))
print("Partitions structure: {}".format(rdd.glom().collect()))

Default parallelism: 8
Number of partitions: 2
Partitioner: None
Partitions structure: [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]


In [83]:
rdd.count()

10

In [88]:
rdd = sc.parallelize(nums, 15)

print("Number of partitions: {}".format(rdd.getNumPartitions()))
print("Partitioner: {}".format(rdd.partitioner))
print("Partitions structure: {}".format(rdd.glom().collect()))

Number of partitions: 15
Partitioner: None
Partitions structure: [[], [0], [1], [], [2], [3], [], [4], [5], [], [6], [7], [], [8], [9]]


Because partitionBy() requires data to be in key/value format we will need to transform the data.

In [94]:
rdd = sc.parallelize(nums) \
        .map(lambda el: (el, el)) \
        .partitionBy(5) \
        .persist()
    
print("Number of partitions: {}".format(rdd.getNumPartitions()))
print("Partitioner: {}".format(rdd.partitioner))
print("Partitions structure: {}".format(rdd.glom().collect()))

j=0
for i in rdd.glom().collect():
    j+=1
    print("partition: " + str(j) + " "+ str(i))

Number of partitions: 5
Partitioner: <pyspark.rdd.Partitioner object at 0x11e848ef0>
Partitions structure: [[(0, 0), (5, 5)], [(1, 1), (6, 6)], [(2, 2), (7, 7)], [(3, 3), (8, 8)], [(4, 4), (9, 9)]]
partition: 1 [(0, 0), (5, 5)]
partition: 2 [(1, 1), (6, 6)]
partition: 3 [(2, 2), (7, 7)]
partition: 4 [(3, 3), (8, 8)]
partition: 5 [(4, 4), (9, 9)]


You can see that now the elements are distributed differently. A few interesting things happened:
- `parallelize(nums)` - we are transforming Python array into RDD with no partitioning scheme,
- `map(lambda el: (el, el))` - transforming data into the form of a tuple,
- `partitionBy(2)` - splitting data into 2 chunks using default *hash partitioner*

Explicit assignment of partition locations makes the hashing strategy more apparent.  The use of the `%` function assigns it to the correct partition.

In [103]:
from pyspark.rdd import portable_hash
num_partitions = 2
for el in nums:
    print("Element: [{}]: {} % {} = partition {}".format(
        el, portable_hash(el), num_partitions, portable_hash(el) % num_partitions))

Element: [0]: 0 % 2 = partition 0
Element: [1]: 1 % 2 = partition 1
Element: [2]: 2 % 2 = partition 0
Element: [3]: 3 % 2 = partition 1
Element: [4]: 4 % 2 = partition 0
Element: [5]: 5 % 2 = partition 1
Element: [6]: 6 % 2 = partition 0
Element: [7]: 7 % 2 = partition 1
Element: [8]: 8 % 2 = partition 0
Element: [9]: 9 % 2 = partition 1


4596069200710135518

In [105]:
transactions = [
    {'name': 'Bob', 'amount': 100, 'country': 'United Kingdom'},
    {'name': 'James', 'amount': 15, 'country': 'United Kingdom'},
    {'name': 'Marek', 'amount': 51, 'country': 'Poland'},
    {'name': 'Johannes', 'amount': 200, 'country': 'Germany'},
    {'name': 'Paul', 'amount': 75, 'country': 'Poland'},
]

We know that further analysis will be performed analyzing many similar records within the same country. To optimize network traffic it seems to be a good idea to put records from one country in one node.
To meet this requirement, we will need a custom partitioner:
Custom partitioner — function returning an integer for given object (tuple key).

In [106]:
# Dummy implementation assuring that data for each country is in one partition
def country_partitioner(country):
    return hash(country)
    #return portable_hash(country)
    

# Validate results
num_partitions = 5
print(country_partitioner("Poland") % num_partitions)
print(country_partitioner("Germany") % num_partitions)
print(country_partitioner("United Kingdom") % num_partitions)

0
0
1


In [108]:
rdd = sc.parallelize(transactions) \
        .map(lambda el: (el['country'], el)) \
        .partitionBy(5, country_partitioner)
    
print("Number of partitions: {}".format(rdd.getNumPartitions()))
print("Partitioner: {}".format(rdd.partitioner))
print("Partitions structure: {}".format(rdd.glom().collect()))

print("\n--\n")
j=0
for i in rdd.glom().collect():
    j+=1
    print("\npartition: " + str(j) + "\n"+ str(i))

Number of partitions: 5
Partitioner: <pyspark.rdd.Partitioner object at 0x11ebadd30>
Partitions structure: [[('Poland', {'country': 'Poland', 'amount': 51, 'name': 'Marek'}), ('Germany', {'country': 'Germany', 'amount': 200, 'name': 'Johannes'}), ('Poland', {'country': 'Poland', 'amount': 75, 'name': 'Paul'})], [('United Kingdom', {'country': 'United Kingdom', 'amount': 100, 'name': 'Bob'}), ('United Kingdom', {'country': 'United Kingdom', 'amount': 15, 'name': 'James'})], [], [], []]

--


partition: 1
[('Poland', {'country': 'Poland', 'amount': 51, 'name': 'Marek'}), ('Germany', {'country': 'Germany', 'amount': 200, 'name': 'Johannes'}), ('Poland', {'country': 'Poland', 'amount': 75, 'name': 'Paul'})]

partition: 2
[('United Kingdom', {'country': 'United Kingdom', 'amount': 100, 'name': 'Bob'}), ('United Kingdom', {'country': 'United Kingdom', 'amount': 15, 'name': 'James'})]

partition: 3
[]

partition: 4
[]

partition: 5
[]


Using the partitioning scheme, we can now carry out calculations:
(We could have also used `forEachPartition`)

In [109]:
def sum_sales(iterator):
    yield sum(transaction[1]['amount'] for transaction in iterator)

In [110]:
by_country = sc.parallelize(transactions) \
        .map(lambda el: (el['country'], el)) \
        .partitionBy(3, country_partitioner)
    
print("Partitions structure: {}".format(by_country.glom().collect()))

# Sum sales in each partition
sum_amounts = by_country \
    .mapPartitions(sum_sales) \
    .collect()

print("Total sales for each partition: {}".format(sum_amounts))

Partitions structure: [[('Germany', {'country': 'Germany', 'amount': 200, 'name': 'Johannes'})], [], [('United Kingdom', {'country': 'United Kingdom', 'amount': 100, 'name': 'Bob'}), ('United Kingdom', {'country': 'United Kingdom', 'amount': 15, 'name': 'James'}), ('Poland', {'country': 'Poland', 'amount': 51, 'name': 'Marek'}), ('Poland', {'country': 'Poland', 'amount': 75, 'name': 'Paul'})]]
Total sales for each partition: [200, 0, 241]


# Working with DataFrames


In [113]:
from pyspark import Row

rdd = spark.sparkContext \
        .parallelize(transactions) \
        .map(lambda x: Row(**x))
    
df = spark.createDataFrame(rdd)

print("Number of partitions: {}".format(df.rdd.getNumPartitions()))
print("Partitioner: {}".format(rdd.partitioner))
print("Partitions structure: {}".format(df.rdd.glom().collect()))


j=0; 
for i in df.rdd.glom().collect():
    j+=1
    print("partition: " + str(j) + "\n"+ str(i))



# Repartition by column
df2 = df.repartition(50,"country")

print("\nAfter 'repartition()'")
print("Number of partitions: {}".format(df2.rdd.getNumPartitions()))
print("Partitioner: {}".format(df2.rdd.partitioner))
print("Partitions structure: {}".format(df2.rdd.glom().collect()))


df2.show()

Number of partitions: 8
Partitioner: None
Partitions structure: [[], [Row(amount=100, country='United Kingdom', name='Bob')], [], [Row(amount=15, country='United Kingdom', name='James')], [Row(amount=51, country='Poland', name='Marek')], [], [Row(amount=200, country='Germany', name='Johannes')], [Row(amount=75, country='Poland', name='Paul')]]
partition: 1
[]
partition: 2
[Row(amount=100, country='United Kingdom', name='Bob')]
partition: 3
[]
partition: 4
[Row(amount=15, country='United Kingdom', name='James')]
partition: 5
[Row(amount=51, country='Poland', name='Marek')]
partition: 6
[]
partition: 7
[Row(amount=200, country='Germany', name='Johannes')]
partition: 8
[Row(amount=75, country='Poland', name='Paul')]

After 'repartition()'
Number of partitions: 50
Partitioner: None
Partitions structure: [[], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [Row(amount=200, country='Germany', name='Johannes')], [], [Row(amount=51, country='Poland', name='M

# `coalesce()` and `repartition()`

If you are increasing the number of partitions use repartition()(performing full shuffle),
if you are decreasing the number of partitions use coalesce() (minimizes shuffles)

In [114]:
nums_rdd = spark.sparkContext \
        .parallelize(nums) \
        .map(lambda x: Row(x))
    
nums_df = spark.createDataFrame(nums_rdd, ['num'])

print("Number of partitions: {}".format(nums_df.rdd.getNumPartitions()))
print("Partitions structure: {}".format(nums_df.rdd.glom().collect()))

nums_df = nums_df.repartition(4)

print("Number of partitions: {}".format(nums_df.rdd.getNumPartitions()))
print("Partitions structure: {}".format(nums_df.rdd.glom().collect()))

Number of partitions: 8
Partitions structure: [[Row(num=0)], [Row(num=1)], [Row(num=2)], [Row(num=3), Row(num=4)], [Row(num=5)], [Row(num=6)], [Row(num=7)], [Row(num=8), Row(num=9)]]
Number of partitions: 4
Partitions structure: [[Row(num=4), Row(num=9)], [], [], [Row(num=0), Row(num=1), Row(num=2), Row(num=3), Row(num=5), Row(num=6), Row(num=7), Row(num=8)]]


Operations that benefit from partitioning
All operations performing shuffling data by key will benefit from partitioning. 

Some examples are 
- `cogroup()`, 
- `groupWith()`, 
- `join()`, 
- `leftOuterJoin()`, 
- `rightOuterJoin()`, 
- `groupByKey()`, 
- `reduceByKey()`, 
- `combineByKey()`
- `lookup()`.


Partitioning strategy is not always preserved under transformations

In [115]:
rdd = sc.parallelize(nums) \
        .map(lambda el: (el, el)) \
        .partitionBy(2) \
        .persist()
    
print("Number of partitions: {}".format(rdd.getNumPartitions()))
print("Partitioner: {}".format(rdd.partitioner))
print("Partitions structure: {}".format(rdd.glom().collect()))

# Transform with `map()` 
rdd2 = rdd.map(lambda el: (el[0], el[0]*2))


print("Number of partitions: {}".format(rdd2.getNumPartitions()))
print("Partitioner: {}".format(rdd2.partitioner))  # We have lost a partitioner
print("Partitions structure: {}".format(rdd2.glom().collect()))


Number of partitions: 2
Partitioner: <pyspark.rdd.Partitioner object at 0x11c2d4b38>
Partitions structure: [[(0, 0), (2, 2), (4, 4), (6, 6), (8, 8)], [(1, 1), (3, 3), (5, 5), (7, 7), (9, 9)]]
Number of partitions: 2
Partitioner: None
Partitions structure: [[(0, 0), (2, 4), (4, 8), (6, 12), (8, 16)], [(1, 2), (3, 6), (5, 10), (7, 14), (9, 18)]]


Instead, there are some functions provided that guarantee that each tuple’s key remains the same — `mapValues()`, `flatMapValues()` or `filter()` (if the parent has a partitioner).

In [75]:
rdd = sc.parallelize(nums) \
        .map(lambda el: (el, el)) \
        .partitionBy(2) \
        .persist()
    
print("Number of partitions: {}".format(rdd.getNumPartitions()))
print("Partitioner: {}".format(rdd.partitioner))
print("Partitions structure: {}".format(rdd.glom().collect()))

# Use `mapValues()` instead of `map()` 
rdd2 = rdd.mapValues(lambda x: x * 2)

print("Number of partitions: {}".format(rdd2.getNumPartitions()))
print("Partitioner: {}".format(rdd2.partitioner))  # We still got partitioner
print("Partitions structure: {}".format(rdd2.glom().collect()))

Number of partitions: 2
Partitioner: <pyspark.rdd.Partitioner object at 0x11bebd710>
Partitions structure: [[(0, 0), (2, 2), (4, 4), (6, 6), (8, 8)], [(1, 1), (3, 3), (5, 5), (7, 7), (9, 9)]]
Number of partitions: 2
Partitioner: <pyspark.rdd.Partitioner object at 0x11bebd710>
Partitions structure: [[(0, 0), (2, 4), (4, 8), (6, 12), (8, 16)], [(1, 2), (3, 6), (5, 10), (7, 14), (9, 18)]]
