## Learned about Partition in the <a href="https://luminousmen.com/post/spark-partitions">link</a>.
I highly recommend reading this code alongside the blog post linked above.

In [30]:
from pyspark.sql import SparkSession
from pyspark import SparkContext

# Obtain the SparkSession used by every cell below; getOrCreate() reuses an
# already-running session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("Understanding partition.")
    .getOrCreate()
)

In [10]:
import pandas as pd
import numpy as np

# Number of sample rows to generate (was the invalid literal `10f`, which
# raises SyntaxError before anything in this cell can run).
length = 10

# Draw `length` random values per column. No seed is set, so the generated
# rows (and therefore the partition contents shown below) differ on each run.
# None is included among the choices so that some names/countries are missing.
names = np.random.choice(['Bob', 'James', 'Marek', 'Johannes', None], length)
amounts = np.random.randint(0, 1000000, length)
country = np.random.choice(
    ['United Kingdom', 'Poland', 'USA', 'Germany', None],
    length
)

# Assemble the three columns into a pandas DataFrame.
df = pd.DataFrame({'name': names, 'amount': amounts, 'country': country})

# Convert the pandas DataFrame into a Spark DataFrame; this is the object
# whose partitioning is inspected in the following cells.
transactions = spark.createDataFrame(df)

In [24]:
# Look at how the freshly created DataFrame is split up, via its underlying RDD.
transactions_rdd = transactions.rdd

print('Number of partitions: {}'.format(transactions_rdd.getNumPartitions()))
print('Partitioner: {}'.format(transactions_rdd.partitioner))

# glom() gathers the rows of each partition into a list, so the collected
# result has one inner list per partition.
partition_rows = transactions_rdd.glom().collect()
print('Partitions structure: {}'.format(partition_rows))

Number of partitions: 8
Partitioner: None
Partitions structure: [[Row(name=None, amount=673409, country='United Kingdom')], [Row(name='James', amount=844027, country='Poland')], [Row(name='Bob', amount=477544, country=None)], [Row(name='Marek', amount=716152, country='United Kingdom'), Row(name='Bob', amount=813080, country='United Kingdom')], [Row(name=None, amount=508249, country=None)], [Row(name='Johannes', amount=945121, country='Poland')], [Row(name='James', amount=66436, country='USA')], [Row(name='Johannes', amount=771230, country=None), Row(name=None, amount=905639, country=None)]]


# Repartitioning 
Repartitioning performs a full shuffle of the data across the cluster, so network cost will increase!

In [57]:
# Redistribute the rows over 16 partitions, then inspect the result.
repartitioned = transactions.repartition(16)
repartitioned_rdd = repartitioned.rdd

partition_count = repartitioned_rdd.getNumPartitions()
print('Number of partitions: {}'.format(partition_count))

# One inner list per partition; empty lists are partitions that received no rows.
partition_rows = repartitioned_rdd.glom().collect()
print('Partitions structure: {}'.format(partition_rows))

Number of partitions: 16
Partitions structure: [[], [], [], [], [], [], [], [], [], [], [], [], [Row(name=None, amount=673409, country='United Kingdom'), Row(name='James', amount=844027, country='Poland'), Row(name='Bob', amount=477544, country=None), Row(name='Marek', amount=716152, country='United Kingdom'), Row(name=None, amount=508249, country=None), Row(name='Johannes', amount=945121, country='Poland'), Row(name='James', amount=66436, country='USA'), Row(name='Johannes', amount=771230, country=None)], [Row(name='Bob', amount=813080, country='United Kingdom'), Row(name=None, amount=905639, country=None)], [], []]


## Repartitioning by column

In [59]:
# Repartition by the 'country' column instead of by a target partition count,
# then inspect how many partitions result and which rows landed where.
repartitioned = transactions.repartition('country')
repartitioned_rdd = repartitioned.rdd

partition_count = repartitioned_rdd.getNumPartitions()
print('Number of partitions: {}'.format(partition_count))

# One inner list per partition; most are expected to be empty for a small dataset.
partition_rows = repartitioned_rdd.glom().collect()
print('Partitions structure: {}'.format(partition_rows))

Number of partitions: 200
Partitions structure: [[], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [Row(name='Bob', amount=477544, country=None), Row(name=None, amount=508249, country=None), Row(name='Johannes', amount=771230, country=None), Row(name=None, amount=905639, country=None)], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [Row(name='James', amount=66436, country='USA')], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [Row(name='James', amoun

## Coalesce

In [60]:
# Reduce the DataFrame to 2 partitions with coalesce(), then inspect the result.
coalesced = transactions.coalesce(2)
coalesced_rdd = coalesced.rdd

partition_count = coalesced_rdd.getNumPartitions()
print('Number of partitions: {}'.format(partition_count))

# One inner list per remaining partition.
partition_rows = coalesced_rdd.glom().collect()
print('Partitions structure: {}'.format(partition_rows))

Number of partitions: 2
Partitions structure: [[Row(name=None, amount=673409, country='United Kingdom'), Row(name='James', amount=844027, country='Poland'), Row(name='Bob', amount=477544, country=None), Row(name='Marek', amount=716152, country='United Kingdom'), Row(name='Bob', amount=813080, country='United Kingdom')], [Row(name=None, amount=508249, country=None), Row(name='Johannes', amount=945121, country='Poland'), Row(name='James', amount=66436, country='USA'), Row(name='Johannes', amount=771230, country=None), Row(name=None, amount=905639, country=None)]]


# [Reference] 
##### Dataframe.rdd.glom (<a href="https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.RDD.glom.html#pyspark.RDD.glom">link</a>)
Return an RDD created by coalescing all elements within each partition into a list.

##### Spark.parallelize (<a href="https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.SparkContext.parallelize.html">link</a>)
Distribute a local Python collection to form an RDD. Using range is recommended if the input represents a range for performance.

In [48]:
# Five elements spread over five slices; glom() exposes the per-partition lists.
(
    spark.sparkContext
    .parallelize([0, 2, 3, 4, 6], numSlices=5)
    .glom()
    .collect()
)

[[0], [2], [3], [4], [6]]

In [49]:
# Only three elements for five slices, so some partitions come back empty.
(
    spark.sparkContext
    .parallelize([0, 3, 6], numSlices=5)
    .glom()
    .collect()
)

[[], [0], [], [3], [6]]