In [1]:
import findspark
findspark.init()  # call before importing pyspark, as the original cell does
from pyspark import SparkConf, SparkContext

# Local-mode context named "first app".
# getOrCreate() hands back the already-running context when this cell is
# executed again in the same kernel, instead of failing with
# "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(
    SparkConf().setMaster("local").setAppName("first app")
)

In [2]:
# Load products.csv from HDFS and split every line into its '|'-separated fields
mydata = (
    sc.textFile("hdfs://quickstart.cloudera:8020/data/spark/products/products.csv")
      .map(lambda line: line.split('|'))
)
mydata.take(2)

[[u'5', u'skirts', u'2019-10-07', u'$85.03', u'2'],
 [u'1', u'skirts', u'2020-06-18', u'$79.17', u'9']]

In [3]:
# Inspect how mydata is laid out across partitions.
# glom() turns every partition into one list. Collect that once and derive the
# sizes locally, so only one job runs and the sizes always describe exactly the
# structure printed below.
mydata_partitions = mydata.glom().collect()
print("Number of partitions: {}".format(mydata.getNumPartitions()))
print("Partitioner: {}".format(mydata.partitioner))
print("Partition Sizes: {}".format(list(map(len, mydata_partitions))))
print("Partitions structure: {}".format(mydata_partitions))

Number of partitions: 1
Partitioner: None
Partition Sizes: [100]
Partitions structure: [[[u'5', u'skirts', u'2019-10-07', u'$85.03', u'2'], [u'1', u'skirts', u'2020-06-18', u'$79.17', u'9'], [u'3', u'pants', u'2020-03-18', u'$33.63', u'4'], [u'4', u'ties', u'2020-01-03', u'$20.47', u'9'], [u'3', u'pants', u'2020-10-06', u'$7.36', u'1'], [u'5', u'ties', u'2020-10-22', u'$26.19', u'2'], [u'2', u'shirts', u'2019-11-06', u'$63.90', u'4'], [u'3', u'shirts', u'2020-01-28', u'$11.93', u'4'], [u'4', u'skirts', u'2021-03-18', u'$15.81', u'2'], [u'5', u'skirts', u'2020-01-01', u'$46.78', u'4'], [u'4', u'shirts', u'2019-07-20', u'$90.89', u'9'], [u'4', u'pants', u'2019-06-28', u'$41.24', u'6'], [u'3', u'pants', u'2019-11-11', u'$30.10', u'8'], [u'3', u'ties', u'2021-02-23', u'$9.55', u'6'], [u'2', u'shirts', u'2021-05-04', u'$3.35', u'8'], [u'3', u'shirts', u'2020-07-27', u'$88.63', u'5'], [u'5', u'pants', u'2019-10-28', u'$39.57', u'10'], [u'5', u'ties', u'2020-10-30', u'$83.10', u'1'], [u'2', u

In [4]:
# Build a pair RDD: the first field of each record becomes the key and the
# remaining fields (as a list) become the value
pairRDD = mydata.map(
    lambda fields: (fields[0], fields[1:])
)
pairRDD.take(2)

[(u'5', [u'skirts', u'2019-10-07', u'$85.03', u'2']),
 (u'1', [u'skirts', u'2020-06-18', u'$79.17', u'9'])]

In [5]:
# Check the partition layout of pairRDD.
# The glom() result is collected once and reused for both the sizes and the
# structure, instead of running a second job over the same RDD.
pairRDD_partitions = pairRDD.glom().collect()
print("Number of partitions: {}".format(pairRDD.getNumPartitions()))
print("Partitioner: {}".format(pairRDD.partitioner))
print("Partition Sizes: {}".format(list(map(len, pairRDD_partitions))))
print("Partitions structure: {}".format(pairRDD_partitions))

Number of partitions: 1
Partitioner: None
Partition Sizes: [100]
Partitions structure: [[(u'5', [u'skirts', u'2019-10-07', u'$85.03', u'2']), (u'1', [u'skirts', u'2020-06-18', u'$79.17', u'9']), (u'3', [u'pants', u'2020-03-18', u'$33.63', u'4']), (u'4', [u'ties', u'2020-01-03', u'$20.47', u'9']), (u'3', [u'pants', u'2020-10-06', u'$7.36', u'1']), (u'5', [u'ties', u'2020-10-22', u'$26.19', u'2']), (u'2', [u'shirts', u'2019-11-06', u'$63.90', u'4']), (u'3', [u'shirts', u'2020-01-28', u'$11.93', u'4']), (u'4', [u'skirts', u'2021-03-18', u'$15.81', u'2']), (u'5', [u'skirts', u'2020-01-01', u'$46.78', u'4']), (u'4', [u'shirts', u'2019-07-20', u'$90.89', u'9']), (u'4', [u'pants', u'2019-06-28', u'$41.24', u'6']), (u'3', [u'pants', u'2019-11-11', u'$30.10', u'8']), (u'3', [u'ties', u'2021-02-23', u'$9.55', u'6']), (u'2', [u'shirts', u'2021-05-04', u'$3.35', u'8']), (u'3', [u'shirts', u'2020-07-27', u'$88.63', u'5']), (u'5', [u'pants', u'2019-10-28', u'$39.57', u'10']), (u'5', [u'ties', u'2020

In [6]:
# Re-partition pairRDD into 5 partitions using the numeric value of the key.
# The whole key string is converted (not only its first character), so a
# multi-digit key such as '10' is routed by 10 rather than by 1.
newpairRDD = pairRDD.partitionBy(5, lambda k: int(k))
newpairRDD.take(2)

[(u'5', [u'skirts', u'2019-10-07', u'$85.03', u'2']),
 (u'5', [u'ties', u'2020-10-22', u'$26.19', u'2'])]

In [7]:
# Check the partition layout of newpairRDD.
# Note: this RDD was partitioned with the custom integer-key function passed
# to partitionBy, not with the default hash function.
# The glom() result is collected once and reused for sizes and structure.
newpairRDD_partitions = newpairRDD.glom().collect()
print("Number of partitions: {}".format(newpairRDD.getNumPartitions()))
print("Partitioner: {}".format(newpairRDD.partitioner))
print("Partition Sizes: {}".format(list(map(len, newpairRDD_partitions))))
print("Partitions structure: {}".format(newpairRDD_partitions))

Number of partitions: 5
Partitioner: <pyspark.rdd.Partitioner object at 0x7fd49242cc10>
Partition Sizes: [11, 17, 28, 20, 24]
Partitions structure: [[(u'5', [u'skirts', u'2019-10-07', u'$85.03', u'2']), (u'5', [u'ties', u'2020-10-22', u'$26.19', u'2']), (u'5', [u'skirts', u'2020-01-01', u'$46.78', u'4']), (u'5', [u'pants', u'2019-10-28', u'$39.57', u'10']), (u'5', [u'ties', u'2020-10-30', u'$83.10', u'1']), (u'5', [u'skirts', u'2020-06-10', u'$17.13', u'1']), (u'5', [u'shirts', u'2020-12-26', u'$10.43', u'7']), (u'5', [u'ties', u'2020-09-04', u'$65.36', u'7']), (u'5', [u'shirts', u'2019-10-08', u'$95.85', u'8']), (u'5', [u'ties', u'2021-04-17', u'$81.76', u'6']), (u'5', [u'skirts', u'2020-02-14', u'$71.10', u'9'])], [(u'1', [u'skirts', u'2020-06-18', u'$79.17', u'9']), (u'1', [u'ties', u'2019-07-04', u'$71.17', u'6']), (u'1', [u'ties', u'2019-11-16', u'$5.27', u'7']), (u'1', [u'skirts', u'2020-11-03', u'$57.93', u'9']), (u'1', [u'pants', u'2020-05-14', u'$24.86', u'3']), (u'1', [u'skir

In [8]:
# Partition function for a custom partitioner
def product_partitioner(product):
    """Return Python's built-in hash of ``product``.

    Intended to be passed to ``partitionBy`` as the partition function.

    NOTE(review): on Python 3 the built-in hash of a str is salted per process
    unless PYTHONHASHSEED is fixed -- confirm all workers share the same seed
    if this notebook is moved off Python 2.
    """
    hashed_key = hash(product)
    return hashed_key

In [9]:
# Re-key every record by product name (first element of the value list), keep
# the remaining fields as the value, then spread the records over 3 partitions
# using product_partitioner
custompairRDD = (
    pairRDD
    .map(lambda kv: (kv[1][0], kv[1][1:]))
    .partitionBy(3, product_partitioner)
)
custompairRDD.take(2)

[(u'shirts', [u'2019-11-06', u'$63.90', u'4']),
 (u'shirts', [u'2020-01-28', u'$11.93', u'4'])]

In [10]:
# Check the partition layout of custompairRDD.
# The glom() result is collected once and reused for both the sizes and the
# structure, instead of running a second job over the same RDD.
custompairRDD_partitions = custompairRDD.glom().collect()
print("Number of partitions: {}".format(custompairRDD.getNumPartitions()))
print("Partitioner: {}".format(custompairRDD.partitioner))
print("Partition Sizes: {}".format(list(map(len, custompairRDD_partitions))))
print("Partitions structure: {}".format(custompairRDD_partitions))

Number of partitions: 3
Partitioner: <pyspark.rdd.Partitioner object at 0x7fd4917e2210>
Partition Sizes: [22, 26, 52]
Partitions structure: [[(u'shirts', [u'2019-11-06', u'$63.90', u'4']), (u'shirts', [u'2020-01-28', u'$11.93', u'4']), (u'shirts', [u'2019-07-20', u'$90.89', u'9']), (u'shirts', [u'2021-05-04', u'$3.35', u'8']), (u'shirts', [u'2020-07-27', u'$88.63', u'5']), (u'shirts', [u'2020-12-26', u'$10.43', u'7']), (u'shirts', [u'2020-04-09', u'$36.19', u'6']), (u'shirts', [u'2020-02-06', u'$79.96', u'5']), (u'shirts', [u'2019-10-24', u'$8.20', u'8']), (u'shirts', [u'2020-11-16', u'$16.02', u'5']), (u'shirts', [u'2019-06-10', u'$37.57', u'2']), (u'shirts', [u'2020-09-24', u'$52.79', u'4']), (u'shirts', [u'2020-12-29', u'$72.69', u'2']), (u'shirts', [u'2021-02-21', u'$68.51', u'1']), (u'shirts', [u'2019-08-11', u'$45.42', u'7']), (u'shirts', [u'2020-10-26', u'$42.69', u'8']), (u'shirts', [u'2020-10-11', u'$65.93', u'6']), (u'shirts', [u'2019-11-06', u'$59.07', u'4']), (u'shirts', [u

In [11]:
# Repartition the RDD: redistribute custompairRDD into 2 partitions
mergedrdd = custompairRDD.repartition(numPartitions=2)

In [12]:
# Check the partition layout of mergedrdd.
# The glom() result is collected once and reused for both the sizes and the
# structure, instead of running a second job over the same RDD.
mergedrdd_partitions = mergedrdd.glom().collect()
print("Number of partitions: {}".format(mergedrdd.getNumPartitions()))
print("Partitioner: {}".format(mergedrdd.partitioner))
print("Partition Sizes: {}".format(list(map(len, mergedrdd_partitions))))
print("Partitions structure: {}".format(mergedrdd_partitions))

Number of partitions: 2
Partitioner: None
Partition Sizes: [54, 46]
Partitions structure: [[(u'shirts', [u'2019-11-06', u'$63.90', u'4']), (u'shirts', [u'2020-01-28', u'$11.93', u'4']), (u'shirts', [u'2019-07-20', u'$90.89', u'9']), (u'shirts', [u'2021-05-04', u'$3.35', u'8']), (u'shirts', [u'2020-07-27', u'$88.63', u'5']), (u'shirts', [u'2020-12-26', u'$10.43', u'7']), (u'shirts', [u'2020-08-06', u'$47.04', u'4']), (u'shirts', [u'2019-10-08', u'$95.85', u'8']), (u'shirts', [u'2019-11-30', u'$62.08', u'7']), (u'shirts', [u'2021-05-09', u'$45.97', u'10']), (u'pants', [u'2020-03-18', u'$33.63', u'4']), (u'pants', [u'2020-10-06', u'$7.36', u'1']), (u'pants', [u'2019-06-28', u'$41.24', u'6']), (u'pants', [u'2019-11-11', u'$30.10', u'8']), (u'pants', [u'2019-10-28', u'$39.57', u'10']), (u'pants', [u'2019-10-30', u'$30.43', u'1']), (u'pants', [u'2019-09-01', u'$63.32', u'1']), (u'pants', [u'2020-10-25', u'$32.76', u'4']), (u'pants', [u'2021-03-17', u'$85.46', u'2']), (u'pants', [u'2020-01-29