<a href="https://colab.research.google.com/github/lucprosa/dataeng-basic-course/blob/main/spark/examples/10-misc_performance.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Miscellaneos Performance tricks
- cache() & persist()
- broadcast join
- repartition & coalesce
- explain

# Setting up PySpark

In [None]:
%pip install pyspark

In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.master('local').appName('Spark Course').getOrCreate()

# Preparing data

In [2]:
from pyspark import SparkFiles
from pyspark.sql.types import *

# Setting up URLs
squirrel_url = "https://raw.githubusercontent.com/lucprosa/dataeng-basic-course/main/data/squirrel-data/squirrel-data.csv"
park_url = "https://raw.githubusercontent.com/lucprosa/dataeng-basic-course/main/data/squirrel-data/park-data.csv"


# Defining schemas
squirrel_schema = StructType([
StructField('Area Name',StringType(),True),
StructField('Area ID',StringType(),True),
StructField('Park Name',StringType(),True),
StructField('Park ID', StringType(), True),
StructField('Squirrel ID', StringType(), True),
StructField('Primary Fur Color', StringType(), True),
StructField('Highlights in Fur Color', StringType(), True),
StructField('Color Notes', StringType(), True),
StructField('Location', StringType(), True),
StructField('Above Ground (Height in Feet)', StringType(), True),
StructField('Specific Location', StringType(), True),
StructField('Activities', StringType(), True),
StructField('Interactions with Humans', StringType(), True),
StructField('Squirrel Latitude (DD.DDDDDD)', StringType(), True),
StructField('Squirrel Longitude (-DD.DDDDDD)', StringType(), True)
])

park_schema = StructType([
StructField('Area Name',StringType(),True),
StructField('Area ID',StringType(),True),
StructField('Park Name',StringType(),True),
StructField('Park ID', StringType(), True),
StructField('Date', StringType(), True),
StructField('Start Time', StringType(), True),
StructField('End Time', StringType(), True),
StructField('Total Time (in minutes, if available)', StringType(), True),
StructField('Park Conditions', StringType(), True),
StructField('Other Animal Sightings', StringType(), True),
StructField('Litter', StringType(), True),
StructField('Activities', StringType(), True),
StructField('Temperature & Weather', StringType(), True),
StructField('Number of Squirrels', IntegerType(), True),
StructField('Squirrel Sighter(s)', StringType(), True),
StructField('Number of Sighters', IntegerType(), True)
])

area_schema = StructType([
StructField('Area ID',StringType(),True),
StructField('Area Name',StringType(),True),
StructField('Area Description',StringType(),True),
StructField('City Name',StringType(),True),
])

area_data = [
    ("A", "UPPER MANHATTAN", "Uptown Manhattan", "New York"),
    ("B", "CENTRAL MANHATTAN", "Midtown Manhattan", "New York"),
    ("C", "LOWER MANHATTAN", "Downtown Manhattan", "New York"),
    ("D", "BROOKLYN", "Brooklyn", "New York")
    ]

spark.sparkContext.addFile(squirrel_url)
spark.sparkContext.addFile(park_url)

# creating dataframes
squirrel = spark.read.csv(SparkFiles.get("squirrel-data.csv"), header=True, schema=squirrel_schema)
park = spark.read.csv(SparkFiles.get("park-data.csv"), header=True, schema=park_schema)
area = spark.createDataFrame(data=area_data, schema=area_schema)

In [3]:
# show data
squirrel.show()
park.show()
area.show()

+---------------+-------+-------------------+-------+-----------+-----------------+-----------------------+-----------+------------+-----------------------------+-----------------+--------------------+------------------------+-----------------------------+-------------------------------+
|      Area Name|Area ID|          Park Name|Park ID|Squirrel ID|Primary Fur Color|Highlights in Fur Color|Color Notes|    Location|Above Ground (Height in Feet)|Specific Location|          Activities|Interactions with Humans|Squirrel Latitude (DD.DDDDDD)|Squirrel Longitude (-DD.DDDDDD)|
+---------------+-------+-------------------+-------+-----------+-----------------+-----------------------+-----------+------------+-----------------------------+-----------------+--------------------+------------------------+-----------------------------+-------------------------------+
|UPPER MANHATTAN|      A|    Fort Tryon Park|     01|    A-01-01|             Gray|                  White|       NULL|Ground Plane| 

# Caching & Persist

In [4]:
# Caching
# Default: MEMORY_AND_DISK

import uuid
from pyspark.sql.functions import udf

@udf
def generate_uuid():
  return str(uuid.uuid4())

# transformation 1
squirrel = squirrel.withColumn("hash_id", generate_uuid())

# transformation 2
squirrel = squirrel.dropDuplicates()

# squirrel.cache().count() <--------------- force an action to run the cache

# transformations N
# squirrel = squirrel.join...
# squirrel = squirrel.groupBy...

# DAG
# T1 -> T2 -> T3...TN -> A1

# action 1
# squirrel.write.format("parquet").path("path")


In [5]:
squirrel

DataFrame[Area Name: string, Area ID: string, Park Name: string, Park ID: string, Squirrel ID: string, Primary Fur Color: string, Highlights in Fur Color: string, Color Notes: string, Location: string, Above Ground (Height in Feet): string, Specific Location: string, Activities: string, Interactions with Humans: string, Squirrel Latitude (DD.DDDDDD): string, Squirrel Longitude (-DD.DDDDDD): string, hash_id: string]

In [6]:
squirrel.show()

+-----------------+-------+--------------------+-------+-----------+-----------------+-----------------------+--------------------+--------------------+-----------------------------+-----------------+--------------------+------------------------+-----------------------------+-------------------------------+--------------------+
|        Area Name|Area ID|           Park Name|Park ID|Squirrel ID|Primary Fur Color|Highlights in Fur Color|         Color Notes|            Location|Above Ground (Height in Feet)|Specific Location|          Activities|Interactions with Humans|Squirrel Latitude (DD.DDDDDD)|Squirrel Longitude (-DD.DDDDDD)|             hash_id|
+-----------------+-------+--------------------+-------+-----------+-----------------+-----------------------+--------------------+--------------------+-----------------------------+-----------------+--------------------+------------------------+-----------------------------+-------------------------------+--------------------+
|CENTRAL M

In [7]:
squirrel.cache().count()

433

In [8]:
squirrel.is_cached

True

In [9]:
squirrel.show()

+-----------------+-------+--------------------+-------+-----------+-----------------+-----------------------+-----------+--------------------+-----------------------------+--------------------+--------------------+------------------------+-----------------------------+-------------------------------+--------------------+
|        Area Name|Area ID|           Park Name|Park ID|Squirrel ID|Primary Fur Color|Highlights in Fur Color|Color Notes|            Location|Above Ground (Height in Feet)|   Specific Location|          Activities|Interactions with Humans|Squirrel Latitude (DD.DDDDDD)|Squirrel Longitude (-DD.DDDDDD)|             hash_id|
+-----------------+-------+--------------------+-------+-----------+-----------------+-----------------------+-----------+--------------------+-----------------------------+--------------------+--------------------+------------------------+-----------------------------+-------------------------------+--------------------+
|  UPPER MANHATTAN|      A| 

In [10]:
#retirada da cache
squirrel.unpersist()

squirrel.is_cached

False

- Logical plan: user's code or query and is independent of the underlying data sources and execution strategies

- Physical Plan: The physical plan represents the actual execution steps that Spark will perform to execute the job on the cluster.

In [11]:
squirrel.explain("cost")

== Optimized Logical Plan ==
Aggregate [Above Ground (Height in Feet)#9, Primary Fur Color#5, hash_id#247, Location#8, Park ID#3, Specific Location#10, Squirrel ID#4, Area ID#1, Activities#11, Squirrel Latitude (DD.DDDDDD)#13, Color Notes#7, Area Name#0, Highlights in Fur Color#6, Interactions with Humans#12, Squirrel Longitude (-DD.DDDDDD)#14, Park Name#2], [Area Name#0, Area ID#1, Park Name#2, Park ID#3, Squirrel ID#4, Primary Fur Color#5, Highlights in Fur Color#6, Color Notes#7, Location#8, Above Ground (Height in Feet)#9, Specific Location#10, Activities#11, Interactions with Humans#12, Squirrel Latitude (DD.DDDDDD)#13, Squirrel Longitude (-DD.DDDDDD)#14, hash_id#247], Statistics(sizeInBytes=57.7 KiB)
+- Project [Area Name#0, Area ID#1, Park Name#2, Park ID#3, Squirrel ID#4, Primary Fur Color#5, Highlights in Fur Color#6, Color Notes#7, Location#8, Above Ground (Height in Feet)#9, Specific Location#10, Activities#11, Interactions with Humans#12, Squirrel Latitude (DD.DDDDDD)#13, S

In [12]:
from pyspark import StorageLevel

squirrel.cache()

DataFrame[Area Name: string, Area ID: string, Park Name: string, Park ID: string, Squirrel ID: string, Primary Fur Color: string, Highlights in Fur Color: string, Color Notes: string, Location: string, Above Ground (Height in Feet): string, Specific Location: string, Activities: string, Interactions with Humans: string, Squirrel Latitude (DD.DDDDDD): string, Squirrel Longitude (-DD.DDDDDD): string, hash_id: string]

In [13]:
# Persist
# Default: MEMORY_ONLY
from pyspark.sql.functions import *
from pyspark import StorageLevel

# first execution plan
print(area.explain("cost"))
print("----------------")

area = area.withColumn("City shortname", lit("NY"))
# second execution plan
print(area.explain("cost"))
print("----------------")

area = area.persist(StorageLevel.MEMORY_ONLY)
area.count()

# second execution plan
area2 = area.withColumn("Teste", lit("test"))
print(area2.explain("cost"))
print("----------------")

print(area.storageLevel)
print(area.is_cached)

== Optimized Logical Plan ==
LogicalRDD [Area ID#62, Area Name#63, Area Description#64, City Name#65], false, Statistics(sizeInBytes=8.0 EiB)

== Physical Plan ==
*(1) Scan ExistingRDD[Area ID#62,Area Name#63,Area Description#64,City Name#65]


None
----------------
== Optimized Logical Plan ==
Project [Area ID#62, Area Name#63, Area Description#64, City Name#65, NY AS City shortname#1329], Statistics(sizeInBytes=9.8 EiB)
+- LogicalRDD [Area ID#62, Area Name#63, Area Description#64, City Name#65], false, Statistics(sizeInBytes=8.0 EiB)

== Physical Plan ==
*(1) Project [Area ID#62, Area Name#63, Area Description#64, City Name#65, NY AS City shortname#1329]
+- *(1) Scan ExistingRDD[Area ID#62,Area Name#63,Area Description#64,City Name#65]


None
----------------
== Optimized Logical Plan ==
Project [Area ID#62, Area Name#63, Area Description#64, City Name#65, City shortname#1329, test AS Teste#1495], Statistics(sizeInBytes=282.0 B)
+- InMemoryRelation [Area ID#62, Area Name#63, Area Des

In [14]:
# Persist
# Default: MEMORY_AND_DISK

from pyspark.sql.functions import *
from pyspark import StorageLevel

# first execution plan
print(area.explain("cost"))

area = area.withColumn("City shortname", lit("NY"))
# second execution plan
print(area.explain("cost"))

area = area.persist(StorageLevel.DISK_ONLY)
area.count()

# second execution plan
area2 = area.withColumn("Teste", lit("test"))
print(area2.explain("cost"))

print(area.storageLevel)
print(area.is_cached)

== Optimized Logical Plan ==
Project [Area ID#62, Area Name#63, Area Description#64, City Name#65, NY AS City shortname#1329], Statistics(sizeInBytes=9.8 EiB)
+- LogicalRDD [Area ID#62, Area Name#63, Area Description#64, City Name#65], false, Statistics(sizeInBytes=8.0 EiB)

== Physical Plan ==
*(1) Project [Area ID#62, Area Name#63, Area Description#64, City Name#65, NY AS City shortname#1329]
+- *(1) Scan ExistingRDD[Area ID#62,Area Name#63,Area Description#64,City Name#65]


None
== Optimized Logical Plan ==
Project [Area ID#62, Area Name#63, Area Description#64, City Name#65, NY AS City shortname#1577], Statistics(sizeInBytes=238.0 B)
+- InMemoryRelation [Area ID#62, Area Name#63, Area Description#64, City Name#65, City shortname#1329], StorageLevel(memory, 1 replicas), Statistics(sizeInBytes=238.0 B, rowCount=4)
      +- *(1) Project [Area ID#62, Area Name#63, Area Description#64, City Name#65, NY AS City shortname#1329]
         +- *(1) Scan ExistingRDD[Area ID#62,Area Name#63,Ar

# Broadcast Join

In [15]:
# Broadcast join
# identify the tables candidates for broadcast (smaller one)

join_df = (squirrel
           .join(park, on="Park ID", how="inner")
           .join(area, on="Area ID", how="inner")
           .select(area["Area Description"], park["Park Name"], park["Date"], squirrel["Squirrel ID"])
           )

join_df.explain()
join_df.show()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Project [Area Description#64, Park Name#32, Date#34, Squirrel ID#4]
   +- BroadcastHashJoin [Area ID#1], [Area ID#62], Inner, BuildRight, false
      :- Project [Area ID#1, Squirrel ID#4, Park Name#32, Date#34]
      :  +- BroadcastHashJoin [Park ID#3], [Park ID#33], Inner, BuildRight, false
      :     :- Filter (isnotnull(Park ID#3) AND isnotnull(Area ID#1))
      :     :  +- InMemoryTableScan [Area ID#1, Park ID#3, Squirrel ID#4], [isnotnull(Park ID#3), isnotnull(Area ID#1)]
      :     :        +- InMemoryRelation [Area Name#0, Area ID#1, Park Name#2, Park ID#3, Squirrel ID#4, Primary Fur Color#5, Highlights in Fur Color#6, Color Notes#7, Location#8, Above Ground (Height in Feet)#9, Specific Location#10, Activities#11, Interactions with Humans#12, Squirrel Latitude (DD.DDDDDD)#13, Squirrel Longitude (-DD.DDDDDD)#14, hash_id#247], StorageLevel(disk, memory, deserialized, 1 replicas)
      :     :              +- AdaptiveSpar

In [20]:
import pyspark.sql.functions as F

join_df = (squirrel
           .join(park, on="Park ID", how="inner")
           .join(F.broadcast(area), on="Area ID", how="inner")
           .select(area["Area Description"], park["Park Name"], park["Date"], squirrel["Squirrel ID"])
           )

join_df.explain()
join_df.show()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Project [Area Description#64, Park Name#32, Date#34, Squirrel ID#4]
   +- BroadcastHashJoin [Area ID#1], [Area ID#62], Inner, BuildRight, false
      :- Project [Area ID#1, Squirrel ID#4, Park Name#32, Date#34]
      :  +- BroadcastHashJoin [Park ID#3], [Park ID#33], Inner, BuildRight, false
      :     :- Filter (isnotnull(Park ID#3) AND isnotnull(Area ID#1))
      :     :  +- InMemoryTableScan [Area ID#1, Park ID#3, Squirrel ID#4], [isnotnull(Park ID#3), isnotnull(Area ID#1)]
      :     :        +- InMemoryRelation [Area Name#0, Area ID#1, Park Name#2, Park ID#3, Squirrel ID#4, Primary Fur Color#5, Highlights in Fur Color#6, Color Notes#7, Location#8, Above Ground (Height in Feet)#9, Specific Location#10, Activities#11, Interactions with Humans#12, Squirrel Latitude (DD.DDDDDD)#13, Squirrel Longitude (-DD.DDDDDD)#14, hash_id#247], StorageLevel(disk, memory, deserialized, 1 replicas)
      :     :              +- AdaptiveSpar

# Repartition & Coalesce

- coalesce is for reducing partitions without shuffling
- repartition is for distributing data evenly across the cluster for better parallelism

- if possible choose coalesce over repartition
- if needed to increase partitions to increase parallelism, use repartition, however keep the data shuffling operation in mind



In [18]:
squirrel_1 = squirrel
squirrel_2 = squirrel

# Check partitions
squirrel_1.rdd.getNumPartitions()

# RDD -> partitions among the workers

1

In [19]:
# repartition
# evenly distribute date across partitions for better parallel processing efficiency
# increase AND reduce partitions
# do shuffling

print(f"before repartition: {squirrel_1.rdd.getNumPartitions()}")
squirrel_1 = squirrel_1.repartition(4)
print(f"after repartition: {squirrel_1.rdd.getNumPartitions()}")

before repartition: 1
after repartition: 4


In [25]:
squirrel_1.write.format("parquet").mode("overwrite").save("/content/files/squirrel_1")

In [23]:
squirrel_1.rdd.getNumPartitions()

4

In [27]:
squirrel_1= squirrel_1.coalesce(1)

In [29]:
# coalesce
# reduce partitions without shuffling
# minimizes data movement across the cluster

# does not allow to increase partitions, only reduce
print(f"before coalesce: {squirrel_2.rdd.getNumPartitions()}")
squirrel_2 = squirrel_2.coalesce(5)
print(f"after coalesce: {squirrel_2.rdd.getNumPartitions()}")


before coalesce: 5
after coalesce: 5


In [30]:
print(f"before coalesce: {squirrel_1.rdd.getNumPartitions()}")
squirrel_1 = squirrel_1.coalesce(2)
print(f"after coalesce: {squirrel_1.rdd.getNumPartitions()}")

before coalesce: 1
after coalesce: 2


In [62]:
# repartition/coalesce and writing data
!rm -rf /content/files/area
!mkdir -p /content/files/area

# repartition "area" dataframe and write as parquet
area.repartition(3).write.format("parquet").mode("overwrite").save("/content/files/area")

In [32]:
# check files and their content

files = !ls /content/files/area/ | grep ".parquet"
folder = "/content/files/area/"

for f in files:
  df = spark.read.parquet(f"{folder}{f}")
  print(f"{f} - {df.count()} rows")

part-00000-e048b046-896c-48c3-83f2-f1ab87d909a1-c000.snappy.parquet - 2 rows
part-00001-e048b046-896c-48c3-83f2-f1ab87d909a1-c000.snappy.parquet - 1 rows
part-00002-e048b046-896c-48c3-83f2-f1ab87d909a1-c000.snappy.parquet - 1 rows


In [33]:
# Check file sizes
!ls /content/files/area

part-00000-e048b046-896c-48c3-83f2-f1ab87d909a1-c000.snappy.parquet
part-00001-e048b046-896c-48c3-83f2-f1ab87d909a1-c000.snappy.parquet
part-00002-e048b046-896c-48c3-83f2-f1ab87d909a1-c000.snappy.parquet
_SUCCESS


# Question

In [None]:
# Q1
# read data from /content/files/area (3 parquet files)
# write again the data into the same folder making sure the output will be only one file

In [63]:
area.repartition(3).write.format("parquet").mode("overwrite").save("/content/files/area")
df = spark.read.parquet("/content/files/area")
df = df.repartition(1)
df.write.format("parquet").mode("overwrite").save("/content/files/area")