In [1]:
# Make pyspark importable from this kernel. findspark.init() has to run before
# the `pyspark` imports in the next cell (per the findspark docs, it locates the
# local Spark installation and adds its Python bindings to sys.path).
import findspark
findspark.init()

In [2]:
'''
Start (or reuse) a local SparkSession that runs with 8 worker threads.
'''
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

# Application name and master URL; "local[8]" means local mode with 8 threads.
appName = "Broadcast variables"
master = "local[8]"

# Build the configuration first, then hand it to the session builder.
conf = SparkConf().setAppName(appName).setMaster(master)
spark = (
    SparkSession.builder
    .config(conf=conf)
    .enableHiveSupport()
    .getOrCreate()
)

# Log verbosity for this session; other levels include WARN and DEBUG.
# See https://kontext.tech/column/spark/457/tutorial-turn-off-info-logs-in-spark
spark.sparkContext.setLogLevel("INFO")

### Broadcast Variables
In PySpark, broadcast variables are read-only shared variables that are cached and made available on all nodes in
a cluster for use by tasks. **Instead of sending this data along with every task, PySpark caches the broadcast
variable (the lookup data) on each node/machine.** Tasks then read this cached copy while executing transformations. This matters because each
node/executor, depending on its number of cores, may be running many tasks (Spark recommends 2-3 tasks per CPU core).


**PySpark RDD Broadcast variable example**
Below is a very simple example of how to use a broadcast variable with an RDD. The example defines commonly used lookup data (states) in a Python dictionary, distributes it with `SparkContext.broadcast()`, and then uses the broadcast value inside an RDD `map()` transformation.

ref: https://sparkbyexamples.com/pyspark/pyspark-broadcast-variables/

In [4]:
# Lookup table: two-letter state code -> full state name.
states = {
    "NY": "New York",
    "CA": "California",
    "FL": "Florida",
}
# Wrap the lookup table in a broadcast variable; tasks read it through `.value`.
broadcastStates = spark.sparkContext.broadcast(states)

# Sample records; the last field is the state code that gets looked up below.
data = [
    ("James", "Smith", "USA", "CA"),
    ("Michael", "Rose", "USA", "NY"),
    ("Robert", "Williams", "USA", "CA"),
    ("Maria", "Jones", "USA", "FL"),
]

# Distribute the sample records as an RDD.
rdd = spark.sparkContext.parallelize(data)

def state_convert(code):
    """Translate a state code into its full name using the broadcast lookup.

    Args:
        code: State abbreviation, e.g. "CA".

    Returns:
        The full state name from the broadcast dictionary, or ``code``
        unchanged when it has no entry. Falling back instead of indexing
        directly avoids a KeyError that would fail the task (and with it
        the whole job) because of a single unexpected code.
    """
    # `.value` exposes the dict wrapped by the broadcast variable.
    return broadcastStates.value.get(code, code)

# Keep the first three fields of each record and replace the state code
# (fourth field) with its full name, then bring the rows back to the driver.
result = (
    rdd
    .map(lambda rec: (*rec[:3], state_convert(rec[3])))
    .collect()
)
print(result)


[('James', 'Smith', 'USA', 'California'), ('Michael', 'Rose', 'USA', 'New York'), ('Robert', 'Williams', 'USA', 'California'), ('Maria', 'Jones', 'USA', 'Florida')]
