### Broadcast

A broadcast variable is a read-only shared variable that is cached on every node in the cluster so that all tasks can access it. PySpark distributes broadcast variables to the workers using an efficient broadcast algorithm to reduce communication cost.

In [None]:
from pyspark.sql import SparkSession
# Reuse an existing session if one is running, otherwise start one named 'learning'
spark = (
    SparkSession.builder
    .appName('learning')
    .getOrCreate()
)

In [None]:
# Sample (name, key) records and the column names used for the DataFrame
data = [('padhu', '1'), ('sam', '2')]
schema = ['name', 'count']

# Lookup table from key to word, wrapped in a broadcast variable
global_data = {
    '1': 'one',
    '2': 'two',
    '3': 'three',
}
broadcastData = spark.sparkContext.broadcast(global_data)


### Working with RDD

In [None]:
# Distribute the sample records as an RDD, requesting three partitions
rdd = spark.sparkContext.parallelize(data, numSlices=3)

# Display the number of partitions of the RDD
rdd.getNumPartitions()

3

In [None]:
# Replace the key (second field) of every record with the word stored
# under that key in the broadcast dictionary; the name (first field) is kept.
stage = rdd.map(
    lambda record: (record[0], broadcastData.value[record[1]])
)

# Bring the mapped records back to the driver and print them
result = stage.collect()
print(result)

[('padhu', 'one'), ('sam', 'two')]


### Working with DataFrame

In [None]:
# Build a DataFrame from the sample records, naming the columns via `schema`
df = spark.createDataFrame(data, schema=schema)

# Display the partition count of the DataFrame's underlying RDD
df.rdd.getNumPartitions()

2

In [None]:
# DataFrames have no map(); rather than converting df to an RDD, wrap the broadcast lookup in a UDF


from pyspark.sql.functions import col,udf

def getCount(count):
    """Translate a key into its word using the broadcast lookup table.

    Args:
        count: Key to look up in ``broadcastData.value``. May be ``None``
            (e.g. when the source column holds a null).

    Returns:
        The value stored under ``count``, or ``None`` when the key is not
        present in the broadcast dictionary.
    """
    # dict.get instead of [] so a null or unmapped key produces a null result
    # rather than a KeyError, which would abort the query when run as a UDF.
    return broadcastData.value.get(count)

# Wrap the Python lookup so it can be applied to DataFrame columns
getCountUDF = udf(getCount)

# Keep the name column and add the looked-up word for each row's key
modified_df = df.select(
    col("name"),
    getCountUDF(col("count")),
)
modified_df.show()

+-----+---------------+
| name|getCount(count)|
+-----+---------------+
|padhu|            one|
|  sam|            two|
+-----+---------------+



### Broadcast in the filter condition

In [None]:
# Keys of the broadcast dictionary (iterating a dict yields its keys)
list_of_keys = list(broadcastData.value)

# Keep only the rows whose `count` value is one of those keys
filterDF = df.filter(df['count'].isin(list_of_keys))
filterDF.show()

+-----+-----+
| name|count|
+-----+-----+
|padhu|    1|
|  sam|    2|
+-----+-----+



In [None]:
# The broadcast payload is a plain dict: print its values view ...
print(broadcastData.value.values())

# ... and its keys as a list
print(list(broadcastData.value))

dict_values(['one', 'two', 'three'])
['1', '2', '3']
