<a href="https://colab.research.google.com/github/luasampaio/data-engineering/blob/main/40_Broadcast.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Broadcasting DataFrames em PySpark

- Criado por: Luciana Sampaio
- Data: 11.03.25



In [25]:
%time

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import BooleanType

# Obtain the Spark session used by this example
spark = (
    SparkSession.builder
    .appName("Exemplo 01")
    .getOrCreate()
)

# DataFrame with a single `id` column holding 0..99
large_df = spark.range(0, 100)

# Wrap the lookup values in a broadcast variable so tasks can read them via `.value`
broadcast_var = spark.sparkContext.broadcast(
    [1, 2, 3, 4, 5]
)

# Predicate backed by the broadcast variable
def filter_data(value):
    """Return whether ``value`` is among the broadcast lookup values.

    Reads the module-level ``broadcast_var`` and tests membership against
    its ``.value`` payload.
    """
    allowed_values = broadcast_var.value
    return value in allowed_values

# Wrap the Python predicate as a Spark UDF that returns a boolean column
filter_data_udf = udf(filter_data, returnType=BooleanType())

# Keep only the rows whose `id` passes the broadcast-backed predicate, then print them
filtered_data = large_df.filter(
    filter_data_udf(col("id"))
)
filtered_data.show()

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 6.44 µs
+---+
| id|
+---+
|  1|
|  2|
|  3|
|  4|
|  5|
+---+



In [26]:
%time

from pyspark.sql import SparkSession
from pyspark.sql.functions import broadcast

# Initialize Spark session (getOrCreate() returns the already-running session if one exists)
spark = SparkSession.builder.appName("Broadcast Join Example").getOrCreate()

# Small lookup DataFrame: the side of the join that receives the broadcast hint
small_df = spark.createDataFrame([(1, "A"), (2, "B"), (3, "C")], ["id", "value"])

# Large DataFrame: spark.range() already names its single column `id`,
# so no toDF("id") rename is needed
large_df = spark.range(1000)

# Join on `id`, hinting Spark to broadcast the small side
joined_df = large_df.join(broadcast(small_df), on="id")

# Show the results
joined_df.show()

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 6.2 µs
+---+-----+
| id|value|
+---+-----+
|  1|    A|
|  2|    B|
|  3|    C|
+---+-----+



In [8]:
# Count the rows of large_df; the value is displayed as the cell's output
large_df.count()

1000

In [9]:
# Print the column names and types of large_df
large_df.printSchema()

root
 |-- id: long (nullable = false)



In [11]:
# Print only the first 3 rows of large_df
large_df.show(3)

+---+
| id|
+---+
|  0|
|  1|
|  2|
+---+
only showing top 3 rows



In [14]:
from pyspark.sql import SparkSession

# Obtain the Spark session for the category/sales example
spark = (
    SparkSession.builder
    .appName("Broadcasting Example")
    .getOrCreate()
)

# Small lookup table: one (category, discount) row per product category
category_data = [
    ("A", 10),
    ("B", 20),
    ("C", 30),
]
category_df = spark.createDataFrame(category_data, schema=["category", "discount"])

In [15]:
from pyspark.sql.functions import broadcast

# Wrap category_df with broadcast(), which marks it as small enough for broadcast joins
broadcast_category_df = broadcast(category_df)

In [16]:
# Print the rows of the broadcast-hinted category DataFrame
broadcast_category_df.show()

+--------+--------+
|category|discount|
+--------+--------+
|       A|      10|
|       B|      20|
|       C|      30|
+--------+--------+



Agora usando efetivamente o broadcast: o DataFrame pequeno marcado com `broadcast()` é usado em um join.

In [17]:
# Sales rows as (product, category, revenue): the other side of the join in this toy example
sales_data = [
    ("product1", "A", 100),
    ("product2", "B", 200),
    ("product3", "C", 300),
]
sales_df = spark.createDataFrame(sales_data, schema=["product", "category", "revenue"])

# Inner equi-join on `category` against the broadcast-hinted lookup DataFrame
result_df = sales_df.join(broadcast_category_df, on="category", how="inner")

In [18]:
# Print the joined rows (sales columns plus the matching discount)
result_df.show()

+--------+--------+-------+--------+
|category| product|revenue|discount|
+--------+--------+-------+--------+
|       A|product1|    100|      10|
|       B|product2|    200|      20|
|       C|product3|    300|      30|
+--------+--------+-------+--------+



In [19]:
# Print the sales DataFrame as it was before the join
sales_df.show()

+--------+--------+-------+
| product|category|revenue|
+--------+--------+-------+
|product1|       A|    100|
|product2|       B|    200|
|product3|       C|    300|
+--------+--------+-------+



In [21]:
# Print the broadcast-hinted category DataFrame again
broadcast_category_df.show()

+--------+--------+
|category|discount|
+--------+--------+
|       A|      10|
|       B|      20|
|       C|      30|
+--------+--------+

