In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# Obtain the Spark session used by every cell below; getOrCreate() hands back
# an already-active session when one exists instead of building a second one.
spark = (
    SparkSession.builder
    .appName("LargerDataFrames")
    .getOrCreate()
)

# Employee Data (emp)
# Column layout of the employee table; every column is declared nullable.
emp_schema = (
    StructType()
    .add("emp_id", IntegerType(), True)
    .add("emp_name", StringType(), True)
    .add("dept_id", IntegerType(), True)
    .add("salary", IntegerType(), True)
)

# One tuple per employee, in schema order: (emp_id, emp_name, dept_id, salary).
emp_data = [
    (1, "Alice", 101, 60000),
    (2, "Bob", 102, 70000),
    (3, "Charlie", 103, 50000),
    (4, "David", 101, 45000),
    (5, "Eva", 104, 75000),
    (6, "Frank", 102, 72000),
    (7, "Grace", 103, 48000),
    (8, "Hank", 105, 80000),
    (9, "Ivy", 106, 67000),
    (10, "Jack", 101, 51000),
    (11, "Karen", 102, 62000),
    (12, "Leo", 104, 59000),
    (13, "Mona", 105, 85000),
    (14, "Nate", 106, 64000),
    (15, "Olivia", 101, 56000),
]

# Materialise the rows as a DataFrame with the explicit schema above.
emp = spark.createDataFrame(emp_data, emp_schema)

# Department Data (dept)
# Column layout of the department lookup table; both columns are nullable.
dept_schema = (
    StructType()
    .add("dept_id", IntegerType(), True)
    .add("dept_name", StringType(), True)
)

# One tuple per department, in schema order: (dept_id, dept_name).
dept_data = [
    (101, "HR"),
    (102, "Engineering"),
    (103, "Marketing"),
    (104, "Finance"),
    (105, "Sales"),
    (106, "IT"),
]

# Materialise the rows as a DataFrame with the explicit schema above.
dept = spark.createDataFrame(dept_data, dept_schema)

In [0]:
# Inner join of employees to departments on an explicit column-equality
# expression (both inputs' dept_id columns remain in the result).
#
# DataFrame.explain() only prints the physical plan and returns None, so the
# joined DataFrame is bound to join_df first and explained as a separate step;
# chaining .explain() onto the assignment would leave join_df set to None.
join_df = emp.join(dept, emp.dept_id == dept.dept_id)
join_df.explain()

# == Physical Plan ==
# AdaptiveSparkPlan isFinalPlan=false
# +- SortMergeJoin [dept_id#216], [dept_id#222], Inner
#    :- Sort [dept_id#216 ASC NULLS FIRST], false, 0
#    :  +- Exchange hashpartitioning(dept_id#216, 200), ENSURE_REQUIREMENTS, [plan_id=391]
#    :     +- Filter isnotnull(dept_id#216)
#    :        +- Scan ExistingRDD[emp_id#214,emp_name#215,dept_id#216,salary#217]
#    +- Sort [dept_id#222 ASC NULLS FIRST], false, 0
#       +- Exchange hashpartitioning(dept_id#222, 200), ENSURE_REQUIREMENTS, [plan_id=392]
#          +- Filter isnotnull(dept_id#222)
#             +- Scan ExistingRDD[dept_id#222,dept_name#223]

In [0]:
from pyspark.sql.functions import broadcast

# Same inner join, but with a broadcast hint on the dept side so the planner
# can pick a broadcast hash join instead of shuffling both inputs.
#
# DataFrame.explain() only prints the physical plan and returns None, so the
# joined DataFrame is bound to join_df_broadcast first and explained as a
# separate step; chaining .explain() onto the assignment would store None.
join_df_broadcast = emp.join(broadcast(dept), on="dept_id", how="inner")
join_df_broadcast.explain()

# == Physical Plan ==
# AdaptiveSparkPlan isFinalPlan=false
# +- Project [dept_id#216, emp_id#214, emp_name#215, salary#217, dept_name#223]
#    +- BroadcastHashJoin [dept_id#216], [dept_id#222], Inner, BuildRight, false, true
#       :- Filter isnotnull(dept_id#216)
#       :  +- Scan ExistingRDD[emp_id#214,emp_name#215,dept_id#216,salary#217]
#       +- Exchange SinglePartition, EXECUTOR_BROADCAST, [plan_id=474]
#          +- Filter isnotnull(dept_id#222)
#             +- Scan ExistingRDD[dept_id#222,dept_name#223]