In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType

# Initialize Spark Session (if not already running)
# spark = SparkSession.builder.appName("SalaryDataframes").getOrCreate()

# 1. Define the Schema
employee_schema = StructType([
    StructField("SalaryDataID", IntegerType(), True),
    StructField("CalendarYear", IntegerType(), True),
    StructField("EmployeeName", StringType(), True),
    StructField("Department", StringType(), True),
    StructField("JobTitle", StringType(), True),
    StructField("salary", DoubleType(), True)
])

# 2. Define the Data
employee_data = [
    (1, 2008, 'Glover, Eugenia', 'Louisville Metro Police', 'Police Officer', 45952.98),
    (2, 2008, 'Embry, James', 'Louisville Metro Police', 'Police Officer', 37221.94),
    (3, 2008, 'Detalente, Frank', 'Louisville Metro Police', 'Police Officer', 70366.42),
    (4, 2008, 'Passafiume, Donald', 'Louisville Metro Police', 'Police Officer', 55602.99),
    (5, 2008, 'Hendricks, Maurice', 'Louisville Metro Police', 'Police Officer', 56916.34),
    (6, 2008, 'Maye, Barbara', 'Neighborhoods', 'Business Specialist', 34904.11),
    (7, 2008, 'Compton, Terry', 'Louisville Metro Police', 'Police Officer', 62047.26),
    (8, 2008, 'Hawkins, Benton', 'Louisville Metro Police', 'Police Officer', 57590.21),
    (9, 2008, 'Sanders, Rebecca', 'Louisville Metro Police', 'Police Officer', 24886.39),
    (10, 2008, 'Utsey, Darren', 'Louisville Metro Police', 'Police Officer', 59775.4),
    (11, 2008, 'Barry, Elva', 'Parks&Recreation', 'Business Specialist', 35937.08),
    (12, 2008, 'Haworth, Jessica', 'Louisville Metro Police', 'Police Officer', 56886.21),
    (13, 2008, 'Hanifen, Patricia', 'Louisville Metro Police', 'Police Officer', 23061.67)
]

# 3. Create the DataFrame
df_employees = spark.createDataFrame(employee_data, schema=employee_schema)
df_employees.show()



# 1. Define the Schema
bonus_schema = StructType([
    StructField("min_salary", IntegerType(), True),
    StructField("max_salary", IntegerType(), True),
    StructField("bonus_percent", DoubleType(), True)
])

# 2. Define the Data
bonus_data = [
    (1, 20000, 2.0),
    (20001, 30000, 3.0),
    (30001, 50000, 3.5),
    (50001, 65000, 3.7),
    (65001, 100000, 3.9)
]

# 3. Create the DataFrame
df_bonus = spark.createDataFrame(bonus_data, schema=bonus_schema)
df_bonus.show()


+------------+------------+------------------+--------------------+-------------------+--------+
|SalaryDataID|CalendarYear|      EmployeeName|          Department|           JobTitle|  salary|
+------------+------------+------------------+--------------------+-------------------+--------+
|           1|        2008|   Glover, Eugenia|Louisville Metro ...|     Police Officer|45952.98|
|           2|        2008|      Embry, James|Louisville Metro ...|     Police Officer|37221.94|
|           3|        2008|  Detalente, Frank|Louisville Metro ...|     Police Officer|70366.42|
|           4|        2008|Passafiume, Donald|Louisville Metro ...|     Police Officer|55602.99|
|           5|        2008|Hendricks, Maurice|Louisville Metro ...|     Police Officer|56916.34|
|           6|        2008|     Maye, Barbara|       Neighborhoods|Business Specialist|34904.11|
|           7|        2008|    Compton, Terry|Louisville Metro ...|     Police Officer|62047.26|
|           8|        2008|   

In [0]:
employees_df.alias("e").join(bonus_df.alias("b"), 
                            sf.col("e.salary").between(sf.col("b.min_salary"), sf.col("b.max_salary")), "left") \
            .explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- == Initial Plan ==
   ColumnarToRow
   +- PhotonResultStage
      +- PhotonBroadcastNestedLoopJoin LeftOuter, BuildRight, ((salary#13328 >= cast(min_salary#13339 as double)) AND (salary#13328 <= cast(max_salary#13340 as double)))
         :- PhotonRowToColumnar
         :  +- LocalTableScan [SalaryDataID#13323, CalendarYear#13324, EmployeeName#13325, Department#13326, JobTitle#13327, salary#13328, temp_col#13356]
         +- PhotonShuffleExchangeSource
            +- PhotonShuffleMapStage EXECUTOR_BROADCAST, [id=#8280]
               +- PhotonShuffleExchangeSink SinglePartition
                  +- PhotonRowToColumnar
                     +- LocalTableScan [min_salary#13339, max_salary#13340, bonus_percent#13341, temp_col#13353]


== Photon Explanation ==
The query is fully supported by Photon.


In [0]:
from pyspark.sql import functions as sf


bonus_df = df_bonus.withColumn("temp_col", sf.lit(1))
employees_df = df_employees.withColumn("temp_col", sf.lit(1))

employees_df.alias("e").join(bonus_df.alias("b"), 
                            (sf.col("e.salary").between(sf.col("b.min_salary"), sf.col("b.max_salary"))) & 
                            (sf.col("e.temp_col")==sf.col("b.temp_col")), "left")\
            .explain(True)


== Parsed Logical Plan ==
'Join LeftOuter, 'and('and('`>=`('e.salary, 'b.min_salary), '`<=`('e.salary, 'b.max_salary)), '`==`('e.temp_col, 'b.temp_col))
:- SubqueryAlias e
:  +- Project [SalaryDataID#13323, CalendarYear#13324, EmployeeName#13325, Department#13326, JobTitle#13327, salary#13328, 1 AS temp_col#13414]
:     +- LocalRelation [SalaryDataID#13323, CalendarYear#13324, EmployeeName#13325, Department#13326, JobTitle#13327, salary#13328]
+- SubqueryAlias b
   +- Project [min_salary#13339, max_salary#13340, bonus_percent#13341, 1 AS temp_col#13416]
      +- LocalRelation [min_salary#13339, max_salary#13340, bonus_percent#13341]

== Analyzed Logical Plan ==
SalaryDataID: int, CalendarYear: int, EmployeeName: string, Department: string, JobTitle: string, salary: double, temp_col: int, min_salary: int, max_salary: int, bonus_percent: double, temp_col: int
Join LeftOuter, (((salary#13328 >= cast(min_salary#13339 as double)) AND (salary#13328 <= cast(max_salary#13340 as double))) AND (

In [0]:
from pyspark.sql import functions as sf


bonus_df = df_bonus.withColumn("temp_col", sf.lit(1))
employees_df = df_employees.withColumn("temp_col", sf.lit(1))

employees_df.alias("e").join(bonus_df.alias("b"), 
                            (sf.col("e.salary").between(sf.col("b.min_salary"), sf.col("b.max_salary"))) & 
                            (sf.col("e.temp_col")==sf.col("b.temp_col")), "left")\
            .show()

+------------+------------+------------------+--------------------+-------------------+--------+--------+----------+----------+-------------+--------+
|SalaryDataID|CalendarYear|      EmployeeName|          Department|           JobTitle|  salary|temp_col|min_salary|max_salary|bonus_percent|temp_col|
+------------+------------+------------------+--------------------+-------------------+--------+--------+----------+----------+-------------+--------+
|           1|        2008|   Glover, Eugenia|Louisville Metro ...|     Police Officer|45952.98|       1|     30001|     50000|          3.5|       1|
|           3|        2008|  Detalente, Frank|Louisville Metro ...|     Police Officer|70366.42|       1|     65001|    100000|          3.9|       1|
|           2|        2008|      Embry, James|Louisville Metro ...|     Police Officer|37221.94|       1|     30001|     50000|          3.5|       1|
|           4|        2008|Passafiume, Donald|Louisville Metro ...|     Police Officer|55602.9

In [0]:
"""
Ex: In a real-time scenario, Letâ€™s consider one partition has 100K lines, and broadcasted table has 20K records.
So with the above calculation, for each partition, BroadcastNextedLoopJoin will iterate over 2,000,000,000 times. Which could lead to performance issues. Here we have considered that all partitions are evenly distributed(Maybe before this step, we have to perform other join or group by or window operations). Which is not the case all the time.

Spark will use Broadcast Hash Join if we add one equal condition in join like below.

(sf.col("tbl1_col1").between(sf.col("tbl2_col1"), sf.col("tbl2_col2"))) &((sf.col("tbl1_temp_col2")==sf.col("tab2_temp_col2"))

But how we will get this new column, which we can use in the joining condition and it does not impact the joining outcome.
Let us try to add default columns in both Dataframe and use it in the joining condition.

bonus_df = bonus_df.withColumn("temp_clm", sf.lit(1))
emp_df = emp_df.withColumn("temp_clm", sf.lit(1))
"""