In [0]:
from pyspark.sql import SparkSession

# Obtain the session explicitly so this cell also runs on a fresh kernel where
# no `spark` global has been injected yet. getOrCreate() returns the already
# running session when there is one, so later getOrCreate() calls see the
# settings applied below.
spark = SparkSession.builder.appName("SkewJoinExample").getOrCreate()

# Switch off Adaptive Query Execution and its skew-join handling for this
# session (presumably so the joins below run without automatic skew
# mitigation -- confirm intent with the notebook author).
spark.conf.set("spark.sql.adaptive.enabled", "false")
spark.conf.set("spark.sql.adaptive.skewJoin.enabled", "false")

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, rand, floor

spark = (
    SparkSession.builder
    .appName("SkewJoinExample")
    .getOrCreate()
)

# Small lookup table: department ids 1..20, each named by its id cast to string.
departments = (
    spark.range(1, 21)
    .withColumnRenamed("id", "department_id")
    .withColumn("department_name", col("department_id").cast("string"))
)

# departments.write.mode("overwrite").saveAsTable("departments")

In [0]:
from pyspark.sql.functions import when

# Create a large number of employees, deliberately skewed toward department_id = 1.
num_employees = 1_000_000

# Fixed seeds make the generated data repeatable across re-runs (for a given
# partitioning of the input range). The two seeds are distinct on purpose so
# the "skew or not" draw and the "which department" draw stay independent.
skew_seed = 42
department_seed = 43

employees = (
    spark.range(1, num_employees + 1)
    .withColumnRenamed("id", "employee_id")
    .withColumn(
        "department_id",
        # About 60% of rows are forced into department 1; the remaining rows are
        # spread uniformly over departments 1-20, so department 1 ends up with
        # roughly 62% of all employees.
        when(rand(seed=skew_seed) < 0.6, lit(1))
        .otherwise(floor(rand(seed=department_seed) * 20 + 1)),
    )
    # Employee name is simply the employee id rendered as a string.
    .withColumn("employee_name", col("employee_id").cast("string"))
)

# employees.write.mode("overwrite").saveAsTable("employees")


In [0]:
# Baseline: inner equi-join of employees to departments on department_id, with no join hint.
join_df = employees.join(
    departments,
    on="department_id",
    how="inner",
)

In [0]:
# Show the baseline (un-hinted) join result. `display` is not defined in this
# notebook; it is assumed to be provided by the notebook runtime -- TODO confirm.
display(join_df)

In [0]:
# Same join on department_id, but with a "skew" hint attached to the employees
# side naming department_id as the skewed column.
# NOTE(review): whether the "skew" hint is honored depends on the Spark runtime -- confirm.
join_df2 = (
    employees
    .hint("skew", "department_id")
    .join(departments, on="department_id")
)

In [0]:
# Show the result of the skew-hinted join. `display` is not defined in this
# notebook; it is assumed to be provided by the notebook runtime -- TODO confirm.
display(join_df2)