TABLE 1

In [0]:
from pyspark.sql import SparkSession
import random
from datetime import datetime, timedelta

# Create a Spark session
spark = SparkSession.builder \
    .appName("Large DataFrame Example") \
    .getOrCreate()

# Sample function to generate random rows
def generate_row(i):
    return (
        i,
        f"user_{random.randint(1, 100000)}",
        round(random.uniform(100.0, 1000.0), 2),
        random.choice(["A", "B", "C", "D"]),
        datetime(2020, 1, 1) + timedelta(days=random.randint(0, 365 * 3))
    )

# Generate 1 million rows of data
data = [generate_row(i) for i in range(5_000_000)]

# Define schema
columns = ["id", "username", "purchase_amount", "category", "signup_date"]

# Create DataFrame
df = spark.createDataFrame(data, schema=columns)

In [0]:
df.write.format('parquet').mode("overwrite").save('dbfs:/user/hive/warehouse/default/parquet_table_1')

TABLE 2

In [0]:
from pyspark.sql import SparkSession
import random
from datetime import datetime, timedelta

# Create a Spark session
spark = SparkSession.builder \
    .appName("Large DataFrame Example") \
    .getOrCreate()

# Sample function to generate random rows
def generate_row(i):
    return (
        i,
        f"user_{random.randint(1, 100000)}",
        round(random.uniform(100.0, 1000.0), 2),
        random.choice(["A", "B", "C", "D"]),
        datetime(2020, 1, 1) + timedelta(days=random.randint(0, 365 * 3))
    )

# Generate 1 million rows of data
data = [generate_row(i) for i in range(3_000_000)]

# Define schema
columns = ["id", "username", "purchase_amount", "category", "signup_date"]

# Create DataFrame
df2 = spark.createDataFrame(data, schema=columns)

In [0]:
df2.write.format('parquet').mode("overwrite").save('dbfs:/user/hive/warehouse/default/parquet_table_2')

In [0]:
spark\
    .read.parquet('dbfs:/user/hive/warehouse/default/parquet_table_1')\
    .write.saveAsTable('hive_metastore.default.parquet_table_1')

In [0]:
spark\
    .read.parquet('dbfs:/user/hive/warehouse/default/parquet_table_2')\
    .write.saveAsTable('hive_metastore.default.parquet_table_2')

In [0]:
%fs
ls dbfs:/user/hive/warehouse/default/parquet_table_1

In [0]:
%fs
ls dbfs:/user/hive/warehouse/default/parquet_table_2/

In [0]:
df = spark.read.table('parquet_table_1')
df2 = spark.read.table('parquet_table_2')

In [0]:
df.join(df2, df.id == df2.id, 'inner').explain()

# == Physical Plan ==
# AdaptiveSparkPlan isFinalPlan=false
# +- SortMergeJoin [id#21L], [id#36L], Inner
#    :- Sort [id#21L ASC NULLS FIRST], false, 0
#    :  +- Exchange hashpartitioning(id#21L, 200), ENSURE_REQUIREMENTS, [plan_id=59]
#    :     +- Project [id#21L, username#22, purchase_amount#23, category#24, signup_date#25]
#    :        +- Filter (if (isnotnull(_databricks_internal_edge_computed_column_skip_row#351)) (_databricks_internal_edge_computed_column_skip_row#351 = false) else isnotnull(raise_error(DELTA_SKIP_ROW_COLUMN_NOT_FILLED, map(keys: [], values: []), NullType)) AND isnotnull(id#21L))
#    :           +- FileScan parquet dbw_lakehouse_dev.default.parquet_table_1[id#21L,username#22,purchase_amount#23,category#24,signup_date#25,_databricks_internal_edge_computed_column_skip_row#351] Batched: true, DataFilters: [isnotnull(id#21L)], Format: Parquet, Location: PreparedDeltaFileIndex(1 paths)[abfss://unity-catalog-storage@dbstorage24ijpia5cltgk.dfs.core.win..., PartitionFilters: [], PushedFilters: [IsNotNull(id)], ReadSchema: struct<id:bigint,username:string,purchase_amount:double,category:string,signup_date:timestamp,_da...
#    +- Sort [id#36L ASC NULLS FIRST], false, 0
#       +- Exchange hashpartitioning(id#36L, 200), ENSURE_REQUIREMENTS, [plan_id=60]
#          +- Project [id#36L, username#37, purchase_amount#38, category#39, signup_date#40]
#             +- Filter (if (isnotnull(_databricks_internal_edge_computed_column_skip_row#352)) (_databricks_internal_edge_computed_column_skip_row#352 = false) else isnotnull(raise_error(DELTA_SKIP_ROW_COLUMN_NOT_FILLED, map(keys: [], values: []), NullType)) AND isnotnull(id#36L))
#                +- FileScan parquet dbw_lakehouse_dev.default.parquet_table_2[id#36L,username#37,purchase_amount#38,category#39,signup_date#40,_databricks_internal_edge_computed_column_skip_row#352] Batched: true, DataFilters: [isnotnull(id#36L)], Format: Parquet, Location: PreparedDeltaFileIndex(1 paths)[abfss://unity-catalog-storage@dbstorage24ijpia5cltgk.dfs.core.win..., PartitionFilters: [], PushedFilters: [IsNotNull(id)], ReadSchema: struct<id:bigint,username:string,purchase_amount:double,category:string,signup_date:timestamp,_da...


Bucket

In [0]:
df.write.format("parquet") \
    .mode("overwrite") \
    .bucketBy(6, "id") \
    .saveAsTable("hive_metastore.default.bucket_parquet_table_1", path="dbfs:/user/hive/warehouse/default/bucket_parquet_table_1")

In [0]:
df2.write.format("parquet") \
    .mode("overwrite") \
    .bucketBy(6, "id") \
    .saveAsTable("hive_metastore.default.bucket_parquet_table_2", path="dbfs:/user/hive/warehouse/default/bucket_parquet_table_2")

In [0]:
bucket_parquet_table_1 = spark.table("hive_metastore.default.bucket_parquet_table_1")
bucket_parquet_table_2 = spark.table("hive_metastore.default.bucket_parquet_table_2")

In [0]:
bucket_parquet_table_1.join(bucket_parquet_table_2, bucket_parquet_table_1.id == bucket_parquet_table_2.id, 'inner').explain()

# == Physical Plan ==
# AdaptiveSparkPlan isFinalPlan=false
# +- SortMergeJoin [id#3238L], [id#3248L], Inner
#    :- Sort [id#3238L ASC NULLS FIRST], false, 0
#    :  +- Filter isnotnull(id#3238L)
#    :     +- FileScan parquet hive_metastore.default.bucket_parquet_table_1[id#3238L,username#3239,purchase_amount#3240,category#3241,signup_date#3242] Batched: true, Bucketed: true, DataFilters: [isnotnull(id#3238L)], Format: Parquet, Location: InMemoryFileIndex(1 paths)[dbfs:/user/hive/warehouse/default/bucket_parquet_table_1], PartitionFilters: [], PushedFilters: [IsNotNull(id)], ReadSchema: struct<id:bigint,username:string,purchase_amount:double,category:string,signup_date:timestamp>, SelectedBucketsCount: 6 out of 6
#    +- Sort [id#3248L ASC NULLS FIRST], false, 0
#       +- Filter isnotnull(id#3248L)
#          +- FileScan parquet hive_metastore.default.bucket_parquet_table_2[id#3248L,username#3249,purchase_amount#3250,category#3251,signup_date#3252] Batched: true, Bucketed: true, DataFilters: [isnotnull(id#3248L)], Format: Parquet, Location: InMemoryFileIndex(1 paths)[dbfs:/user/hive/warehouse/default/bucket_parquet_table_2], PartitionFilters: [], PushedFilters: [IsNotNull(id)], ReadSchema: struct<id:bigint,username:string,purchase_amount:double,category:string,signup_date:timestamp>, SelectedBucketsCount: 6 out of 6

In [0]:
%fs
ls dbfs:/user/hive/warehouse/default/bucket_parquet_table_1/

In [0]:
%fs
ls dbfs:/user/hive/warehouse/default/bucket_parquet_table_2/

In [0]:
spark.conf.set("spark.databricks.delta.formatCheck.enabled", "false")
display(spark.read.parquet('dbfs:/user/hive/warehouse/default/bucket_parquet_table_1/part-00000-tid-7955972004986354342-491a57ba-3200-4226-b645-45fd6d36fa8b-65-3_00002.c000.snappy.parquet').select('id').distinct())

In [0]:
spark.conf.set("spark.databricks.delta.formatCheck.enabled", "false")
display(spark.read.parquet('dbfs:/user/hive/warehouse/default/bucket_parquet_table_2/part-00000-tid-7249542870130387305-7fd44ef9-9ca8-4e05-b078-4b978aa306fb-69-3_00002.c000.snappy.parquet').select('id').distinct())