TABLE 1

In [0]:
from pyspark.sql import SparkSession
import random
from datetime import datetime, timedelta

# Get the active Spark session, or build one named "Large DataFrame Example"
# if none exists yet.
spark = (
    SparkSession.builder
    .appName("Large DataFrame Example")
    .getOrCreate()
)

def generate_row(i):
    """Build one synthetic record for row index ``i``.

    Returns a 5-tuple ``(id, username, purchase_amount, category, signup_date)``.
    The id is ``i`` itself; every other field is drawn from the module-level
    ``random`` generator, so results are not reproducible unless it is seeded.
    """
    # Values are drawn in the same order as the fields of the returned tuple.
    username = f"user_{random.randint(1, 100000)}"
    purchase_amount = round(random.uniform(100.0, 1000.0), 2)
    category = random.choice(["A", "B", "C", "D"])
    # Signup date lies 0 to 365 * 3 days (inclusive) after 2020-01-01.
    signup_offset_days = random.randint(0, 365 * 3)
    signup_date = datetime(2020, 1, 1) + timedelta(days=signup_offset_days)
    return (i, username, purchase_amount, category, signup_date)

# Number of synthetic rows for table 1 (5 million).
NUM_ROWS_TABLE_1 = 5_000_000

# Build every row on the driver as a Python list of tuples;
# ids run from 0 to NUM_ROWS_TABLE_1 - 1.
data = [generate_row(i) for i in range(NUM_ROWS_TABLE_1)]

# Column names, in the same order as the tuple returned by generate_row.
# Only names are given, so Spark infers the column types from the data.
columns = ["id", "username", "purchase_amount", "category", "signup_date"]

# Create DataFrame
df = spark.createDataFrame(data, schema=columns)

In [0]:
# Write df as parquet files, replacing any previous output at this path.
(
    df.write
    .format('parquet')
    .mode("overwrite")
    .save('dbfs:/user/hive/warehouse/default/parquet_table_1')
)

TABLE 2

In [0]:
from pyspark.sql import SparkSession
import random
from datetime import datetime, timedelta

# getOrCreate() hands back the already-running session when there is one,
# otherwise it builds a new session with this application name.
spark = SparkSession.builder.appName("Large DataFrame Example").getOrCreate()

def generate_row(i):
    """Return a random sample row whose id is ``i``.

    The result is the tuple
    ``(i, username, purchase_amount, category, signup_date)``; all fields
    except the id come from the global ``random`` generator.
    """
    first_signup_day = datetime(2020, 1, 1)
    categories = ["A", "B", "C", "D"]

    # Keep the draw order aligned with the field order of the tuple.
    user_number = random.randint(1, 100000)
    amount = random.uniform(100.0, 1000.0)
    category = random.choice(categories)
    days_after_first = random.randint(0, 365 * 3)

    return (
        i,
        f"user_{user_number}",
        round(amount, 2),  # keep two decimal places
        category,
        first_signup_day + timedelta(days=days_after_first),
    )

# Number of synthetic rows for table 2 (3 million).
NUM_ROWS_TABLE_2 = 3_000_000

# Build every row on the driver as a Python list of tuples;
# ids run from 0 to NUM_ROWS_TABLE_2 - 1.
data = [generate_row(i) for i in range(NUM_ROWS_TABLE_2)]

# Column names, in the same order as the tuple returned by generate_row.
# Only names are given, so Spark infers the column types from the data.
columns = ["id", "username", "purchase_amount", "category", "signup_date"]

# Create DataFrame
df2 = spark.createDataFrame(data, schema=columns)

In [0]:
# Write df2 as parquet files, replacing any previous output at this path.
(
    df2.write
    .format('parquet')
    .mode("overwrite")
    .save('dbfs:/user/hive/warehouse/default/parquet_table_2')
)

In [0]:
# Register the parquet files written for table 1 as the table 'parquet_table_1'.
# Without an explicit mode, saveAsTable raises if the table already exists,
# so overwrite is set to keep this cell re-runnable.
(
    spark.read.parquet('dbfs:/user/hive/warehouse/default/parquet_table_1')
    .write.mode("overwrite")
    .saveAsTable('parquet_table_1')
)

In [0]:
# Register the parquet files written for table 2 as the table 'parquet_table_2'.
# Without an explicit mode, saveAsTable raises if the table already exists,
# so overwrite is set to keep this cell re-runnable.
(
    spark.read.parquet('dbfs:/user/hive/warehouse/default/parquet_table_2')
    .write.mode("overwrite")
    .saveAsTable('parquet_table_2')
)

In [0]:
# List the contents of the table 1 parquet directory
# (dbutils form of the `%fs ls` magic).
display(dbutils.fs.ls('dbfs:/user/hive/warehouse/default/parquet_table_1'))

In [0]:
# List the contents of the table 2 parquet directory
# (dbutils form of the `%fs ls` magic).
display(dbutils.fs.ls('dbfs:/user/hive/warehouse/default/parquet_table_2/'))

In [0]:
# Re-bind df / df2 to the saved tables so the joins below read from the
# tables rather than from the in-memory generated data.
df = spark.table('parquet_table_1')
df2 = spark.table('parquet_table_2')

In [0]:
# Print the physical plan Spark chooses for the inner join on id.
(
    df
    .join(df2, on=(df.id == df2.id), how='inner')
    .explain()
)

In [0]:
# Run the inner join on id and render the result.
display(
    df.join(df2, on=(df.id == df2.id), how='inner')
)

Bucket

In [0]:
# Save df as a parquet table split into 6 buckets on id, stored at an
# explicit path and replacing any previous version of the table.
(
    df.write
    .format("parquet")
    .mode("overwrite")
    .bucketBy(6, "id")
    .saveAsTable(
        "hive_metastore.default.bucket_parquet_table_1",
        path="dbfs:/user/hive/warehouse/default/bucket_parquet_table_1",
    )
)

In [0]:
# Save df2 as a parquet table split into 6 buckets on id, stored at an
# explicit path and replacing any previous version of the table.
(
    df2.write
    .format("parquet")
    .mode("overwrite")
    .bucketBy(6, "id")
    .saveAsTable(
        "hive_metastore.default.bucket_parquet_table_2",
        path="dbfs:/user/hive/warehouse/default/bucket_parquet_table_2",
    )
)

In [0]:
# Load both bucketed tables back by their fully qualified names.
bucket_parquet_table_1 = spark.read.table("hive_metastore.default.bucket_parquet_table_1")
bucket_parquet_table_2 = spark.read.table("hive_metastore.default.bucket_parquet_table_2")

In [0]:
# Print the physical plan for the inner join of the two bucketed tables on id.
(
    bucket_parquet_table_1
    .join(
        bucket_parquet_table_2,
        on=(bucket_parquet_table_1.id == bucket_parquet_table_2.id),
        how='inner',
    )
    .explain()
)

In [0]:
# Run the inner join of the two bucketed tables on id and render the result.
display(
    bucket_parquet_table_1.join(
        bucket_parquet_table_2,
        on=(bucket_parquet_table_1.id == bucket_parquet_table_2.id),
        how='inner',
    )
)

In [0]:
# List the data files of bucketed table 1 (dbutils form of the `%fs ls` magic).
display(dbutils.fs.ls('dbfs:/user/hive/warehouse/default/bucket_parquet_table_1/'))

In [0]:
# List the data files of bucketed table 2 (dbutils form of the `%fs ls` magic).
display(dbutils.fs.ls('dbfs:/user/hive/warehouse/default/bucket_parquet_table_2/'))

In [0]:
# NOTE(review): Databricks-specific setting; presumably switches off the check
# that rejects reading a path with a non-Delta reader. Confirm it is still
# needed for these parquet tables.
spark.conf.set("spark.databricks.delta.formatCheck.enabled", "false")

# Show the distinct ids stored in bucket 2 of bucketed table 1.
# Bucketed data files end in "_<bucket id>" before the extension ("_00002" is
# bucket 2). The task/tid/uuid parts of the name are regenerated by every
# overwrite, so the files are matched with a glob instead of one hard-coded name.
display(
    spark.read
    .parquet('dbfs:/user/hive/warehouse/default/bucket_parquet_table_1/part-*_00002.c000.snappy.parquet')
    .select('id')
    .distinct()
)

In [0]:
# NOTE(review): Databricks-specific setting; presumably switches off the check
# that rejects reading a path with a non-Delta reader. Confirm it is still
# needed for these parquet tables.
spark.conf.set("spark.databricks.delta.formatCheck.enabled", "false")

# Show the distinct ids stored in bucket 2 of bucketed table 2.
# Bucketed data files end in "_<bucket id>" before the extension ("_00002" is
# bucket 2). The task/tid/uuid parts of the name are regenerated by every
# overwrite, so the files are matched with a glob instead of one hard-coded name.
display(
    spark.read
    .parquet('dbfs:/user/hive/warehouse/default/bucket_parquet_table_2/part-*_00002.c000.snappy.parquet')
    .select('id')
    .distinct()
)