In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_date

# Reuse the active Spark session if one exists; otherwise create it.
spark = SparkSession.builder.getOrCreate()

# Sample e-commerce events. Each tuple follows the field order given in `columns`.
data = [
    (1, 101, 1001, "purchase", "2024-01-01", 500),
    (2, 102, 1002, "view", "2024-01-01", 300),
    (3, 101, 1003, "purchase", "2024-01-02", 700),
    (4, 103, 1001, "cart", "2024-01-02", 200),
    (5, 104, 1002, "purchase", "2024-01-03", 900),
    (6, 101, 1001, "view", "2024-01-03", 150),
    (7, 105, 1003, "purchase", "2024-01-04", 1200),
    (8, 102, 1001, "purchase", "2024-01-04", 650),
]

columns = ["event_id", "user_id", "product_id", "event_type", "event_date", "price"]

# Build the DataFrame, then replace the string event_date with the result of to_date().
df = (
    spark.createDataFrame(data, columns)
    .withColumn("event_date", to_date(col("event_date")))
)


In [0]:
# Preview the events DataFrame built in the setup cell.
display(df)

In [0]:
# Configure a Delta writer that overwrites existing data (and, via overwriteSchema,
# the existing schema), then save it as the unpartitioned silver events table.
events_writer = (
    df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
)
events_writer.saveAsTable("ecommerce.silver.events")


In [0]:
%sql
-- Build a partitioned copy of ecommerce.silver.events.
-- CREATE OR REPLACE keeps this cell re-runnable: a plain CREATE TABLE fails once
-- events_part already exists (for example on a second "Run All").
-- NOTE(review): the timing queries in this notebook filter on user_id, which is not
-- one of the partition columns below; confirm this partitioning matches the intended
-- access pattern.
CREATE OR REPLACE TABLE ecommerce.silver.events_part
USING DELTA
PARTITIONED BY (event_date, event_type)
AS
SELECT * FROM ecommerce.silver.events;


In [0]:
%sql
-- Compact the partitioned table and Z-order its files by the two id columns.
OPTIMIZE ecommerce.silver.events_part
  ZORDER BY (user_id, product_id);


In [0]:
import time

# Baseline: time a user_id lookup against the unpartitioned table.
# perf_counter() is a monotonic, high-resolution clock intended for measuring elapsed
# time; time.time() is wall-clock and can jump if the system clock is adjusted.
# NOTE(review): this is a single run over the 8 sample rows, so the figure is only
# indicative; repeat the query if a stable comparison is needed.
start = time.perf_counter()
spark.sql("""
SELECT *
FROM ecommerce.silver.events
WHERE user_id = 101
""").count()
print("Before Time:", time.perf_counter() - start)


In [0]:
# Same user_id lookup, now against the partitioned table (events_part).
# Relies on `time` having been imported by the preceding timing cell.
# perf_counter() is a monotonic, high-resolution clock intended for measuring elapsed
# time; time.time() is wall-clock and can jump if the system clock is adjusted.
start = time.perf_counter()
spark.sql("""
SELECT *
FROM ecommerce.silver.events_part
WHERE user_id = 101
""").count()
print("After Time:", time.perf_counter() - start)
