In [0]:
# Build the purchase-only query against the source table, then print its
# extended plan (the parsed / analyzed / optimized stages appear in the output).
purchase_events_df = spark.sql("""
SELECT *
FROM ecommerce.silver.silver_events
WHERE event_type = 'purchase'
""")
purchase_events_df.explain(extended=True)

== Parsed Logical Plan ==
'Project [*]
+- 'Filter ('event_type = purchase)
   +- 'UnresolvedRelation [ecommerce, silver, silver_events], [], false

== Analyzed Logical Plan ==
event_time: timestamp, event_type: string, product_id: int, category_id: bigint, category_code: string, brand: string, price: double, user_id: int, user_session: string
Project [event_time#13189, event_type#13190, product_id#13191, category_id#13192L, category_code#13193, brand#13194, price#13195, user_id#13196, user_session#13197]
+- Filter (event_type#13190 = purchase)
   +- SubqueryAlias ecommerce.silver.silver_events
      +- Relation ecommerce.silver.silver_events[event_time#13189,event_type#13190,product_id#13191,category_id#13192L,category_code#13193,brand#13194,price#13195,user_id#13196,user_session#13197] parquet

== Optimized Logical Plan ==
Filter (isnotnull(event_type#13190) AND (event_type#13190 = purchase))
+- Relation ecommerce.silver.silver_events[event_time#13189,event_type#13190,product_id#13191

In [0]:
%sql
-- Build a Delta copy of silver_events partitioned by (event_date, event_type).
-- CREATE OR REPLACE keeps this cell re-runnable: a plain CREATE TABLE fails
-- with "table already exists" on the second run (e.g. Restart & Run All).
CREATE OR REPLACE TABLE ecommerce.silver.silver_events_part
USING DELTA
PARTITIONED BY (event_date, event_type)
AS
SELECT
  DATE(event_time) AS event_date,  -- derived calendar date, used as a partition column
  *
FROM ecommerce.silver.silver_events;

num_affected_rows,num_inserted_rows


In [0]:
%sql
-- Run Delta OPTIMIZE on the partitioned table with no ZORDER clause.
-- The result row's `metrics` column reports what the command did.
OPTIMIZE ecommerce.silver.silver_events_part;


path,metrics
,"List(0, 0, List(null, null, 0.0, 0, 0), List(null, null, 0.0, 0, 0), 182, null, null, 0, 0, 182, 182, true, 0, 0, 1768715255649, 1768715257211, 8, 0, null, List(0, 0), null, 10, 10, 0, 0, null)"


In [0]:
%sql
-- Run OPTIMIZE again, this time Z-ordering the data by user_id and product_id.
-- Intent (per the later user_id lookup cells): make filters on these columns
-- cheaper on the partitioned table.
OPTIMIZE ecommerce.silver.silver_events_part
ZORDER BY (user_id, product_id);


path,metrics
,"List(13, 4, List(38075521, 95346450, 4.9423521384615384E7, 13, 642505778), List(96484052, 202499967, 1.71385253E8, 4, 685541012), 182, List(minCubeSize(107374182400), List(0, 0), List(182, 3631957718), 0, List(4, 685541012), 4, null), null, 0, 1, 182, 178, false, 0, 0, 1768715285577, 1768715309663, 8, 4, null, List(0, 0), null, 10, 10, 37430, 0, null)"


In [0]:
import time

# Time a single-user lookup against the original table (baseline).
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() is wall-clock and can jump if the system clock
# is adjusted mid-measurement.
start = time.perf_counter()
spark.sql("""
SELECT *
FROM ecommerce.silver.silver_events
WHERE user_id = 12345
""").count()  # count() is the action that actually executes the query
elapsed = time.perf_counter() - start
print(f"Time before: {elapsed:.2f}s")


Time before: 1.01s


In [0]:
# Time the same single-user lookup against the partitioned table.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() is wall-clock and can jump if the system clock
# is adjusted mid-measurement.
start = time.perf_counter()
spark.sql("""
SELECT *
FROM ecommerce.silver.silver_events_part
WHERE user_id = 12345
""").count()  # count() is the action that actually executes the query
elapsed = time.perf_counter() - start
print(f"Time after: {elapsed:.2f}s")


Time after: 0.76s


In [0]:
# Cache the partitioned table. cache() only *marks* the DataFrame for caching;
# without it the count() below just scans the table and nothing is cached,
# despite what the original "materialize cache" comment implied.
# NOTE(review): DataFrame caching may be unavailable on some compute types
# (e.g. serverless) -- confirm before running there.
cached_df = spark.table("ecommerce.silver.silver_events_part").cache()
cached_df.count()  # action that populates the cache; also displays the row count

109819992

In [0]:
%sql
-- Number of purchase events in the partitioned table.
SELECT COUNT(*)
FROM ecommerce.silver.silver_events_part
WHERE event_type = 'purchase';

COUNT(*)
1659703


In [0]:
import time

# Top-10 brands by purchase revenue, read from the source table silver_events.
# Kept in a variable because the plan-inspection cell reuses the same text.
query_before = """
SELECT
  brand,
  ROUND(SUM(price), 2) AS total_revenue
FROM ecommerce.silver.silver_events
WHERE event_type = 'purchase'
  AND brand IS NOT NULL
GROUP BY brand
ORDER BY total_revenue DESC
LIMIT 10
"""

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() is wall-clock and can jump mid-measurement.
start = time.perf_counter()
spark.sql(query_before).collect()  # collect() forces execution; at most 10 rows return
elapsed = time.perf_counter() - start
print(f"Before Optimization Time: {elapsed:.2f} seconds")


Before Optimization Time: 2.30 seconds


In [0]:
# Print the extended plan for the baseline revenue query.
before_plan_df = spark.sql(query_before)
before_plan_df.explain(extended=True)

== Parsed Logical Plan ==
'GlobalLimit 10
+- 'LocalLimit 10
   +- 'Sort ['total_revenue DESC NULLS LAST], true
      +- 'Aggregate ['brand], ['brand, 'ROUND('SUM('price), 2) AS total_revenue#18564]
         +- 'Filter (('event_type = purchase) AND isnotnull('brand))
            +- 'UnresolvedRelation [ecommerce, silver, silver_events], [], false

== Analyzed Logical Plan ==
brand: string, total_revenue: double
GlobalLimit 10
+- LocalLimit 10
   +- Sort [total_revenue#18564 DESC NULLS LAST], true
      +- Aggregate [brand#18579], [brand#18579, round(sum(price#18580), 2) AS total_revenue#18564]
         +- Filter ((event_type#18575 = purchase) AND isnotnull(brand#18579))
            +- SubqueryAlias ecommerce.silver.silver_events
               +- Relation ecommerce.silver.silver_events[event_time#18574,event_type#18575,product_id#18576,category_id#18577L,category_code#18578,brand#18579,price#18580,user_id#18581,user_session#18582] parquet

== Optimized Logical Plan ==
GlobalLimit 10
+- 

In [0]:
# Same top-10 brand revenue query, read from silver_events_part instead.
# Kept in a variable because the plan-inspection cell reuses the same text.
query_after = """
SELECT
  brand,
  ROUND(SUM(price), 2) AS total_revenue
FROM ecommerce.silver.silver_events_part
WHERE event_type = 'purchase'
  AND brand IS NOT NULL
GROUP BY brand
ORDER BY total_revenue DESC
LIMIT 10
"""

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() is wall-clock and can jump mid-measurement.
start = time.perf_counter()
spark.sql(query_after).collect()  # collect() forces execution; at most 10 rows return
elapsed = time.perf_counter() - start
print(f"After Optimization Time: {elapsed:.2f} seconds")


After Optimization Time: 1.42 seconds


In [0]:
# Print the extended plan for the revenue query on the partitioned table.
after_plan_df = spark.sql(query_after)
after_plan_df.explain(extended=True)

== Parsed Logical Plan ==
'GlobalLimit 10
+- 'LocalLimit 10
   +- 'Sort ['total_revenue DESC NULLS LAST], true
      +- 'Aggregate ['brand], ['brand, 'ROUND('SUM('price), 2) AS total_revenue#18737]
         +- 'Filter (('event_type = purchase) AND isnotnull('brand))
            +- 'UnresolvedRelation [ecommerce, silver, silver_events_part], [], false

== Analyzed Logical Plan ==
brand: string, total_revenue: double
GlobalLimit 10
+- LocalLimit 10
   +- Sort [total_revenue#18737 DESC NULLS LAST], true
      +- Aggregate [brand#18754], [brand#18754, round(sum(price#18755), 2) AS total_revenue#18737]
         +- Filter ((event_type#18750 = purchase) AND isnotnull(brand#18754))
            +- SubqueryAlias ecommerce.silver.silver_events_part
               +- Relation ecommerce.silver.silver_events_part[event_date#18748,event_time#18749,event_type#18750,product_id#18751,category_id#18752L,category_code#18753,brand#18754,price#18755,user_id#18756,user_session#18757] parquet

== Optimized Lo