### Daily Summary Table

In [0]:
from pyspark.sql.types import StringType, IntegerType, DateType, BooleanType
import pyspark.sql.functions as F
from delta.tables import DeltaTable

In [0]:
# Catalog that qualifies every table reference in this notebook
# (tables are addressed as `<catalog_name>.gold.<table>`).
catalog_name: str = "ecommerce"

In [0]:
# Source fact table that is aggregated, and the summary table that is maintained.
source_table_name: str = "gld_fact_order_items"
table_name: str = "gld_fact_daily_orders_summary"

# Number of days before the latest transaction date that an incremental run
# recomputes.
days_cutoff: int = 30

In [0]:
# Look up the most recent transaction date in the source fact table.
# An aggregate without GROUP BY always yields exactly one row, so taking the
# first row is safe; max_date is None when no transaction_date value exists.
latest_date_query = f"""
    SELECT MAX(transaction_date) AS max_date 
    FROM {catalog_name}.gold.{source_table_name}
"""
max_date_row = spark.sql(latest_date_query).first()

max_date = max_date_row["max_date"]
print(max_date)

2025-09-02


In [0]:
# Pick the row filter applied to the source table by the summary query:
# - summary table missing      -> first run, aggregate the full history;
# - max_date is None           -> the source has no transaction dates, so there
#                                 is nothing to refresh. "1=0" selects no rows
#                                 without rendering date('None'), which is an
#                                 invalid cast (an error under ANSI SQL mode);
# - otherwise                  -> recompute only the trailing `days_cutoff`
#                                 days, counted back from max_date.
if not spark.catalog.tableExists(f"{catalog_name}.gold.{table_name}"):
    where_clause = "1=1"
elif max_date is None:
    where_clause = "1=0"
else:
    where_clause = f"transaction_date >= date_sub(date('{max_date}'), {days_cutoff})"

In [0]:
# Aggregate the source fact table into one row per (date_id, currency),
# restricted by `where_clause`.
# The GROUP BY names the source column `unit_price_currency` rather than the
# SELECT alias `currency`, so the query does not depend on Spark resolving
# select-list aliases inside GROUP BY.
# ORDER BY is kept so that previews of summary_df list the newest date_id first.
summary_query = f"""
SELECT
    date_id,
    unit_price_currency AS currency,
    SUM(quantity) AS total_quantity,
    SUM(gross_amount) AS total_gross_amount,
    SUM(discount_amount) AS total_discount_amount,
    SUM(tax_amount) AS total_tax_amount,
    SUM(net_amount) AS total_amount
FROM {catalog_name}.gold.{source_table_name}
WHERE {where_clause}
GROUP BY date_id, unit_price_currency
ORDER BY date_id DESC
"""
summary_df = spark.sql(summary_query)

In [0]:
# Preview the first 7 rows of the summary (ordered by the query's ORDER BY).
summary_df.show(n=7)

+--------+--------+--------------+------------------+---------------------+----------------+------------+
| date_id|currency|total_quantity|total_gross_amount|total_discount_amount|total_tax_amount|total_amount|
+--------+--------+--------------+------------------+---------------------+----------------+------------+
|20250902|     SGD|           113|           30278.0|                 2580|            3171|     30869.0|
|20250902|     INR|          1243|       1.8592726E7|              1665699|         1868732| 1.8795759E7|
|20250902|     CAD|            82|           23052.0|                 2037|            2692|     23707.0|
|20250902|     USD|           292|           41426.0|                 3140|            4245|     42531.0|
|20250902|     AUD|            81|           26549.0|                 1302|            2310|     27557.0|
|20250902|     AED|           164|           92402.0|                10061|            9417|     91758.0|
|20250902|     GBP|           216|           2

In [0]:

# Sanity check: smallest and largest date_id present in the current summary batch.
date_id_range = summary_df.agg(
    F.min("date_id").alias("min_date"),
    F.max("date_id").alias("max_date"),
)
date_id_range.show()

+--------+--------+
|min_date|max_date|
+--------+--------+
|20240101|20250902|
+--------+--------+



In [0]:
# Maintain the daily summary Delta table.
# - First run (table does not exist): create it from summary_df and switch on
#   automatic clustering.
# - Later runs: merge summary_df into the table, updating rows whose
#   (date_id, currency) key already exists and inserting the ones that do not.
target_table = f"{catalog_name}.gold.{table_name}"

if not spark.catalog.tableExists(target_table):
    summary_df.write.format("delta").mode("overwrite").saveAsTable(target_table)
    spark.sql(f"ALTER TABLE {target_table} CLUSTER BY AUTO;")
else:
    # Null-safe equality (<=>): with plain `=`, a group whose date_id or
    # currency is NULL never matches its existing row, so every run would
    # insert another copy of it. For non-NULL keys `<=>` behaves like `=`.
    merge_condition = (
        "gold_table.date_id <=> data_snapshot.date_id "
        "AND gold_table.currency <=> data_snapshot.currency"
    )
    delta_table = DeltaTable.forName(spark, target_table)
    (
        delta_table.alias("gold_table")
        .merge(summary_df.alias("data_snapshot"), merge_condition)
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
        .execute()
    )