In [0]:
#Loading Silver
from pyspark.sql import functions as F
spark.sql("USE retail_lakehouse")
silver = spark.table("retail_lakehouse.silver_transactions_clean")
print("Silver clean rows:", silver.count())
silver_enriched = (
    silver
    .withColumn("transaction_date", F.to_date("transaction_ts"))
    .withColumn("revenue", F.col("qty") * F.col("unit_price"))
)
display(
    silver_enriched.select(
        "transaction_id",
        "transaction_ts",
        "transaction_date",
        "qty",
        "unit_price",
        "revenue"
    ).limit(10)
)

Silver clean rows: 485252


transaction_id,transaction_ts,transaction_date,qty,unit_price,revenue
TXN_000000000010,2026-01-09T08:38:48.481383Z,2026-01-09,3,101.49,304.47
TXN_000000000013,2026-01-07T15:54:07.481383Z,2026-01-07,1,39.9,39.9
TXN_000000000015,2026-01-06T12:21:24.481383Z,2026-01-06,3,192.61,577.83
TXN_000000000019,2025-12-31T09:11:00.481383Z,2025-12-31,4,204.37,817.48
TXN_000000000021,2025-12-27T22:38:36.481383Z,2025-12-27,3,110.65,331.95000000000005
TXN_000000000022,2026-01-01T04:01:40.481383Z,2026-01-01,5,139.37,696.85
TXN_000000000024,2025-12-29T13:22:10.481383Z,2025-12-29,5,34.81,174.05
TXN_000000000027,2026-01-02T20:12:35.481383Z,2026-01-02,4,47.51,190.04
TXN_000000000038,2025-12-16T03:39:38.481383Z,2025-12-16,2,67.59,135.18
TXN_000000000046,2025-12-30T02:58:04.481383Z,2025-12-30,4,218.3,873.2


In [0]:
#Daily Sales KPIs
gold_sales_daily = (
    silver_enriched
    .groupBy("transaction_date")
    .agg(
        F.countDistinct("transaction_id").alias("orders"),
        F.sum("qty").alias("units_sold"),
        F.sum("revenue").alias("total_revenue"),
        F.avg("revenue").alias("avg_order_value")
    )
    .orderBy("transaction_date")
)

display(gold_sales_daily.limit(20))

transaction_date,orders,units_sold,total_revenue,avg_order_value
2025-01-11,15,47,7058.780000000001,470.5853333333334
2025-01-12,13,39,4390.7,337.74615384615385
2025-01-13,16,56,5565.68,347.855
2025-01-14,26,74,7741.44,297.7476923076923
2025-01-15,15,46,7045.87,469.7246666666667
2025-01-16,13,41,5768.81,443.75461538461536
2025-01-17,16,48,5093.58,318.34875
2025-01-18,23,78,8736.02,379.82695652173913
2025-01-19,21,65,8635.22,411.20095238095234
2025-01-20,24,67,9402.5,391.7708333333333


In [0]:
#Writing Gold 1 Table
(
    gold_sales_daily
    .write
    .format("delta")
    .mode("overwrite")   # first run only
    .saveAsTable("retail_lakehouse.gold_sales_daily")
)

print("✅ Written: gold_sales_daily")

✅ Written: gold_sales_daily


In [0]:
#Daily Sales by Store
gold_sales_by_store_daily = (
    silver_enriched
    .groupBy("transaction_date", "store_id")
    .agg(
        F.countDistinct("transaction_id").alias("orders"),
        F.sum("qty").alias("units_sold"),
        F.sum("revenue").alias("total_revenue"),
        F.avg("revenue").alias("avg_order_value")
    )
)

display(gold_sales_by_store_daily.limit(20))

transaction_date,store_id,orders,units_sold,total_revenue,avg_order_value
2026-01-06,S_0018,304,928,113713.46,374.0574342105263
2025-12-25,S_0047,326,945,114501.77999999998,351.23245398773
2025-08-26,S_0007,2,3,383.25,191.625
2025-07-30,S_0045,1,1,180.96,180.96
2025-04-01,S_0026,1,1,169.68,169.68
2025-08-06,S_0009,2,7,1389.79,694.895
2025-08-08,S_0005,1,3,354.09000000000003,354.09000000000003
2025-09-03,S_0019,2,6,545.08,272.54
2025-12-28,S_0005,330,1023,130733.7,396.1627272727273
2025-12-23,S_0033,331,985,116360.87999999996,351.5434441087612


In [0]:
#Writing Gold 2 Table
(
    gold_sales_by_store_daily
    .write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("retail_lakehouse.gold_sales_by_store_daily")
)

print("✅ Written: gold_sales_by_store_daily")

✅ Written: gold_sales_by_store_daily


In [0]:
#Daily Sales by Product
gold_sales_by_product_daily = (
    silver_enriched
    .groupBy("transaction_date", "product_id")
    .agg(
        F.countDistinct("transaction_id").alias("orders"),
        F.sum("qty").alias("units_sold"),
        F.sum("revenue").alias("total_revenue"),
        F.avg("revenue").alias("avg_order_value")
    )
)

display(gold_sales_by_product_daily.limit(20))

transaction_date,product_id,orders,units_sold,total_revenue,avg_order_value
2026-01-05,P_001810,8,17,3127.5,390.9375
2026-01-07,P_001097,10,31,4283.36,428.336
2025-12-26,P_000849,8,23,3624.4,453.05
2025-12-13,P_001971,5,13,1854.86,370.972
2026-01-02,P_000220,15,46,4455.78,297.052
2025-12-13,P_000519,10,26,1401.43,140.143
2025-12-20,P_001344,6,20,2455.29,409.215
2025-12-14,P_000416,15,37,5169.59,344.6393333333333
2025-12-16,P_001003,14,40,4334.39,309.5992857142857
2026-01-06,P_001247,13,35,4680.47,360.03615384615387


In [0]:
#Writing Gold 3 Table
(
    gold_sales_by_product_daily
    .write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("retail_lakehouse.gold_sales_by_product_daily")
)

print("✅ Written: gold_sales_by_product_daily")

✅ Written: gold_sales_by_product_daily


In [0]:
#Validating Gold Layer
print("gold_sales_daily rows:",
      spark.table("retail_lakehouse.gold_sales_daily").count())

print("gold_sales_by_store_daily rows:",
      spark.table("retail_lakehouse.gold_sales_by_store_daily").count())

print("gold_sales_by_product_daily rows:",
      spark.table("retail_lakehouse.gold_sales_by_product_daily").count())


gold_sales_daily rows: 278
gold_sales_by_store_daily rows: 5664
gold_sales_by_product_daily rows: 65760
