Import Required Libraries and Spark SQL Functions

In [0]:
import os

from pyspark.sql import functions as F
from pyspark.sql.functions import *

Define Gold and Silver Layer Base Paths

In [0]:
# Common root of the lakehouse volume; the silver and gold layers are sibling
# folders underneath it.
lakehouse_root = "/Volumes/adventure_works_lakehouse/adventure_works/lakehouse"

# Per-layer base paths that table locations are built from.
silver_base = f"{lakehouse_root}/silver"
gold_base = f"{lakehouse_root}/gold"

Read Silver Sales Data

In [0]:
# Load the sales Delta table stored under the silver base path.
silver_sales_path = f"{silver_base}/silver_adventureworks_sales"
df_sales = spark.read.format("delta").load(silver_sales_path)

Read Product Dimension Data from Gold Layer

In [0]:
# Load the product dimension Delta table stored under the gold base path.
dim_product_path = f"{gold_base}/dim_product"
df_dim_product = spark.read.format("delta").load(dim_product_path)

Transform Sales Data for Fact Table

In [0]:
# Build the sales fact rows: join each sales line to the product dimension,
# derive the monetary measures and an integer date key, then keep only the
# fact-table columns.
#
# Spark functions are referenced through the explicit `F` namespace so that
# `round` is unmistakably the Spark column function and not Python's builtin
# (which the wildcard import of pyspark.sql.functions shadows).
df_fact_sales = (
    df_sales
    # Left join: every sales row is kept even when its ProductKey has no match
    # in df_dim_product. NOTE(review): if ProductPrice/ProductCost come from the
    # dimension, Revenue/Cost/Profit are null for unmatched rows — confirm that
    # this is the intended handling.
    .join(df_dim_product, "ProductKey", "left")
    # Measures are rounded to 4 decimal places.
    .withColumn("Revenue", F.round(F.col("OrderQuantity") * F.col("ProductPrice"), 4))
    .withColumn("Cost", F.round(F.col("OrderQuantity") * F.col("ProductCost"), 4))
    # Profit is derived from the already-rounded Revenue and Cost columns.
    .withColumn("Profit", F.round(F.col("Revenue") - F.col("Cost"), 4))
    # Format OrderDate as yyyyMMdd and cast to int, e.g. 20150101.
    .withColumn("DateKey", F.date_format(F.col("OrderDate"), "yyyyMMdd").cast("int"))
    .select(
        "DateKey",
        "OrderNumber",
        "CustomerKey",
        "ProductKey",
        "TerritoryKey",
        "OrderQuantity",
        "Revenue",
        "Cost",
        "Profit",
    )
)

In [0]:
# Show 7 rows of the fact DataFrame as a quick visual check of the derived columns.
fact_sales_preview = df_fact_sales.limit(7)
fact_sales_preview.display()

DateKey,OrderNumber,CustomerKey,ProductKey,TerritoryKey,OrderQuantity,Revenue,Cost,Profit
20150101,TO45080,14657,332,1,1,699.0982,413.1463,285.9519
20150101,TO45079,29255,312,4,1,3578.27,2171.2942,1406.9758
20150101,TO45082,11455,350,9,1,3374.99,1898.0944,1476.8956
20150101,TO45081,26782,338,6,1,699.0982,413.1463,285.9519
20150102,TO45083,14947,312,10,1,3578.27,2171.2942,1406.9758
20150102,TO45084,29143,310,4,1,3578.27,2171.2942,1406.9758
20150102,TO45086,18747,314,9,1,3578.27,2171.2942,1406.9758


Write Sales Fact Table to Gold Layer


In [0]:
# Write the fact DataFrame as a Delta table under the gold base path.
# "overwrite" mode replaces whatever data already exists at the target path.
fact_sales_path = f"{gold_base}/fact_sales"
(
    df_fact_sales.write
    .format("delta")
    .mode("overwrite")
    .save(fact_sales_path)
)