Import Required Libraries and Spark SQL Functions

In [0]:
import os

from pyspark.sql import functions as F
from pyspark.sql.functions import *

Define Gold and Silver Layer Base Paths

In [0]:
# Base paths of the two lakehouse layers used by this notebook.
# gold_base is the destination root for the curated output.
gold_base = "/Volumes/adventure_works_lakehouse/adventure_works/lakehouse/gold"
# silver_base is the root the source Delta tables are loaded from.
silver_base = "/Volumes/adventure_works_lakehouse/adventure_works/lakehouse/silver"

Read Silver Product, Subcategory, and Category Data

In [0]:
def _load_silver_table(table_name):
    """Return the Delta table called `table_name` stored under `silver_base`."""
    return spark.read.format("delta").load(f"{silver_base}/{table_name}")


# Silver-layer products, product subcategories and product categories.
df_products = _load_silver_table("silver_adventureworks_products")
df_subcats = _load_silver_table("silver_adventureworks_product_subcategories")
df_cats = _load_silver_table("silver_adventureworks_product_categories")

Transform and Prepare Product Dimension Data (join products + subcategories + categories)

In [0]:
# Build the product dimension by enriching each product with its subcategory
# and category attributes and adding a derived Margin column.
# Both joins are LEFT joins, so a product without a matching subcategory or
# category row is kept and simply gets nulls for the columns of that table.
df_dim_product = (
    df_products
    .join(df_subcats, on="ProductSubcategoryKey", how="left")
    .join(df_cats, on="ProductCategoryKey", how="left")
    # Margin = price - cost, rounded to 4 decimal places.
    # F.round / F.col are referenced through the explicit `F` namespace so the
    # expression never resolves to Python's builtin round(), which cannot
    # operate on a Spark Column.
    .withColumn(
        "Margin",
        F.round(F.col("ProductPrice") - F.col("ProductCost"), 4),
    )
    # Keep only the attributes exposed by the dimension, in output order.
    .select(
        "ProductKey",
        "ProductName",
        "CategoryName",
        "SubcategoryName",
        "ProductColor",
        "ProductSize",
        "ProductStyle",
        "ProductCost",
        "ProductPrice",
        "Margin",
    )
)

In [0]:
# Render a small sample (first 7 rows) of the dimension for a visual check.
dim_product_preview = df_dim_product.limit(7)
dim_product_preview.display()

ProductKey,ProductName,CategoryName,SubcategoryName,ProductColor,ProductSize,ProductStyle,ProductCost,ProductPrice,Margin
214,Sport-100,Accessories,Helmets,Red,0,0,13.0863,34.99,21.9037
215,Sport-100,Accessories,Helmets,Black,0,0,12.0278,33.6442,21.6164
218,Mountain,Clothing,Socks,White,M,U,3.3963,9.5,6.1037
219,Mountain,Clothing,Socks,White,L,U,3.3963,9.5,6.1037
220,Sport-100,Accessories,Helmets,Blue,0,0,12.0278,33.6442,21.6164
223,AWC,Clothing,Caps,Multi,0,U,5.7052,8.6442,2.939
226,Long-Sleeve,Clothing,Jerseys,Multi,S,U,31.7244,48.0673,16.3429


Write Data into Gold Layer

In [0]:
# Persist the dimension as a Delta table under the gold base path.
# mode("overwrite") replaces whatever was previously stored at the target.
dim_product_target = f"{gold_base}/dim_product"
(
    df_dim_product.write
    .format("delta")
    .mode("overwrite")
    .save(dim_product_target)
)
