In [None]:
from pyspark.sql.functions import col

# Build the gold-layer product dimension from the current rows of three silver
# tables: product, productcategory and productmodel.
from pyspark.sql.functions import coalesce, concat_ws, lit, sha2

# Stand-in that is hashed in place of NULL so a missing value still occupies
# its own position in the key (see the hash step at the end of this cell).
NULL_HASH_TOKEN = "<NULL>"


def _read_current(table_name, key_column, columns):
    """Read a silver table and keep only what the dimension needs.

    Args:
        table_name: Catalog name of the silver table to read.
        key_column: Column that must be unique in the result; extra rows
            sharing a key are dropped (which one survives is not specified).
        columns: Column names to keep, in output order.

    Returns:
        DataFrame of the rows where ``current == 1``, one row per key,
        restricted to ``columns``.
    """
    return (
        spark.read.table(table_name)
        .where(col("current") == 1)
        .dropDuplicates([key_column])
        .select(*columns)
    )


product = _read_current(
    "silver_adventureworks.product",
    "ProductID",
    ["ProductID", "Name", "ProductNumber", "Color", "Size", "Weight", "ProductCategoryID", "ProductModelID"],
)

# Both lookup tables expose a generic "Name" column; rename it so it cannot
# clash with the product's own "Name" after the joins.
productcategory = _read_current(
    "silver_adventureworks.productcategory",
    "ProductCategoryID",
    ["ProductCategoryID", "Name"],
).withColumnRenamed("Name", "CategoryName")

productmodel = _read_current(
    "silver_adventureworks.productmodel",
    "ProductModelID",
    ["ProductModelID", "Name", "CatalogDescription"],
).withColumnRenamed("Name", "ProductModelName")

# Left joins: a product without a category or model is kept, with NULLs in the
# looked-up columns.
join1 = product.join(productcategory, product["ProductCategoryID"] == productcategory["ProductCategoryID"], "left")
join2 = join1.join(productmodel, join1["ProductModelID"] == productmodel["ProductModelID"], "left")

dimension_product = join2.select(
    "ProductID", "Name", "ProductNumber", "Color", "Size", "Weight", "CategoryName", "ProductModelName"
)

# Surrogate key: SHA-256 over every selected column, in column order.
# concat_ws silently skips NULL inputs, which would let two different rows
# (e.g. Size=NULL, Weight='58' vs. Size='58', Weight=NULL) collapse to the same
# string and therefore the same ID. Casting to string and substituting a token
# for NULL keeps each column in its own slot.
# NOTE: rows that contain NULLs get a different ID than a plain
# concat_ws(...) over the raw columns would produce, so rows already loaded
# with such a key will not match these IDs.
hash_inputs = [
    coalesce(col(column_name).cast("string"), lit(NULL_HASH_TOKEN))
    for column_name in dimension_product.columns
]
dimension_product = dimension_product.withColumn("ID", sha2(concat_ws("||", *hash_inputs), 256))

In [None]:
from pyspark.sql.types import *
from delta.tables import*
    
 # Define the schema for the dimension_product table
# Columns of the gold product dimension as (name, Spark type), in table order.
dimension_product_columns = [
    ("ID", StringType()),
    ("ProductID", IntegerType()),
    ("ProductNumber", StringType()),
    ("Color", StringType()),
    ("Size", StringType()),
    ("Weight", StringType()),
    ("CategoryName", StringType()),
    ("ProductModelName", StringType()),
]

# Start a create-if-not-exists builder for the table, register each column in
# turn, then run it.
table_builder = DeltaTable.createIfNotExists(spark).tableName("gold_adventureworks.dimension_product")
for column_name, column_type in dimension_product_columns:
    table_builder = table_builder.addColumn(column_name, column_type)
table_builder.execute()

In [None]:
from delta.tables import *

# Open the gold dimension through the catalog, using the same name the table
# is created and queried under elsewhere in this notebook, instead of a
# relative storage path.
deltaTable = DeltaTable.forName(spark, "gold_adventureworks.dimension_product")

# Insert-only merge keyed on the hash ID. The ID is derived from the row's
# attribute values, so a matching ID means the row is already present and no
# update clause is needed; only IDs not yet in the table are inserted.
# NOTE(review): dimension_product also carries a "Name" column that feeds the
# hash but has no counterpart in this insert -- confirm whether the gold table
# is meant to store it.
(
    deltaTable.alias("gold")
    .merge(
        dimension_product.alias("updates"),
        "gold.ID = updates.ID",
    )
    .whenNotMatchedInsert(
        values={
            "ID": "updates.ID",
            "ProductID": "updates.ProductID",
            "ProductNumber": "updates.ProductNumber",
            "Color": "updates.Color",
            "Size": "updates.Size",
            "Weight": "updates.Weight",
            "CategoryName": "updates.CategoryName",
            "ProductModelName": "updates.ProductModelName",
        }
    )
    .execute()
)

In [None]:
# Preview up to 1000 rows of the gold product dimension.
preview_query = "SELECT * FROM Gold.gold_adventureworks.dimension_product LIMIT 1000"
df = spark.sql(preview_query)
display(df)