In [1]:
# Set up the session for V-Order writing
"spark.sql.parquet.vorder.enabled", "true"
"spark.microsoft.delta.optimizeWrite.enabled", "true"
"spark.microsoft.delta.optimizeWrite.binSize", "1073741824"

StatementMeta(, 1aba073e-b79f-4ca7-a370-280e2f66f6f8, 3, Finished, Available, Finished)

('spark.microsoft.delta.optimizeWrite.binSize', '1073741824')

In [2]:
from pyspark.sql.functions import *

# Load data to the dataframes
product = spark.read.table("silver.adventureworks.hist_product") \
.where(col("current") == True)
product = product.dropDuplicates(["ProductID"])
product = product[["ProductID", "Name", "ProductNumber", \
"Color", "Size", "Weight", "ProductCategoryID", "ProductModelID"]]
category = spark.read.table("silver.adventureworks.hist_productcategory") \
.where(col("current") == True)
category = category.dropDuplicates(["ProductCategoryID"])
category = category[["ProductCategoryID", "Name"]]
category = category.withColumnRenamed("Name", "CategoryName")
model = spark.read.table("silver.adventureworks.hist_productmodel") \
.where(col("current") == True)
model = model.dropDuplicates(["ProductModelID"])
model = model[["ProductModelID", "Name", "CatalogDescription"]]
model = model.withColumnRenamed("Name", "ProductModelName")

# Perform the joins
dimension_product = product.join(category, on="ProductCategoryID", how="left")
dimension_product = dimension_product.join(model, on="ProductModelID", how="left")

# Select only the relevant columns
dimension_product = dimension_product[["ProductID", "Name", "ProductNumber", \
"Color", "Size", "Weight" , "CategoryName" , "ProductModelName"]]

# Add hash code using all selected columns
dimension_product = dimension_product.withColumn("ID", \
sha2(concat_ws("||", *dimension_product.columns), 256))

StatementMeta(, 1aba073e-b79f-4ca7-a370-280e2f66f6f8, 4, Finished, Available, Finished)

In [3]:
from delta.tables import *

deltaTable = DeltaTable.forPath(spark, \
'Tables/adventureworks/dimension_product')
       
deltaTable.alias('silver') \
  .merge(
    dimension_product.alias('updates'),
    'silver.ID = updates.ID'
  ).whenMatchedUpdate(set =
    {
      "current_flag": lit("1"),
      "current_date": current_date(),
      "end_date": """to_date('9999-12-31', 'yyyy-MM-dd')"""
    }
  ).whenNotMatchedInsert(values =
    {
      "ID": "updates.ID",
      "ProductID": "updates.ProductID",
      "ProductNumber": "updates.ProductNumber",
      "Color": "updates.Color",
      "Size": "updates.Size",
      "Weight": "updates.Weight",
      "CategoryName": "updates.CategoryName",
      "ProductModelName": "updates.ProductModelName",
      "current_flag": lit("1"),
      "current_date": current_date(),
      "end_date": """to_date('9999-12-31', 'yyyy-MM-dd')"""
    }
  ).whenNotMatchedBySourceUpdate(set =
    {
      "current_flag": lit("0"),
      "end_date": current_date()
    }
  ).execute()

StatementMeta(, 1aba073e-b79f-4ca7-a370-280e2f66f6f8, 5, Finished, Available, Finished)