In [None]:
# Set up the session for V-Order writing.
# Each key/value pair has to go through spark.conf.set(); a bare
# `"key", "value"` line only builds a tuple that is thrown away, so the
# session would silently keep its previous settings.
spark.conf.set("spark.sql.parquet.vorder.enabled", "true")
spark.conf.set("spark.microsoft.delta.optimizeWrite.enabled", "true")
# 1073741824 == 1024 ** 3 (1 GiB if the unit is bytes -- TODO confirm the unit)
spark.conf.set("spark.microsoft.delta.optimizeWrite.binSize", "1073741824")

In [None]:
from pyspark.sql.functions import *

# Starting point for the gold layer: rows of the silver customer history
# table flagged as current, reduced to a single row per CustomerID.
customer = (
    spark.read.table("silver.adventureworks.hist_customer")
    .filter(col("current") == True)
    .dropDuplicates(["CustomerID"])
)

# Keep only the attributes that belong in the customer dimension.
dimension_customer = customer.select(
    "CustomerID",
    "Title",
    "FirstName",
    "MiddleName",
    "LastName",
    "CompanyName",
    "EmailAddress",
    "Phone",
)

# Derive the row ID as a SHA-256 hash over every selected column, joined
# with "||" in the column order above.
# NOTE(review): concat_ws skips NULL inputs, so two rows that differ only in
# which column is NULL can produce the same hash -- confirm this is acceptable.
dimension_customer = dimension_customer.withColumn(
    "ID",
    sha2(concat_ws("||", *dimension_customer.columns), 256),
)

In [None]:
from delta.tables import *

# Merge the freshly hashed customer rows into the gold dimension table.
# Rows are matched on the hash ID:
#   * matched               -> row is (re)flagged as current with an open end date
#   * not matched by target -> row is inserted as a new current version
#   * not matched by source -> row is flagged as no longer current and closed today
deltaTable = DeltaTable.forPath(
    spark,
    'Tables/adventureworks/dimension_customer',
)

(
    deltaTable.alias('gold')
    .merge(
        dimension_customer.alias('updates'),
        'gold.ID = updates.ID',
    )
    # NOTE(review): this also overwrites "current_date" on every run for rows
    # that did not change -- confirm that is the intended meaning of the column.
    .whenMatchedUpdate(
        set={
            "current_flag": lit("1"),
            "current_date": current_date(),
            "end_date": """to_date('9999-12-31', 'yyyy-MM-dd')""",
        }
    )
    .whenNotMatchedInsert(
        values={
            "ID": "updates.ID",
            "CustomerID": "updates.CustomerID",
            "Title": "updates.Title",
            "FirstName": "updates.FirstName",
            "MiddleName": "updates.MiddleName",
            "LastName": "updates.LastName",
            "CompanyName": "updates.CompanyName",
            "EmailAddress": "updates.EmailAddress",
            "Phone": "updates.Phone",
            "current_flag": lit("1"),
            "current_date": current_date(),
            "end_date": """to_date('9999-12-31', 'yyyy-MM-dd')""",
        }
    )
    # Only close rows that are still open. Without this condition every run
    # would rewrite end_date on versions that were already closed earlier,
    # destroying their real end date.
    .whenNotMatchedBySourceUpdate(
        condition="gold.current_flag = '1'",
        set={
            "current_flag": lit("0"),
            "end_date": current_date(),
        },
    )
    .execute()
)