In [None]:
from pyspark.sql.functions import col, concat_ws, sha2

# Build the customer dimension for the gold layer from the silver tables.
# Each silver table is reduced the same way: keep only rows flagged
# current == 1, drop duplicate keys, then project the columns needed below.
customer = (
    spark.read.table("silver_adventureworks.customer")
    .where(col("current") == 1)
    .dropDuplicates(["CustomerID"])
    .select("CustomerID", "Title", "FirstName", "MiddleName", "LastName",
            "CompanyName", "EmailAddress", "Phone")
)

# CustomerID is renamed to CustomerID2 so the joined frame carries a single,
# unambiguous "CustomerID" column (the one coming from `customer`).
customeraddress = (
    spark.read.table("silver_adventureworks.customeraddress")
    .where(col("current") == 1)
    .dropDuplicates(["CustomerID", "AddressID"])
    .select("CustomerID", "AddressID", "AddressType")
    .withColumnRenamed("CustomerID", "CustomerID2")
)

address = (
    spark.read.table("silver_adventureworks.address")
    .where(col("current") == 1)
    .dropDuplicates(["AddressID"])
    .select("AddressID", "AddressLine1", "AddressLine2", "City",
            "StateProvince", "CountryRegion")
)

# Perform the joins. Both are left joins, so customers without an address row
# are kept and simply get NULL address columns.
join1 = customer.join(customeraddress, customer["CustomerID"] == customeraddress["CustomerID2"], "left")
join2 = join1.join(address, join1["AddressID"] == address["AddressID"], "left")

# Keep only the dimension attributes; the join keys (CustomerID2, AddressID)
# and AddressType are dropped here.
dimension_customer = join2.select(
    "CustomerID", "Title", "FirstName", "MiddleName", "LastName",
    "CompanyName", "EmailAddress", "Phone",
    "AddressLine1", "AddressLine2", "City", "StateProvince", "CountryRegion",
)

# Add a SHA-256 hash over all selected columns as the row ID.
# NOTE(review): concat_ws skips NULL inputs, so two rows that differ only in
# *which* column is NULL can produce the same ID. The formula is left unchanged
# on purpose: IDs are persisted in the gold table and changing it would make
# every existing row look new. Confirm whether that risk is acceptable.
dimension_customer = dimension_customer.withColumn(
    "ID", sha2(concat_ws("||", *dimension_customer.columns), 256)
)

In [None]:
from pyspark.sql.types import *
from delta.tables import*
    
 # Define the schema for the dimension_customer table
# Column layout of the gold dimension, in table order: (column name, Spark type).
dimension_customer_columns = [
    ("ID", StringType()),
    ("CustomerID", IntegerType()),
    ("Title", StringType()),
    ("FirstName", StringType()),
    ("MiddleName", StringType()),
    ("LastName", StringType()),
    ("CompanyName", StringType()),
    ("EmailAddress", StringType()),
    ("Phone", StringType()),
    ("AddressLine1", StringType()),
    ("AddressLine2", StringType()),
    ("City", StringType()),
    ("StateProvince", StringType()),
    ("CountryRegion", StringType()),
]

# Register each column on the builder, then run it; createIfNotExists only
# creates the table when it is not already there.
dimension_customer_builder = DeltaTable.createIfNotExists(spark).tableName(
    "gold_adventureworks.dimension_customer"
)
for column_name, column_type in dimension_customer_columns:
    dimension_customer_builder = dimension_customer_builder.addColumn(column_name, column_type)
dimension_customer_builder.execute()

In [None]:
from delta.tables import *;
    
# Insert the freshly built `dimension_customer` rows into the gold Delta table.
# The table is resolved by the same catalog name it is created with, rather
# than by a hand-built storage path that can drift from the real location.
deltaTable = DeltaTable.forName(spark, "gold_adventureworks.dimension_customer")

# Every target column is filled from the identically named source column.
insert_columns = (
    "ID",
    "CustomerID",
    "Title",
    "FirstName",
    "MiddleName",
    "LastName",
    "CompanyName",
    "EmailAddress",
    "Phone",
    "AddressLine1",
    "AddressLine2",
    "City",
    "StateProvince",
    "CountryRegion",
)

# No whenMatched clause: ID is a SHA-256 over every other column of the row,
# so a source row that matches on ID already carries the same attribute values
# as the stored row and there is nothing to update. (The previous
# whenMatchedUpdate had an empty assignment map.)
(
    deltaTable.alias("gold")
    .merge(
        dimension_customer.alias("updates"),
        "gold.ID = updates.ID",
    )
    .whenNotMatchedInsert(
        # Qualify every value with the source alias so all columns are
        # referenced the same way.
        values={name: f"updates.{name}" for name in insert_columns}
    )
    .execute()
)