In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from delta.tables import *

**Define paths**

In [0]:
# abfss:// storage locations used by this notebook: the silver customers table
# that is read, and the gold dim_customer table that is created and merged into.
silver_path = "abfss://silver@stretailenvdev.dfs.core.windows.net/s_Customers"
gold_path = "abfss://gold@stretailenvdev.dfs.core.windows.net/dim_customer"

**Create the Delta Lake dim_customer table if it does not exist**

In [0]:
# DDL for the gold customer dimension. Customer_key is an identity column that
# Delta generates on insert; StartDate / EndDate / IsActive carry the SCD2
# row-version tracking. IF NOT EXISTS makes the statement safe to re-run.
dim_customer_ddl = f"""
CREATE TABLE IF NOT EXISTS retail_cata.gold.dim_customer(
    Customer_key BIGINT GENERATED ALWAYS AS IDENTITY(START WITH 1 INCREMENT BY 1),
    CustomerID STRING,
    FirstName STRING,
    LastName STRING,
    Email STRING,
    Phone STRING,
    Address STRING,
    City STRING,
    State STRING,
    Pincode STRING,
    DateOfBirth DATE,
    RegistrationDate TIMESTAMP,
    AgeAtRegistration INT,
    StartDate TIMESTAMP,
    EndDate TIMESTAMP,
    IsActive BOOLEAN
) USING DELTA
LOCATION '{gold_path}'
"""

# Run the DDL; the table's data lives at gold_path.
spark.sql(dim_customer_ddl)

**Load Silver Data**

In [0]:
# Read the silver customers Delta table from its storage path.
silver_reader = spark.read.format("delta")
silver_df = silver_reader.load(silver_path)

**Derive AgeAtRegistration and add the SCD2 tracking columns**

In [0]:
# Stage the silver customers for the SCD2 merge.
#
# AgeAtRegistration is the number of whole years between DateOfBirth and
# RegistrationDate. It is computed from calendar months rather than
# days / 365.25: the day-based form under-counts on exact anniversaries when the
# span contains no leap day (e.g. 2001-01-01 -> 2002-01-01 is 365 days, and
# 365 / 365.25 floors to 0 instead of 1).
registration_day = to_date(col("RegistrationDate"))
birth_day = to_date(col("DateOfBirth"))

stg_customer = (
    silver_df
    .withColumn(
        "AgeAtRegistration",
        # months_between returns a whole number of months when both dates fall on
        # the same day of month, so the age increments exactly on the birthday.
        # Cast to INT to match the gold table's AgeAtRegistration column.
        floor(months_between(registration_day, birth_day) / 12).cast("int"),
    )
    # StartDate / EndDate are TIMESTAMP in the gold table; cast here so the staged
    # schema already matches it (StartDate is midnight of the load date).
    .withColumn("StartDate", current_date().cast("timestamp"))
    .withColumn("EndDate", lit(None).cast("timestamp"))
    # Every staged row is a candidate current version.
    .withColumn("IsActive", lit(True))
)

In [0]:
# Render the staged customers with the notebook's rich table display for a visual check.
display(stg_customer)

**Load Gold Delta Table**

In [0]:
# Bind dim_customer to the Delta table stored at gold_path. If no Delta table is
# present at that path yet, seed it from the staged rows first.
gold_table_missing = not DeltaTable.isDeltaTable(spark, gold_path)

if gold_table_missing:
    # First-time load
    # NOTE(review): this writes stg_customer's schema as-is; confirm that is the
    # intended layout when the path has not been created beforehand.
    stg_customer.write.format("delta").mode("append").save(gold_path)

dim_customer = DeltaTable.forPath(spark, gold_path)


**Merge for SCD2**

In [0]:
# SCD Type 2 merge of the staged customers into dim_customer.
#
# A single MERGE can either update or insert for a given source row, never both.
# Matching only on CustomerID would therefore expire a changed customer's active
# row without inserting the new version in the same run. To do both at once, each
# changed customer is staged twice:
#   * once with _merge_key = CustomerID  -> matches and expires the active row
#   * once with _merge_key = NULL        -> matches nothing, so it is inserted
# New and unchanged customers are staged once, with _merge_key = CustomerID.

# Attributes whose change opens a new row version.
tracked_columns = [
    "FirstName",
    "LastName",
    "Email",
    "Phone",
    "Address",
    "City",
    "State",
    "Pincode",
    "DateOfBirth",
    "RegistrationDate",
]

# Null-safe inequality: a plain `<>` evaluates to NULL when either side is NULL,
# so a change from or to NULL would go undetected.
change_condition = " OR ".join(
    f"NOT (t.{name} <=> s.{name})" for name in tracked_columns
)

# Currently active dimension rows, used only to find which staged rows changed.
active_rows = dim_customer.toDF().where("IsActive = TRUE").alias("t")

# Type the NULL merge key like CustomerID so the union below lines up.
merge_key_type = stg_customer.schema["CustomerID"].dataType

# Staged rows whose active dimension row differs: these are the new versions.
changed_customers = (
    stg_customer.alias("s")
    .join(active_rows, col("s.CustomerID") == col("t.CustomerID"), "inner")
    .where(change_condition)
    .select(lit(None).cast(merge_key_type).alias("_merge_key"), col("s.*"))
)

staged_updates = changed_customers.unionByName(
    stg_customer.select(col("CustomerID").alias("_merge_key"), "*")
)

(
    dim_customer.alias("t")
    .merge(
        staged_updates.alias("s"),
        "t.CustomerID = s._merge_key AND t.IsActive = TRUE"
    )
    # Close the active version of a customer whose tracked attributes changed.
    .whenMatchedUpdate(
        condition=change_condition,
        set={
            "EndDate": "current_date()",
            "IsActive": "False"
        }
    )
    # Insert new customers and the new versions of changed customers.
    # Customer_key is omitted so the identity column generates it.
    .whenNotMatchedInsert(
        values={
            "CustomerID": "s.CustomerID",
            "FirstName": "s.FirstName",
            "LastName": "s.LastName",
            "Email": "s.Email",
            "Phone": "s.Phone",
            "Address": "s.Address",
            "City": "s.City",
            "State": "s.State",
            "Pincode": "s.Pincode",
            "DateOfBirth": "s.DateOfBirth",
            "RegistrationDate": "s.RegistrationDate",
            "AgeAtRegistration": "s.AgeAtRegistration",
            "StartDate": "s.StartDate",
            "EndDate": "s.EndDate",
            "IsActive": "s.IsActive"
        }
    )
    .execute()
)


In [0]:
%sql
-- Spot-check after the merge: the ten dim_customer rows with the highest Customer_key values.
SELECT *
FROM retail_cata.gold.dim_customer
ORDER BY Customer_key DESC
LIMIT 10;