### What is a Slowly Changing Dimension?

![Slowly Changing Dimension (SCD) overview](Images/57/57%20SCD%20-%20Slowly%20Changing%20Dimension.jpg)

### Example

![Example screenshot 1](Images/57/57%20Example%20Screenshot%201.jpg)

![Example screenshot 2](Images/57/57%20Example%20Screenshot%202.jpg)



### SCD Type 2 Demo

In [0]:
%sql
-- Target dimension table for the demo. CREATE OR REPLACE resets it on every run,
-- so the notebook can be re-executed from the top.
CREATE OR REPLACE TABLE scd2Demo (
  pk1           INT,        -- key column, part 1 (used in the join / merge key)
  pk2           STRING,     -- key column, part 2 (used in the join / merge key)
  dim1          INT,        -- tracked attribute
  dim2          INT,        -- tracked attribute
  dim3          INT,        -- tracked attribute
  dim4          INT,        -- tracked attribute
  active_status STRING,     -- 'Y' for the current version of a key, 'N' once expired
  start_date    TIMESTAMP,  -- when this version was written
  end_date      TIMESTAMP   -- when this version was expired; 9999-12-31 while active
)
USING DELTA

In [0]:
%sql
-- Seed the target with three active rows. Each gets the same far-future
-- end_date ('9999-12-31') that the merge step assigns to active versions.
INSERT INTO scd2Demo
VALUES (111, 'Unit1', 200, 500, 800, 400, 'Y', current_timestamp(), '9999-12-31');

INSERT INTO scd2Demo
VALUES (222, 'Unit2', 900, NULL, 700, 100, 'Y', current_timestamp(), '9999-12-31');

INSERT INTO scd2Demo
VALUES (333, 'Unit3', 300, 900, 250, 650, 'Y', current_timestamp(), '9999-12-31');

In [0]:
from delta import *
# Bind a DeltaTable handle to the demo table and expose it as a DataFrame.
# The table name is left unqualified so it resolves against the current schema,
# the same way the %sql cells in this notebook create, seed and query it.
# Hard-coding "default." only works when the current schema happens to be `default`.
targetTable = DeltaTable.forName(spark, "scd2Demo")
targetDF = targetTable.toDF()
display(targetDF)

In [0]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Schema of the incoming (source) batch: the two key columns followed by the
# four tracked attributes. Every field is declared nullable.
schema = (
    StructType()
    .add('pk1', IntegerType(), True)
    .add('pk2', StringType(), True)
    .add('dim1', IntegerType(), True)
    .add('dim2', IntegerType(), True)
    .add('dim3', IntegerType(), True)
    .add('dim4', IntegerType(), True)
)

In [0]:
# Incoming batch, one row per scenario relative to the seeded target rows:
#   111 / Unit1 -> same attribute values as the seeded row
#   222 / Unit2 -> attribute values differ from the seeded row
#   444 / Unit4 -> key that was not seeded
data = [
    (111, 'Unit1', 200, 500, 800, 400),
    (222, 'Unit2', 800, 1300, 800, 500),
    (444, 'Unit4', 100, None, 700, 300),
]

sourceDF = spark.createDataFrame(data, schema)
display(sourceDF)

In [0]:
# Left-join the incoming batch to the *active* target rows on the two key columns.
# Every source row is kept; the target's columns come along under a "target_"
# prefix and are NULL when no active target row matched.
join_condition = (
    (sourceDF.pk1 == targetDF.pk1)
    & (sourceDF.pk2 == targetDF.pk2)
    & (targetDF.active_status == "Y")
)

target_columns = [
    targetDF[name].alias(f"target_{name}")
    for name in ("pk1", "pk2", "dim1", "dim2", "dim3", "dim4")
]

joinDF = (
    sourceDF
    .join(targetDF, join_condition, "leftouter")
    .select(sourceDF["*"], *target_columns)
)

display(joinDF)
    

In [0]:
# Keep only the source rows that require an SCD action:
#   * new keys      -> no active target row was joined, so target_pk1 is NULL
#   * changed keys  -> at least one tracked attribute differs from the target
#
# Attributes are compared pairwise with null-safe equality instead of comparing
# xxhash64() of the two column lists. Spark's hash functions skip NULL inputs,
# so (NULL, 900) and (900, NULL) produce the same hash and such a change would
# be silently dropped. A new key whose attributes are all NULL would also hash
# equal to the (all-NULL) missing target side, hence the explicit new-key test.
is_new_key = joinDF.target_pk1.isNull()

attributes_unchanged = (
    joinDF.dim1.eqNullSafe(joinDF.target_dim1)
    & joinDF.dim2.eqNullSafe(joinDF.target_dim2)
    & joinDF.dim3.eqNullSafe(joinDF.target_dim3)
    & joinDF.dim4.eqNullSafe(joinDF.target_dim4)
)

filterDF = joinDF.filter(is_new_key | ~attributes_unchanged)

display(filterDF)


In [0]:
# Tag every row that needs action with a MERGEKEY built from its two key columns.
# The expression must stay identical to the one the merge condition applies to
# the target (concat of pk1 and pk2), otherwise no row would ever match.
# NOTE(review): concat() uses no delimiter, so distinct key pairs such as
# (1, '11') and (11, '1') would yield the same MERGEKEY - confirm the real keys
# cannot collide this way.
merge_key = concat(filterDF["pk1"], filterDF["pk2"])
mergerDF = filterDF.withColumn("MERGEKEY", merge_key)
display(mergerDF)

In [0]:
# Second copy of the rows that already have an active target version
# (target_pk1 is populated). Their MERGEKEY is NULL, so they can never satisfy
# the merge condition and fall through to the insert branch as the new version.
rows_with_active_target = filterDF.where("target_pk1 is not null")
dummyDF = rows_with_active_target.withColumn("MERGEKEY", lit(None))

display(dummyDF)

In [0]:
# Stack the keyed rows and the NULL-keyed copies into the single source for the
# merge. union() matches columns by position; both frames derive from filterDF
# with MERGEKEY appended last, so the positions line up.
scdDF = mergerDF.union(other=dummyDF)
display(scdDF)

In [0]:
# Apply the staged rows to the Delta target in one MERGE:
#   * matched     (MERGEKEY equals the key of an active target row)
#                 -> expire that row: active_status 'N', end_date = now
#   * not matched (NULL MERGEKEY, or a key with no active target row)
#                 -> insert the source row as the new active version
#
# start_date / end_date are TIMESTAMP columns and the seed rows were stamped
# with current_timestamp(), so the merge uses current_timestamp() as well.
# current_date would be truncated to midnight, which gives a row that is loaded
# and expired on the same day an end_date earlier than its own start_date.
# The open-ended end_date is expressed as a timestamp to match the column type.
(
    targetTable.alias("target")
    .merge(
        source=scdDF.alias("source"),
        condition="concat(target.pk1, target.pk2) = source.MERGEKEY and target.active_status = 'Y' ",
    )
    .whenMatchedUpdate(
        set={
            "active_status": "'N'",
            "end_date": "current_timestamp()",
        }
    )
    .whenNotMatchedInsert(
        values={
            "pk1": "source.pk1",
            "pk2": "source.pk2",
            "dim1": "source.dim1",
            "dim2": "source.dim2",
            "dim3": "source.dim3",
            "dim4": "source.dim4",
            "active_status": "'Y'",
            "start_date": "current_timestamp()",
            "end_date": "cast('9999-12-31' as timestamp)",
        }
    )
    .execute()
)
                        


In [0]:
%sql
-- Inspect the target after the merge (all versions, active and expired).
SELECT *
FROM scd2Demo