# Slowly Changing Dimension Example using Jinja2

## Prerequisites

1. Install the jinja2 library on the cluster: create a requirements.txt file with the content below and install it on the cluster
 - jinja2

See the Jinja2 documentation for more examples:
 - https://jinja.palletsprojects.com/en/stable/

## The Jinja2 template for handling slowly changing behavior

In [1]:
from jinja2 import Template

# Step 1 of the SCD2 load: close out the current version of every target row
# whose tracked columns differ from the matching source row.
#
# Template variables:
#   target_table    - name of the dimension table being maintained
#   source_table    - name of the table/view holding the incoming rows
#   scd_keys        - SQL join predicate between `target` and `source`
#   tracked_columns - column names whose change triggers a new version
#
# Columns are compared with Spark's null-safe equality (<=>) because a plain
# `!=` evaluates to NULL when either side is NULL, so a change to or from NULL
# would never expire the old row. An empty tracked_columns list renders the
# condition as `false`, keeping the statement valid while expiring nothing.
scd2_template_update = Template("""

-- Step 1: Expire old records
MERGE INTO {{ target_table }} AS target
USING {{ source_table }} AS source
ON {{ scd_keys }}
AND target.current_flag = true
WHEN MATCHED AND (
    {% for col in tracked_columns %}
        NOT (target.{{ col }} <=> source.{{ col }}){% if not loop.last %} OR {% endif %}
    {% else %}
        false
    {% endfor %}
)
THEN UPDATE SET
    current_flag = false,
    effective_end_date = current_date();
""")

# Step 2 of the SCD2 load: insert a fresh current version for every source row
# that has no current row in the target, or whose tracked columns differ from
# its current row.
#
# Template variables:
#   target_table    - name of the dimension table being maintained
#   source_table    - name of the table/view holding the incoming rows
#   scd_keys        - SQL join predicate between `target` and `source`
#   tracked_columns - column names whose change triggers a new version
#   insert_columns  - source columns copied into the new version
#
# "No current row" is detected with `target.current_flag IS NULL`: the join
# only matches rows where current_flag = true, so the column is NULL exactly
# when the LEFT JOIN found nothing. This avoids hard-coding a key column name
# and works for any scd_keys predicate.
#
# Each tracked column contributes its own `OR ...` term, so an empty
# tracked_columns list still renders a valid WHERE clause.
scd2_template_insert = Template("""

-- Step 2: Insert new/changed records
INSERT INTO {{ target_table }} (
    {{ insert_columns | join(', ') }},
    effective_start_date,
    effective_end_date,
    current_flag
)
SELECT
    {% for col in insert_columns %}
        source.{{ col }}{% if not loop.last %}, {% endif %}
    {% endfor %},
    current_date(),
    NULL,
    true
FROM {{ source_table }} AS source
LEFT JOIN {{ target_table }} AS target
ON {{ scd_keys }}
AND target.current_flag = true
WHERE
    target.current_flag IS NULL
    {% for col in tracked_columns %}
        OR target.{{ col }} != source.{{ col }}
    {% endfor %};

""")

def run_scd2_merge(source_table, target_table, scd_keys, tracked_columns, insert_columns):
    """Render and execute the two SCD2 statements against Spark.

    Both module-level templates are rendered with the same set of arguments
    and passed to ``spark.sql``: first ``scd2_template_update``, then
    ``scd2_template_insert``.

    Args:
        source_table: Name substituted for ``{{ source_table }}``.
        target_table: Name substituted for ``{{ target_table }}``.
        scd_keys: SQL text substituted for ``{{ scd_keys }}`` in the join
            condition of both statements.
        tracked_columns: Column names iterated by the templates to build the
            change-detection comparisons.
        insert_columns: Column names iterated by the insert template to build
            the column list and the SELECT list.
    """
    render_args = {
        "source_table": source_table,
        "target_table": target_table,
        "scd_keys": scd_keys,
        "tracked_columns": tracked_columns,
        "insert_columns": insert_columns,
    }

    # The update statement is rendered and run before the insert statement.
    for statement_template in (scd2_template_update, scd2_template_insert):
        rendered_sql = statement_template.render(**render_args)
        spark.sql(rendered_sql)

## Initial data load into empty dim_customer table

In [1]:
from pyspark.sql import SparkSession
from datetime import datetime
from pyspark.sql.functions import lit, current_date
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType, BooleanType

# Column layout of the dimension table: the four customer attributes followed
# by the three SCD2 bookkeeping columns. Every field is declared nullable.
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("customer_id", IntegerType()),
        ("name", StringType()),
        ("email", StringType()),
        ("status", StringType()),
        ("effective_start_date", DateType()),
        ("effective_end_date", DateType()),
        ("current_flag", BooleanType()),
    )
])

spark = SparkSession.builder.getOrCreate()

# Create historical target table: two customers, each with a single version
# that has no end date and is flagged as current.
initial_rows = [
    (1, "Alice", "alice@example.com", "active", datetime(2025, 1, 1), None, True),
    (2, "Bob", "bob@example.com", "active", datetime(2025, 1, 1), None, True),
]
target_df = spark.createDataFrame(initial_rows, schema)
target_df.show()

# Written in "overwrite" mode as a Delta table.
(
    target_df.write
    .mode("overwrite")
    .format("delta")
    .saveAsTable("hive.default.dim_customer")
)


## New data including updates and inserts

In [1]:
# Simulating changes to Alice's status and adding a new customer (Charlie).
# Bob is resent with the same values as his current row in the target table,
# so the load should leave his history untouched.
schema = StructType([
    StructField("customer_id", IntegerType(), True),
    StructField("name", StringType(), True),
    StructField("email", StringType(), True),
    StructField("status", StringType(), True),
])
source_df_changed = spark.createDataFrame([
    (1, "Alice", "alice@example.com", "inactive"),   # status changed
    (2, "Bob", "bob@example.com", "active"),         # same as before
    (3, "Charlie", "charlie@example.com", "active"), # new record
], schema)

# Expose the incoming rows to SQL under the name "staging_customer".
source_df_changed.createOrReplaceTempView("staging_customer")


## Perform the SCD 2 load

Capture history when any of the following columns change:
 - name
 - email
 - status


In [1]:
# Apply the staged customer rows to the dimension table, joining on
# customer_id and versioning on changes to name, email or status.
scd2_load_args = {
    "source_table": "staging_customer",
    "target_table": "hive.default.dim_customer",
    "scd_keys": "target.customer_id = source.customer_id",
    "tracked_columns": ["name", "email", "status"],
    "insert_columns": ["customer_id", "name", "email", "status"],
}
run_scd2_merge(**scd2_load_args)


In [1]:
# Display every row of the dimension table, ordered by customer and then by
# the start date of each version.
history_query = "select * from hive.default.dim_customer ORDER BY customer_id, effective_start_date"
spark.sql(history_query).show()