In [0]:
# Notebook: apply_scd2_location.py

import dlt
from pyspark.sql.functions import *
from pyspark.sql.types import *

# ============================================================================
# EXTRACT LOCATION DATA FROM SILVER
# ============================================================================

@dlt.table(
    name="location_source",
    comment="Location data for SCD Type 2"
)
def location_source():
    """Project the location attributes out of the silver crime dataset.

    Reads the ``crime_silver`` pipeline dataset, keeps only the location
    columns plus ``processing_timestamp``, drops exact duplicate rows and
    discards rows whose ``area`` is null.

    Returns:
        A DataFrame with columns ``area``, ``area_name``, ``lat_clean``,
        ``lon_clean`` and ``processing_timestamp``.
    """
    location_columns = [
        "area",
        "area_name",
        "lat_clean",
        "lon_clean",
        "processing_timestamp",
    ]

    silver_df = dlt.read("crime_silver")
    unique_locations = silver_df.select(*location_columns).distinct()

    # NOTE(review): distinct() compares all five columns, so a single area can
    # still appear on several rows -- confirm that is what downstream expects.
    return unique_locations.where(col("area").isNotNull())


In [0]:

# Declare the streaming table that receives the change-tracked location rows.
dlt.create_streaming_table(
    comment="SCD Type 2 tracking for location",
    name="dim_location_scd2_raw",
)

# Feed location_source into that table: rows are matched on `area` and
# sequenced by `processing_timestamp`.
dlt.apply_changes(
    source="location_source",
    target="dim_location_scd2_raw",
    keys=["area"],
    sequence_by="processing_timestamp",
    stored_as_scd_type=2,  # <- SCD Type 2 happens HERE
    ignore_null_updates=True,
)

In [0]:
# ============================================================================
# CLEAN SCD TYPE 2 OUTPUT - READY FOR SNOWFLAKE
# ============================================================================

@dlt.table(
    name="dim_location_gold",
    comment="Location dimension with SCD Type 2 - READY FOR SNOWFLAKE"
)
def dim_location_gold():
    """
    Shape the SCD Type 2 history in ``dim_location_scd2_raw`` for export.

    SCD Type 2 has ALREADY been applied upstream; this table only renames and
    derives presentation columns, so no further SCD logic is needed when it is
    loaded to Snowflake.

    Derived columns:
        effective_date: calendar date of the DLT ``__START_AT`` column.
        end_date:       calendar date of the DLT ``__END_AT`` column.
        is_current:     True when ``__END_AT`` is null.
        location_key:   ``LOC_<area>_<n>`` where ``n`` is the 1-based position
                        of the row among its area's rows ordered by
                        ``__START_AT``.

    Returns:
        A DataFrame with columns ``location_key``, ``area``, ``area_name``,
        ``lat_clean``, ``lon_clean``, ``effective_date``, ``end_date`` and
        ``is_current``.
    """
    from pyspark.sql.window import Window

    # Read through the pipeline (as location_source does) instead of a
    # hard-coded catalog.schema name, so the table follows the pipeline target.
    history_df = dlt.read("dim_location_scd2_raw")

    # Order versions by the untruncated __START_AT: ordering by the day-level
    # effective_date would tie when two versions of an area start on the same
    # day, making row_number() -- and therefore location_key -- non-deterministic.
    versions_by_start = Window.partitionBy("area").orderBy(col("__START_AT"))

    return (
        history_df
        .withColumn("effective_date", to_date(col("__START_AT")))
        .withColumn("end_date", to_date(col("__END_AT")))
        # isNull() already yields a boolean; no when/otherwise wrapper needed.
        .withColumn("is_current", col("__END_AT").isNull())
        .withColumn("version_num", row_number().over(versions_by_start))
        .withColumn(
            "location_key",
            concat(lit("LOC_"), col("area"), lit("_"), col("version_num")),
        )
        .select(
            "location_key",
            "area",
            "area_name",
            "lat_clean",
            "lon_clean",
            "effective_date",
            "end_date",
            "is_current",
        )
    )