In [0]:
# silver_providers_scd2_batch.ipynb
# SOURCE: `kardia_bronze.bronze_providers` (daily snapshot with `_ingest_ts`)
# OUTPUT: `kardia_silver.silver_providers` (SCD2 with current flag)
# PATTERN: Compare latest snapshot to current; MERGE by closing changed rows and inserting new versions
# TRIGGER: Incremental batch (full snapshot; CDF not required)

# Install kflow from local wheel for use during job execution
%pip install -q --no-deps --no-index --find-links=/dbfs/Shared/libs kflow

from pyspark.sql import functions as F, Window
from delta.tables import DeltaTable

from kflow.config import bronze_table, silver_paths

# Load Silver config for Providers dataset
# `S` is the config object returned by kflow for the "providers" dataset; this
# notebook reads its `.db`, `.table` and `.path` attributes.
S = silver_paths("providers")
# Name of the Bronze table this notebook reads from.
SRC_TABLE = bronze_table("providers")
# Name of the Silver SCD2 table this notebook creates and merges into.
TGT_TABLE = S.table

# Business rule for versioning: a provider gets a new SCD2 version when its
# specialty or its location differs between the target row (alias `t`) and the
# source row (alias `s`). `<=>` is Spark SQL's null-safe equality, so
# NULL vs NULL counts as "unchanged" and NULL vs a value counts as "changed".
PROVIDER_CHANGE_CONDITION = " OR ".join(
    [
        "NOT (t.provider_specialty <=> s.provider_specialty)",
        "NOT (t.provider_location  <=> s.provider_location)",
    ]
)

In [0]:
# 1. Ensure Silver DB and Providers table exists
# Both statements use IF NOT EXISTS, so re-running this cell is a no-op once
# the database and table are in place.
create_db_sql = f"CREATE DATABASE IF NOT EXISTS {S.db}"
spark.sql(create_db_sql)

# SCD2 layout: business columns plus the validity window
# (eff_start_ts / eff_end_ts) and the is_current flag.
create_table_sql = f"""
    CREATE TABLE IF NOT EXISTS {TGT_TABLE} (
        provider_id         STRING  NOT NULL,
        provider_specialty  STRING,
        provider_location   STRING,
        eff_start_ts        TIMESTAMP,
        eff_end_ts          TIMESTAMP,
        is_current          BOOLEAN
    ) USING DELTA
    LOCATION '{S.path}'
    """
spark.sql(create_table_sql)

In [0]:
# 2. Load the latest snapshot from Bronze and prepare for SCD2
# Rows without a ProviderID cannot be keyed in Silver, so they are excluded first.
bronze = spark.table(SRC_TABLE).where(F.col("ProviderID").isNotNull())

# Newest-first ordering of each provider's Bronze rows by ingestion time.
w_latest_per_provider = Window.partitionBy("ProviderID").orderBy(F.desc("_ingest_ts"))

# Rank each provider's rows so that rank 1 is the most recently ingested one.
bronze_ranked = bronze.withColumn("_rn", F.row_number().over(w_latest_per_provider))

# Keep only the newest row per provider, rename to Silver column names, and
# attach the SCD2 bookkeeping columns for a freshly opened version:
#   eff_start_ts = ingestion time, eff_end_ts = NULL (open-ended), is_current = TRUE.
latest_src = (
    bronze_ranked
      .where(F.col("_rn") == 1)
      .select(
          F.col("ProviderID").alias("provider_id"),
          F.col("ProviderSpecialty").alias("provider_specialty"),
          F.col("ProviderLocation").alias("provider_location"),
          F.col("_ingest_ts"),
          F.col("_ingest_ts").alias("eff_start_ts"),
          F.lit(None).cast("timestamp").alias("eff_end_ts"),
          F.lit(True).alias("is_current"),
      )
)

In [0]:
# 3. Apply SCD2 MERGE to Silver Providers table
#
# A MERGE applies at most one action per source row. A source row that matches
# a current Silver row can therefore only CLOSE that row; it cannot also be
# inserted as the new version. To do both in one atomic MERGE, changed providers
# are staged twice:
#   * once with `_merge_key = provider_id` -> matches and closes the old version
#   * once with `_merge_key = NULL`        -> never matches, so it is inserted
# New providers (no current Silver row) and unchanged providers only appear once,
# with `_merge_key = provider_id`.
silver = DeltaTable.forName(spark, TGT_TABLE)

# Source rows whose provider already has a current Silver version with different
# tracked attributes. PROVIDER_CHANGE_CONDITION is written against aliases t / s.
changed_src = (
    latest_src.alias("s")
              .join(
                  silver.toDF().filter("is_current = TRUE").alias("t"),
                  F.col("s.provider_id") == F.col("t.provider_id"),
                  "inner"
              )
              .where(PROVIDER_CHANGE_CONDITION)
              .select("s.*")
)

# The NULL key is cast to provider_id's own type so both union branches agree.
provider_id_type = latest_src.schema["provider_id"].dataType

staged_src = (
    latest_src.withColumn("_merge_key", F.col("provider_id"))
              .unionByName(
                  changed_src.withColumn("_merge_key", F.lit(None).cast(provider_id_type))
              )
)

(silver.alias("t")
       .merge(
           staged_src.alias("s"),
           "t.provider_id = s._merge_key AND t.is_current = TRUE"
       )

    # Phase 1: Close the old version if key fields changed
    .whenMatchedUpdate(
        condition=PROVIDER_CHANGE_CONDITION,
        set={
            "eff_end_ts": F.col("s.eff_start_ts"),
            "is_current": F.lit(False)
        }
    )

    # Phase 2: Insert brand-new providers and the new version of changed ones.
    # Columns are listed explicitly so the helper columns (_merge_key, _ingest_ts)
    # are never written to, or schema-evolved into, the Silver table.
    .whenNotMatchedInsert(
        values={
            "provider_id":        F.col("s.provider_id"),
            "provider_specialty": F.col("s.provider_specialty"),
            "provider_location":  F.col("s.provider_location"),
            "eff_start_ts":       F.col("s.eff_start_ts"),
            "eff_end_ts":         F.col("s.eff_end_ts"),
            "is_current":         F.col("s.is_current")
        }
    )
    .execute())

In [0]:
# 4. Verify Silver Providers SCD2 output.
# Print the total row count, then show the five most recently started versions.
silver_providers = spark.table(TGT_TABLE)
silver_row_count = silver_providers.count()
print(f"Silver Providers row count: {silver_row_count:,}")
display(silver_providers.orderBy(F.desc("eff_start_ts")).limit(5))