In [None]:
# 02_silver_providers_scd2_batch.ipynb
# SOURCE: kardia_bronze.bronze_providers (daily snapshot w/ _ingest_ts)
# OUTPUT: kardia_silver.silver_providers_dim (Type‑2 history, current‑flag)
# PATTERN: Snapshot compare ➜ MERGE (close‑old / insert‑new)
# NOTE: CDF not required; dataset is small and arrives as complete snapshots.

from pyspark.sql import functions as F, Window
from delta.tables import DeltaTable

# Fully qualified table names used by this notebook.
SILVER_DB = "kardia_silver"
BRONZE_PROVIDERS_TBL = "kardia_bronze.bronze_providers"
SILVER_PROVIDERS_TBL = SILVER_DB + ".silver_providers_dim"

# SQL predicate over the MERGE aliases `t` (target) and `s` (source): true when
# any tracked business column differs, which triggers a new dimension version.
# `<=>` is null-safe equality, so NULL vs NULL is not reported as a change.
PROVIDER_CHANGE_CONDITION = " OR ".join(
    [
        "NOT (t.ProviderSpecialty <=> s.ProviderSpecialty)",
        "NOT (t.ProviderLocation  <=> s.ProviderLocation)",
    ]
)

In [None]:
# 1. Ensure DB + Silver Providers table exist
# Both statements use IF NOT EXISTS, so they do nothing when the objects are
# already present.
spark.sql(f"CREATE DATABASE IF NOT EXISTS {SILVER_DB}")

# Type-2 dimension layout: business columns plus the validity window
# (eff_start_ts / eff_end_ts) and the is_current flag.
# NOTE(review): confirm the target runtime accepts a CHECK constraint inside
# CREATE TABLE; some Delta/Databricks versions only allow it through
# ALTER TABLE ... ADD CONSTRAINT.
create_dim_ddl = f"""
    CREATE TABLE IF NOT EXISTS {SILVER_PROVIDERS_TBL} (
        ProviderID        STRING,
        ProviderSpecialty STRING,
        ProviderLocation  STRING,
        eff_start_ts      TIMESTAMP,
        eff_end_ts        TIMESTAMP,
        is_current        BOOLEAN,
        CONSTRAINT providerid_nn CHECK (ProviderID IS NOT NULL)
    ) USING DELTA
    """
spark.sql(create_dim_ddl)

In [None]:
# 2. Build latest snapshot from Bronze
bronze = spark.table(BRONZE_PROVIDERS_TBL)

# Number each provider's Bronze rows newest-first by ingest time.
# NOTE(review): rows tied on _ingest_ts for the same ProviderID are ordered
# arbitrarily by row_number(); add a tie-breaker if that can happen upstream.
w_latest = Window.partitionBy("ProviderID").orderBy(F.desc("_ingest_ts"))
ranked = bronze.withColumn("_rn", F.row_number().over(w_latest))

# Keep only the newest row per ProviderID; the helper rank column is dropped.
newest_per_provider = ranked.where(F.col("_rn") == 1).drop("_rn")

# Add the SCD-2 bookkeeping columns: the version starts at ingest time, has no
# end timestamp yet, and is flagged as current.
latest_src = (
    newest_per_provider
    .withColumn("eff_start_ts", F.col("_ingest_ts").cast("timestamp"))
    .withColumn("eff_end_ts", F.lit(None).cast("timestamp"))
    .withColumn("is_current", F.lit(True))
)

In [None]:
# 3. Apply SCD-2 upsert to Silver dimension table
#
# A MERGE applies at most one action per source row. A changed provider's row
# matches its current dimension row (which gets closed), so that same source
# row can never also reach the NOT MATCHED insert branch. To close the old
# version AND insert the new one in a single transaction, every changed
# provider is staged twice:
#   * _merge_key = ProviderID -> matches the current row and closes it
#   * _merge_key = NULL       -> never matches, so it is inserted as the new version
# Brand-new providers only appear with _merge_key = ProviderID and fall through
# to the insert branch because they have no current row to match.
dim = DeltaTable.forName(spark, SILVER_PROVIDERS_TBL)

# Columns written on insert; listed explicitly so the helper _merge_key and any
# Bronze-only columns (e.g. _ingest_ts) never leak into the dimension.
dim_columns = [
    "ProviderID",
    "ProviderSpecialty",
    "ProviderLocation",
    "eff_start_ts",
    "eff_end_ts",
    "is_current",
]

# Source rows whose provider already has a current version with different
# business attributes; these are the rows that need a new version inserted.
current_dim = dim.toDF().where("is_current = true")
changed_src = (
    latest_src.alias("s")
    .join(current_dim.alias("t"), F.col("s.ProviderID") == F.col("t.ProviderID"), "inner")
    .where(PROVIDER_CHANGE_CONDITION)
    .select("s.*")
)

# NULL merge key typed like ProviderID so the union and join types line up.
key_type = latest_src.schema["ProviderID"].dataType
staged_src = (
    changed_src.withColumn("_merge_key", F.lit(None).cast(key_type))
    .unionByName(latest_src.withColumn("_merge_key", F.col("ProviderID")))
)

(dim.alias("t")
    .merge(
        staged_src.alias("s"),
        "t.ProviderID = s._merge_key AND t.is_current = true"
    )
    # Phase 1: Close the old version if key fields have changed
    .whenMatchedUpdate(
        condition=PROVIDER_CHANGE_CONDITION,
        set={
            "eff_end_ts": F.col("s.eff_start_ts"),
            "is_current": F.lit(False)
        }
    )
    # Phase 2: Insert brand-new providers and the new version of changed ones
    .whenNotMatchedInsert(
        values={name: F.col(f"s.{name}") for name in dim_columns}
    )
    .execute())

print(f"Silver providers row count: {spark.table(SILVER_PROVIDERS_TBL).count()}")