In [0]:
# 02_silver_providers_transform.py
# SOURCE : kardia_bronze.bronze_providers  (Bronze appends snapshots; _ingest_ts present; CDF available but unused here)
# OUTPUT : kardia_silver.silver_providers_dim  (SCD‑2 w/ eff_start_ts, eff_end_ts, is_current)
# PATTERN: Simple batch full‑snapshot compare (demo scale). Phase 1 close changed rows; Phase 2 insert new/changed.
# NOTE   : eff_start_ts taken from Bronze _ingest_ts (fallback now()) to avoid artificial history churn.

from pyspark.sql import functions as F, Window
from delta.tables import DeltaTable

# Fully qualified table names: Bronze source and Silver SCD-2 dimension target.
SILVER_DB = "kardia_silver"
BRONZE_TABLE = "kardia_bronze.bronze_providers"
DIM_TABLE = ".".join((SILVER_DB, "silver_providers_dim"))

In [0]:
# 1. Make sure the Silver database and the SCD-2 dimension table are present
spark.sql(f"CREATE DATABASE IF NOT EXISTS {SILVER_DB}")

# First run only: bootstrap an empty Delta table whose schema is the Bronze
# schema plus the three SCD-2 bookkeeping columns.
if not spark.catalog.tableExists(DIM_TABLE):
    scd2_columns = {
        "eff_start_ts": F.current_timestamp(),
        "eff_end_ts":   F.lit(None).cast("timestamp"),
        "is_current":   F.lit(True),
    }

    # limit(0) keeps the Bronze schema but carries no rows.
    dim_bootstrap = spark.table(BRONZE_TABLE).limit(0)
    for col_name, col_expr in scd2_columns.items():
        dim_bootstrap = dim_bootstrap.withColumn(col_name, col_expr)

    dim_bootstrap.write.format("delta").saveAsTable(DIM_TABLE)

In [0]:
# 2. Latest snapshot per ProviderID from Bronze
bronze = spark.table(BRONZE_TABLE)

# Newest snapshot first. NULL _ingest_ts values sort last (made explicit here),
# so an untimestamped row is only picked when a provider has no timestamped one.
w      = Window.partitionBy("ProviderID").orderBy(F.col("_ingest_ts").desc_nulls_last())

latest_src = (bronze
              # A NULL ProviderID can never satisfy the MERGE equality on ProviderID,
              # so such rows would be inserted again on every run. Keep them out.
              .filter(F.col("ProviderID").isNotNull())
              .withColumn("_rn", F.row_number().over(w))
              .filter("_rn = 1")
              .drop("_rn")
              # eff_start_ts is the Bronze ingest time, falling back to now() when
              # _ingest_ts is missing, so a version never starts at NULL.
              .withColumn("eff_start_ts",
                          F.coalesce(F.col("_ingest_ts").cast("timestamp"),
                                     F.current_timestamp()))
              .withColumn("eff_end_ts",   F.lit(None).cast("timestamp"))
              .withColumn("is_current",   F.lit(True)))

In [0]:
# 3. SCD‑2 refresh in two phases (close, then insert)
#
# Why two MERGEs: within a single MERGE a source row that matches the current
# target row can only drive the matched (UPDATE) clause. The not-matched INSERT
# clause never fires for that same row, so a one-shot MERGE closes the old
# version of a changed provider but does not insert the new one until the next
# run, leaving the provider without a current row in between.
dim = DeltaTable.forName(spark, DIM_TABLE)

# Join each source row to the provider's current version only.
merge_cond = "t.ProviderID = s.ProviderID AND t.is_current = true"

# Null-safe comparison (<=>) so NULL -> value and value -> NULL count as changes.
change_pred = (
    "NOT (t.ProviderSpecialty <=> s.ProviderSpecialty) OR "
    "NOT (t.ProviderLocation  <=> s.ProviderLocation)"
)

# Phase 1 ▸ Close the existing version when a change is detected
(dim.alias("t")
    .merge(latest_src.alias("s"), merge_cond)
    .whenMatchedUpdate(
        condition=change_pred,
        set={
            # The old version ends exactly where the new one starts.
            "eff_end_ts": F.col("s.eff_start_ts"),
            "is_current": F.lit(False)
        }
    )
    .execute())

# Phase 2 ▸ Insert brand‑new providers AND new versions of rows closed in phase 1.
# After phase 1 neither has a current row, so both fall into the not-matched
# branch. Unchanged providers still match their current row and are left alone.
# If this phase fails, a rerun inserts the missing versions (phase 1 becomes a no-op).
(dim.alias("t")
    .merge(latest_src.alias("s"), merge_cond)
    .whenNotMatchedInsertAll()
    .execute())

print(f"SCD‑2 dimension refreshed: {DIM_TABLE}.  Row count: {spark.table(DIM_TABLE).count()}")