In [0]:
# 02_silver_providers_dim.py
# SOURCE : kardia_bronze.bronze_providers  (Bronze appends snapshots; _ingest_ts present; CDF available but unused here)
# OUTPUT : kardia_silver.silver_providers_dim  (SCD‑2 w/ eff_start_ts, eff_end_ts, is_current)
# PATTERN: Simple batch full‑snapshot compare (demo scale). Phase 1 close changed rows; Phase 2 insert new/changed.
# NOTE   : eff_start_ts taken from Bronze _ingest_ts (fallback now()) to avoid artificial history churn.

from pyspark.sql import functions as F, Window
from delta.tables import DeltaTable

# Fully qualified table names: Bronze input, Silver database, Silver SCD-2 output.
BRONZE_TABLE = "kardia_bronze.bronze_providers"
SILVER_DB    = "kardia_silver"
DIM_TABLE    = f"{SILVER_DB}.silver_providers_dim"

# Tracked attributes: a null-safe difference in any of these columns is what
# should open a new SCD-2 version for a provider.
CHANGE_COLS = [
    "ProviderSpecialty",
    "ProviderLocation",
]

In [0]:
# 1. Bootstrap: make sure the Silver database and the dimension table exist
spark.sql(f"CREATE DATABASE IF NOT EXISTS {SILVER_DB}")

dim_missing = not spark.catalog.tableExists(DIM_TABLE)
if dim_missing:
    # Zero-row slice of Bronze: contributes the schema only, no data.
    base0 = spark.table(BRONZE_TABLE).limit(0)

    # SCD-2 bookkeeping columns, appended after the Bronze columns in this order.
    scd2_columns = [
        ("eff_start_ts", F.current_timestamp()),
        ("eff_end_ts",   F.lit(None).cast("timestamp")),
        ("is_current",   F.lit(True)),
    ]
    dim0 = base0
    for scd2_name, scd2_expr in scd2_columns:
        dim0 = dim0.withColumn(scd2_name, scd2_expr)

    # Persist the empty frame as a managed Delta table so later MERGEs have a target.
    dim0.write.format("delta").saveAsTable(DIM_TABLE)

In [0]:
# 2. Latest Bronze snapshot per ProviderID
#    Bronze appends every seed, so a provider can appear many times. The MERGE
#    steps that consume `latest_src` need at most one source row per ProviderID:
#    Delta MERGE raises an error when several source rows try to update the same
#    target row, and an insert-only MERGE would otherwise add several "current"
#    rows for the same provider.
bronze_df  = spark.table(BRONZE_TABLE)
has_ingest = "_ingest_ts" in bronze_df.columns

if has_ingest:
    # Rank each provider's rows newest-first and keep only the newest one.
    w = Window.partitionBy("ProviderID").orderBy(F.col("_ingest_ts").desc())
    latest_src = (bronze_df
                  .withColumn("_rn", F.row_number().over(w))
                  .filter("_rn = 1")
                  .drop("_rn")
                  # A NULL _ingest_ts would otherwise produce a NULL eff_start_ts;
                  # apply the documented now() fallback per row.
                  .withColumn("eff_start_ts",
                              F.coalesce(F.col("_ingest_ts").cast("timestamp"),
                                         F.current_timestamp())))
else:
    # No ingest timestamp to rank by: still collapse to one row per ProviderID
    # (which duplicate survives is arbitrary) and stamp the version with now().
    latest_src = (bronze_df
                  .dropDuplicates(["ProviderID"])
                  .withColumn("eff_start_ts", F.current_timestamp()))

# Every source row is a candidate open version: no end timestamp, flagged current.
latest_src = (latest_src
              .withColumn("eff_end_ts", F.lit(None).cast("timestamp"))
              .withColumn("is_current", F.lit(True)))

In [0]:
# 3. Close changed rows (null-safe diffs)
dim = DeltaTable.forName(spark, DIM_TABLE)

# Derive the change predicate from CHANGE_COLS so the configured list is the
# single source of truth for what counts as a change. `<=>` is Spark SQL's
# null-safe equality, so NULL -> NULL is "unchanged" and NULL -> value is a change.
# An empty CHANGE_COLS falls back to "false": nothing is ever treated as changed.
change_pred = " OR ".join(
    f"NOT (t.`{col_name}` <=> s.`{col_name}`)" for col_name in CHANGE_COLS
) or "false"

(dim.alias("t")
    .merge(
        latest_src.alias("s"),
        # Only the open (current) version of each provider is a candidate.
        "t.ProviderID = s.ProviderID AND t.is_current = true"
    )
    .whenMatchedUpdate(
        condition=change_pred,
        set={
            "eff_end_ts": F.col("s.eff_start_ts"),  # close at start of new version
            "is_current": F.lit(False)
        }
    )
    .execute())

In [0]:
# 4. Insert new *and* changed rows
#    A source row with no open (is_current = true) target row for its ProviderID
#    is inserted as a new version with all of its columns.
open_version_match = "t.ProviderID = s.ProviderID AND t.is_current = true"

insert_merge = dim.alias("t").merge(latest_src.alias("s"), open_version_match)
insert_merge.whenNotMatchedInsertAll().execute()

In [0]:

# 5. Validate: report the total row count of the dimension (all versions, open and closed)
dim_row_count = spark.table(DIM_TABLE).count()
print(f"Providers dimension refreshed. Rows: {dim_row_count}")