In [0]:
# silver_providers_scd2_batch.ipynb
# SOURCE: `kardia_bronze.bronze_providers` (snapshot-style input with _ingest_ts)
# TARGET: `kardia_silver.silver_providers` (SCD Type 2 with is_current flag)
# PATTERN: Identify changed rows, close current version and insert new version
# TRIGGER: Incremental batch

# Uncomment the next line if running the notebook as a Job.
# %pip install -q --no-deps --no-index --find-links=/dbfs/Shared/libs kflow

from delta.tables import DeltaTable
from pyspark.sql import functions as F
from pyspark.sql.window import Window

from kflow.auth_adls import ensure_adls_oauth
from kflow.config import bronze_table, silver_paths

# Set up ADLS OAuth on the Spark session; the helper's return value is kept
# as the base ABFS path.
abfss_base = ensure_adls_oauth()

# Pin the session to the Hive Metastore catalog (needed when Unity Catalog is not in use)
spark.sql("USE CATALOG hive_metastore")

# Resolve the Silver paths/names and the Bronze source table for Providers
S = silver_paths("providers")
SRC_TABLE = bronze_table("providers")
TGT_TABLE = S.table

# A new version is warranted when any tracked attribute differs between the
# target row (alias t) and the source row (alias s). `<=>` is the null-safe
# equality operator, so NULL vs NULL counts as "unchanged".
PROVIDER_CHANGE_CONDITION = " OR ".join(
    (
        "NOT (t.provider_specialty <=> s.provider_specialty)",
        "NOT (t.provider_location  <=> s.provider_location)",
    )
)

In [0]:
# 1. Make sure the Silver database and the Providers table are in place.
#    Both statements are IF NOT EXISTS, so re-running this cell is harmless.
create_db_sql = f"CREATE DATABASE IF NOT EXISTS {S.db}"
spark.sql(create_db_sql)

# External Delta table at S.path holding the SCD2 columns
# (eff_start_ts / eff_end_ts / is_current) plus ingest lineage columns.
create_table_sql = f"""
    CREATE TABLE IF NOT EXISTS {TGT_TABLE} (
        provider_id         STRING  NOT NULL,
        provider_specialty  STRING,
        provider_location   STRING,
        eff_start_ts        TIMESTAMP,
        eff_end_ts          TIMESTAMP,
        is_current          BOOLEAN,
        _ingest_ts          TIMESTAMP,
        _batch_id           STRING,
        _source_file        STRING
    ) USING DELTA
    LOCATION '{S.path}'
    """
spark.sql(create_table_sql)

In [0]:
# 2. Read Bronze and reduce it to the newest record per provider

# Rows without a ProviderID cannot be keyed, so they are dropped up front
bronze = spark.table(SRC_TABLE).where(F.col("ProviderID").isNotNull())

# Rank each provider's rows newest-first by ingest timestamp (NULLs sort last)
newest_ingest_first = F.col("_ingest_ts").desc_nulls_last()
w_latest = Window.partitionBy("ProviderID").orderBy(newest_ingest_first)

ranked = bronze.withColumn("_rn", F.row_number().over(w_latest))

# Keep only rank 1 and project straight into the Silver column layout:
# the new version starts at its ingest time, is open-ended, and is current.
latest_src = ranked.where(F.col("_rn") == 1).select(
    F.col("ProviderID").alias("provider_id"),
    F.col("ProviderSpecialty").alias("provider_specialty"),
    F.col("ProviderLocation").alias("provider_location"),
    F.col("_ingest_ts"),
    F.col("_batch_id"),
    F.col("_source_file"),
    F.col("_ingest_ts").alias("eff_start_ts"),
    F.lit(None).cast("timestamp").alias("eff_end_ts"),
    F.lit(True).alias("is_current"),
)

In [0]:
# 3. Perform SCD Type 2 MERGE
#    Track history for changes in specialty or location.
#
#    A MERGE routes each source row to exactly one branch: a row that matches
#    (and closes) the current version cannot also be inserted as its
#    replacement. To close the old version AND open the new one in a single
#    atomic MERGE, every changed provider is staged twice:
#      * _merge_key = provider_id -> matches the current row and closes it
#      * _merge_key = NULL        -> never matches, so it is inserted as the
#                                    new current version

# Refresh so the newly created table is visible to the engine
spark.sql(f"REFRESH TABLE {TGT_TABLE}")

# Path-based DeltaTable to avoid metastore name resolution issues
delta = DeltaTable.forPath(spark, S.path).alias("t")

# Source rows whose provider already has a current version with different
# tracked attributes; these need a brand-new version row.
changed_src = (
    latest_src.alias("s")
    .join(
        delta.toDF().alias("t"),
        F.expr("t.provider_id = s.provider_id AND t.is_current = TRUE"),
        "inner",
    )
    .where(F.expr(PROVIDER_CHANGE_CONDITION))
    .select("s.*")
)

# NULL key typed like provider_id so the union and the join condition line up
null_merge_key = F.lit(None).cast(latest_src.schema["provider_id"].dataType)

staged_src = (
    latest_src.withColumn("_merge_key", F.col("provider_id"))
    .unionByName(changed_src.withColumn("_merge_key", null_merge_key))
)

# Insert only the real Silver columns; the helper _merge_key must not leak
# into the target table.
insert_values = {name: F.col(f"s.{name}") for name in latest_src.columns}

(
    delta
    .merge(
        staged_src.alias("s"),
        "t.provider_id = s._merge_key AND t.is_current = TRUE"
    )

    # CASE 1: A current record exists and a tracked field has changed:
    #         close it by setting eff_end_ts and marking it as not current.
    .whenMatchedUpdate(
        condition=PROVIDER_CHANGE_CONDITION,
        set={
            "eff_end_ts": F.col("s.eff_start_ts"),
            "is_current": F.lit(False),
        },
    )

    # CASE 2: No current row matches: a brand-new provider, a provider whose
    #         versions are all closed, or the NULL-keyed copy of a changed
    #         provider. Insert the incoming record as the new current version.
    .whenNotMatchedInsert(values=insert_values)
    .execute()
)

In [0]:
# 4. Verify Silver Providers SCD2 output
df = spark.table(TGT_TABLE)

# Total number of version rows (current and historical) in the Silver table
row_count = df.count()
print(f"Silver Providers row count: {row_count:,}")

# Preview the five versions with the most recent effective start
newest_first = df.orderBy(F.col("eff_start_ts").desc())
display(newest_first.limit(5))