In [0]:
# 02_silver_encounters_scd1_batch.ipynb
# SOURCE: kardia_bronze.bronze_encounters (Delta CDF)
# OUTPUT: kardia_silver.silver_encounters (SCD-1 upserts)
# PATTERN: Read Change Data Feed from Bronze, derive `START_TS`, and upsert via MERGE by EncounterID.
# TRIGGER: Incremental batch job

from pyspark.sql import functions as F
from delta.tables import DeltaTable

# --- Source (Bronze) -------------------------------------------------------
BRONZE_ENCOUNTERS_TBL = "kardia_bronze.bronze_encounters"

# Only these Change Data Feed row kinds are propagated to Silver.
CHANGE_TYPES = ("insert", "update_postimage")

# --- Target (Silver) -------------------------------------------------------
SILVER_DB = "kardia_silver"
SILVER_ENCOUNTERS_TBL = f"{SILVER_DB}.silver_encounters"

# --- Streaming state -------------------------------------------------------
CHECKPOINT_PATH = "dbfs:/kardia/_checkpoints/silver_encounters"

In [0]:
# 1. Bootstrap the target: make sure the Silver database and the
#    Silver Encounters Delta table are present before anything writes to them.
#    Both statements use IF NOT EXISTS, so existing objects are left as they are.
create_db_sql = f"CREATE DATABASE IF NOT EXISTS {SILVER_DB}"

create_table_sql = f"""
    CREATE TABLE IF NOT EXISTS {SILVER_ENCOUNTERS_TBL} (
        encounter_id        STRING  NOT NULL,
        patient_id          STRING  NOT NULL,
        start_ts            TIMESTAMP,
        code                STRING,
        description         STRING,
        reason_code         STRING,
        reason_description  STRING
    ) USING DELTA
    """

# The database has to exist before the table that lives in it.
spark.sql(create_db_sql)
spark.sql(create_table_sql)

In [0]:
# 2. Define the upsert logic.
#    For each batch, update or insert records by encounter_id from Delta CDF.
#    `batch_df` is a static DF containing the latest new and updated rows from Bronze CDF.
def upsert_to_silver_encounters(batch_df, _):
    (DeltaTable.forName(spark, SILVER_ENCOUNTERS_TBL)
               .alias("t")
               .merge(batch_df.alias("s"), "t.encounter_id = s.encounter_id")
               .whenMatchedUpdateAll()
               .whenNotMatchedInsertAll()
               .execute())

In [0]:
# 3a. Read new changes from the Bronze Encounters table.
bronze_cdf = (
    spark.readStream
         .format("delta")
         .option("readChangeFeed", "true")
         .table(BRONZE_ENCOUNTERS_TBL)
         .filter(F.col("_change_type").isin(*CHANGE_TYPES))
)

In [0]:
# 3b.  Enrich & rename columns to the Silver schema
silver_ready = (
    bronze_cdf
        .filter(F.col("_change_type").isin(*CHANGE_TYPES))
        .select(
            F.col("ID").alias("encounter_id"),
            F.col("PATIENT").alias("patient_id"),
            F.col("DATE").cast("timestamp").alias("start_ts"),
            F.col("CODE").alias("code"),
            F.col("DESCRIPTION").alias("description"),
            F.col("REASONCODE").alias("reason_code"),
            F.col("REASONDESCRIPTION").alias("reason_description")
        )
)

In [0]:
# 4. Run the incremental batch: configure the writer, start the query,
#    and block until it has finished.
batch_writer = (
    silver_ready.writeStream
        .foreachBatch(upsert_to_silver_encounters)
        .option("checkpointLocation", CHECKPOINT_PATH)
        .trigger(availableNow=True)
)

streaming_query = batch_writer.start()
streaming_query.awaitTermination()

print("Silver Encounters incremental batch complete.")