In [0]:
# 02_silver_encounters_transform.ipynb
# SOURCE: Stream encounter records from Bronze (with Delta CDF) into Silver.
# OUTPUT: `kardia_silver.silver_encounters`, updated incrementally.
# TRIGGER: Continuously read incremental inserts and updates from Bronze Encounters table.

from pyspark.sql import functions as F
from delta.tables import DeltaTable

# Pipeline configuration: database/table identifiers, the streaming checkpoint
# location, and the CDF change types that are kept when reading Bronze.
SILVER_DB = "kardia_silver"
BRONZE_TABLE = "kardia_bronze.bronze_encounters"
SILVER_TABLE = f"{SILVER_DB}.silver_encounters"
CHECKPOINT_PATH = "dbfs:/kardia/_checkpoints/silver_encounters"
CHANGE_TYPES = ["insert", "update_postimage"]

In [0]:
# 1. Bootstrap the Silver layer: create the database and the Silver Encounters
#    Delta table if they are missing (IF NOT EXISTS makes both statements
#    safe to re-run).
spark.sql(f"CREATE DATABASE IF NOT EXISTS {SILVER_DB}")

# DDL for the seven-column Silver Encounters table.
_create_silver_table_ddl = f"""
    CREATE TABLE IF NOT EXISTS {SILVER_TABLE} (
      EncounterID        STRING,
      PatientID          STRING,
      START_TS           TIMESTAMP,
      CODE               STRING,
      DESCRIPTION        STRING,
      REASONCODE         STRING,
      REASONDESCRIPTION  STRING
    ) USING DELTA
    """

spark.sql(_create_silver_table_ddl)

In [0]:
# 2. Define the upsert logic.
#    For each batch, update or insert records by EncounterID from Delta CDF.
#    `batch_df` is a static DF containing the new and updated rows from Bronze CDF.
def upsert_to_silver(batch_df, _):
    """MERGE one micro-batch of Bronze changes into the Silver Encounters table.

    A single micro-batch can contain several change rows for the same
    EncounterID (e.g. an `insert` followed by an `update_postimage`, or two
    updates from different Bronze commits). Delta MERGE requires at most one
    source row per matched target row, and unmatched duplicates would each be
    inserted, so the batch is collapsed to one row per EncounterID first.

    Args:
        batch_df: Static DataFrame for the current micro-batch. Must contain
            an `EncounterID` column. If it also carries the CDF
            `_commit_version` column, the row from the highest commit version
            wins; otherwise an arbitrary row per EncounterID is kept.
        _: Micro-batch id supplied by `foreachBatch`; unused.
    """
    # Local import: only this function needs Window.
    from pyspark.sql import Window

    if "_commit_version" in batch_df.columns:
        # Deterministic "latest wins": rank rows per key by commit version.
        newest_first = (
            Window.partitionBy("EncounterID")
                  .orderBy(F.col("_commit_version").desc())
        )
        source_df = (
            batch_df
                .withColumn("_row_rank", F.row_number().over(newest_first))
                .filter(F.col("_row_rank") == 1)
                # Drop helper/metadata columns so the source matches the
                # target schema for UpdateAll / InsertAll.
                .drop("_row_rank", "_commit_version")
        )
    else:
        # No ordering information available: still guarantee key uniqueness so
        # MERGE neither fails nor inserts duplicate EncounterIDs.
        source_df = batch_df.dropDuplicates(["EncounterID"])

    (DeltaTable.forName(spark, SILVER_TABLE)
        .alias("t")
        .merge(source_df.alias("s"), "t.EncounterID = s.EncounterID")
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
        .execute())

In [0]:
# 3a. Open a streaming read over the Bronze Encounters change data feed and
#     keep only the change types listed in CHANGE_TYPES.
bronze_cdf = (
    spark.readStream
    .format("delta")
    .option("readChangeFeed", "true")
    .table(BRONZE_TABLE)
    .where(F.col("_change_type").isin(CHANGE_TYPES))
)

In [0]:
# 3b. Project the CDF rows onto the seven-column Silver schema:
#     ID -> EncounterID, PATIENT -> PatientID, DATE cast to timestamp as START_TS;
#     the remaining four columns pass through unchanged.
silver_ready = bronze_cdf.select(
    F.col("ID").alias("EncounterID"),
    F.col("PATIENT").alias("PatientID"),
    F.col("DATE").cast("timestamp").alias("START_TS"),
    "CODE",
    "DESCRIPTION",
    "REASONCODE",
    "REASONDESCRIPTION",
)

In [0]:
# 3c. Write to Silver table using foreachBatch + MERGE for upserts.
#     Process available data every 30 seconds.
#     The StreamingQuery handle is kept in `query` so its status can be
#     inspected before the cell blocks on awaitTermination().
query = (
    silver_ready.writeStream
                .foreachBatch(upsert_to_silver)
                .option("checkpointLocation", CHECKPOINT_PATH)
                .trigger(processingTime="30 seconds")
                .start()
)

# Confirm the stream has started.
print("Silver Encounters stream started.")
print(query.status)

# Block this cell until the stream is stopped or fails.
query.awaitTermination()

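# NOTE(review): This pipeline relies on each EncounterID appearing at most once per micro-batch.
#   "Encounters are rarely updated" does not guarantee that: a micro-batch can span several Bronze
#   commits (e.g. on the initial backfill), so an insert and a later update of the same encounter
#   may arrive together. MERGE needs unique source keys; verify that upsert_to_silver handles this.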