In [0]:
# 02_silver_encounters_transform.ipynb
# SOURCE: Stream encounter records from Bronze (with Delta CDF) into Silver.
# OUTPUT: `kardia_silver.silver_encounters`, updated incrementally.
# TRIGGER: Continuously read incremental inserts and updates from Bronze Encounters table.

from pyspark.sql import functions as F
from delta.tables import DeltaTable

# Database, table, and checkpoint identifiers used throughout the notebook.
SILVER_DB = "kardia_silver"
SILVER_TABLE = ".".join([SILVER_DB, "silver_encounters"])  # <db>.<table>
BRONZE_TABLE = "kardia_bronze.bronze_encounters"
CHECKPOINT_PATH = "dbfs:/kardia/_checkpoints/silver_encounters"

# Change-feed event types forwarded to Silver: new rows and the post-update
# image of changed rows.
CHANGE_TYPES = [
    "insert",
    "update_postimage",
]

In [0]:
# 1. Make sure the Silver database and the Silver Encounters Delta table exist.
#    Both statements use IF NOT EXISTS, so re-running this cell is harmless.
create_db_ddl = f"CREATE DATABASE IF NOT EXISTS {SILVER_DB}"

# Seven-column Silver schema; the two key columns are declared NOT NULL.
create_table_ddl = f"""
    CREATE TABLE IF NOT EXISTS {SILVER_TABLE} (
      encounter_id       STRING  NOT NULL,
      patient_id         STRING  NOT NULL,
      START_TS           TIMESTAMP,
      CODE               STRING,
      DESCRIPTION        STRING,
      REASONCODE         STRING,
      REASONDESCRIPTION  STRING
    ) USING DELTA
    """

spark.sql(create_db_ddl)
spark.sql(create_table_ddl)

In [0]:
# 2. Define the upsert logic.
#    For each micro-batch, update or insert Silver rows keyed by encounter_id.
#    `batch_df` is a static DF containing the new and updated rows read from the Bronze CDF.
def upsert_to_silver(batch_df, batch_id):
    """Merge one micro-batch of encounter rows into the Silver table.

    Intended to be passed to ``DataStreamWriter.foreachBatch``.

    Args:
        batch_df: Static DataFrame holding the rows of the current micro-batch.
            It must contain ``encounter_id`` plus the columns of the Silver table.
        batch_id: Micro-batch identifier supplied by Structured Streaming.
            Not used by this function.

    Returns:
        None. The only effect is the MERGE into ``SILVER_TABLE``.

    Notes:
        * Rows are matched on ``encounter_id`` alone. Matching on
          ``patient_id`` as well would treat a corrected patient reference as
          a brand-new encounter and insert a second row with the same key.
        * Delta MERGE fails when several source rows match one target row, and
          inserts every unmatched source row, so the batch is reduced to one
          row per ``encounter_id`` before merging.
        * NOTE(review): picking the *latest* row requires ``_commit_version``
          to be present in ``batch_df``. When it is absent, an arbitrary row
          per key is kept; carry ``_commit_version`` through the upstream
          projection if strict ordering matters.
    """
    if batch_df.isEmpty():
        return

    if "_commit_version" in batch_df.columns:
        # Imported here because the notebook's import cell does not provide it.
        from pyspark.sql import Window

        newest_first = Window.partitionBy("encounter_id").orderBy(
            F.col("_commit_version").desc()
        )
        source_df = (
            batch_df
            .withColumn("_row_rank", F.row_number().over(newest_first))
            .filter(F.col("_row_rank") == 1)
            # Strip the helper and CDF metadata columns so that
            # update-all / insert-all only see Silver columns.
            .drop("_row_rank", "_change_type", "_commit_version", "_commit_timestamp")
        )
    else:
        source_df = batch_df.dropDuplicates(["encounter_id"])

    (
        # Use the session that owns this micro-batch rather than the notebook global.
        DeltaTable.forName(batch_df.sparkSession, SILVER_TABLE)
        .alias("target")
        .merge(
            source_df.alias("source"),
            "target.encounter_id = source.encounter_id",
        )
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
        .execute()
    )

In [0]:
# 3a. Open a streaming read over the Bronze Encounters change data feed and
#     keep only the change types listed in CHANGE_TYPES.
cdf_reader = spark.readStream.format("delta").option("readChangeFeed", "true")
is_wanted_change = F.col("_change_type").isin(*CHANGE_TYPES)

bronze_cdf = cdf_reader.table(BRONZE_TABLE).where(is_wanted_change)

In [0]:
# 3b. Project the Bronze change rows onto the seven-column Silver schema.
silver_ready = (
    bronze_cdf
        .withColumnRenamed("ID",      "encounter_id")
        .withColumnRenamed("PATIENT", "patient_id")

        # Parse the raw `DATE` string (yyyy-MM-dd) straight into the Silver
        # timestamp column. Only this timestamp is written to Silver; no
        # separate DateType column is produced.
        .withColumn("START_TS", F.to_timestamp(F.col("DATE"), "yyyy-MM-dd"))

        # Final schema: exactly the Silver table's columns, in table order.
        # CDF metadata columns (e.g. `_change_type`) are dropped here.
        .select(
            "encounter_id",
            "patient_id",
            "START_TS",
            "CODE",
            "DESCRIPTION",
            "REASONCODE",
            "REASONDESCRIPTION",
        )
)
# NOTE: A date-only string parses to midnight in the Spark session time zone
#       (`spark.sql.session.timeZone`).
# NOTE(review): the original comment states this is UTC by default — confirm
#       against the cluster configuration.

In [0]:
# 3c. Start the streaming write into Silver.
#     Each micro-batch is handed to `upsert_to_silver` (foreachBatch + MERGE),
#     and a new micro-batch is triggered every 30 seconds.
stream_writer = silver_ready.writeStream.foreachBatch(upsert_to_silver)
stream_writer = stream_writer.option("checkpointLocation", CHECKPOINT_PATH)
stream_writer = stream_writer.trigger(processingTime="30 seconds")
query = stream_writer.start()

# Status banner rendered in the notebook once the query has been started.
started_banner = """
    <div style='color:green; font-weight:bold;'>
        Stream started: silver_encounters<br>
        Trigger: every 30 sec • Source: CDF from bronze_encounters
    </div>
    """
displayHTML(started_banner)

# Block this cell until the streaming query stops or fails.
query.awaitTermination()

# NOTE(review): The stream itself is not deduplicated before it reaches the
# batch handler. The stated rationale is that encounters are rarely updated,
# so each encounter_id is expected at most once per micro-batch — confirm this
# holds, since several Bronze commits can fall into one 30-second trigger.