In [None]:
# 02_silver_encounters_transform.ipynb
# SOURCE: Stream encounter records from Bronze (with Delta CDF) into Silver.
# OUTPUT: `kardia_silver.silver_encounters`, updated incrementally.
# TRIGGER: Continuously read incremental inserts and updates from Bronze Encounters table.
# TABLE STORAGE:  Partitions by ENCOUNTER_MONTH

from delta.tables import DeltaTable
from pyspark.sql import Window
from pyspark.sql import functions as F

# Databases and tables
SILVER_DB = "kardia_silver"
SILVER_TABLE = f"{SILVER_DB}.silver_encounters"
BRONZE_TABLE = "kardia_bronze.bronze_encounters"

# Streaming checkpoint location for the Silver Encounters query
CHECKPOINT_PATH = "dbfs:/kardia/_checkpoints/silver_encounters"

# Change Data Feed row types carried forward into Silver
CHANGE_TYPES = ["insert", "update_postimage"]

In [None]:
# 1. Make sure the Silver database and the Silver Encounters table are present.
#    Both statements use IF NOT EXISTS, so objects that already exist are left as they are.
spark.sql(f"CREATE DATABASE IF NOT EXISTS {SILVER_DB}")

# Eight-column Delta table, partitioned by ENCOUNTER_MONTH.
silver_encounters_ddl = f"""
    CREATE TABLE IF NOT EXISTS {SILVER_TABLE} (
      EncounterID        STRING,
      PatientID          STRING,
      START_TS           TIMESTAMP,
      ENCOUNTER_MONTH    STRING,
      CODE               STRING,
      DESCRIPTION        STRING,
      REASONCODE         STRING,
      REASONDESCRIPTION  STRING
    ) USING DELTA
    PARTITIONED BY (ENCOUNTER_MONTH)
    """
spark.sql(silver_encounters_ddl)

In [None]:
# 2. Define the upsert logic.
#    For each batch, update or insert records by EncounterID from Delta CDF.
#    `batch_df` is a static DF containing the new and updated rows of one micro-batch.
def upsert_to_silver(batch_df, _):
    """Merge one micro-batch of Bronze CDF rows into the Silver Encounters table.

    Delta MERGE aborts when more than one source row matches the same target
    row, so the batch is reduced to a single row per EncounterID before merging.

    Args:
        batch_df: Static DataFrame holding the rows of the current micro-batch.
        _: Micro-batch id supplied by `foreachBatch`; not used.

    Returns:
        None. The Silver table is modified as a side effect.
    """
    if batch_df.isEmpty():
        return

    if "_commit_version" in batch_df.columns:
        # Commit ordering is available: keep the row from the most recent Bronze commit per key.
        newest_first = Window.partitionBy("EncounterID").orderBy(F.col("_commit_version").desc())
        source_df = (batch_df.withColumn("_row_rank", F.row_number().over(newest_first))
                             .filter(F.col("_row_rank") == 1)
                             .drop("_row_rank"))
    else:
        # No ordering column in the batch, so repeated keys cannot be ranked.
        # Keep one (arbitrary) row per key so the MERGE cannot fail on duplicates.
        source_df = batch_df.dropDuplicates(["EncounterID"])

    (DeltaTable.forName(spark, SILVER_TABLE)
               .alias("target")
               .merge(source_df.alias("source"), "target.EncounterID = source.EncounterID")
               .whenMatchedUpdateAll()
               .whenNotMatchedInsertAll()
               .execute())

In [0]:
# 3a. Open a streaming read on the Bronze Encounters change feed
#     and keep only the change types listed in CHANGE_TYPES.
bronze_change_feed = (
    spark.readStream
         .format("delta")
         .option("readChangeFeed", "true")
         .table(BRONZE_TABLE)
)
bronze_cdf = bronze_change_feed.where(F.col("_change_type").isin(*CHANGE_TYPES))

In [None]:
# 3b. Shape the change-feed rows into the eight-column Silver schema.

# Rename the Bronze columns to their Silver names; the raw date string is staged
# under EVENT_DATE_STR so it can be parsed below.
encounters_renamed = bronze_cdf
for bronze_name, silver_name in (
    ("ID",      "EncounterID"),
    ("PATIENT", "PatientID"),
    ("DATE",    "EVENT_DATE_STR"),
):
    encounters_renamed = encounters_renamed.withColumnRenamed(bronze_name, silver_name)

# The raw date string is parsed with the pattern yyyy-MM-dd into:
# - a date, used only to derive ENCOUNTER_MONTH (yyyy-MM string, the partition key)
# - a timestamp, exposed as START_TS
event_date = F.to_date("EVENT_DATE_STR", "yyyy-MM-dd")
start_ts   = F.to_timestamp("EVENT_DATE_STR", "yyyy-MM-dd")

# Final projection: only the Silver columns, staging values are not carried over.
silver_ready = encounters_renamed.select(
    "EncounterID",
    "PatientID",
    start_ts.alias("START_TS"),
    F.date_format(event_date, "yyyy-MM").alias("ENCOUNTER_MONTH"),
    "CODE",
    "DESCRIPTION",
    "REASONCODE",
    "REASONDESCRIPTION",
)
# NOTE: START_TS is midnight of the parsed date in the Spark session time zone
#       (spark.sql.session.timeZone).

In [None]:
# 3c. Start the Silver stream: each micro-batch is handed to upsert_to_silver (MERGE),
#     with a new micro-batch triggered every 30 seconds.
silver_writer = (
    silver_ready.writeStream
                .foreachBatch(upsert_to_silver)
                .option("checkpointLocation", CHECKPOINT_PATH)
                .trigger(processingTime="30 seconds")
)
query = silver_writer.start()

# Report that the stream was launched and show its current status.
print("Silver Encounters stream started.")
print(query.status)

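# NOTE: Delta MERGE requires at most one source row per EncounterID in each micro-batch.
#       Encounters are rarely updated, but an EncounterID that changes more than once within
#       a single trigger interval can still appear several times in the same batch.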