In [0]:
# 02_silver_encounters_transform.ipynb
# Stream new rows from Bronze (CDF) into Silver.

from pyspark.sql import functions as F
from delta.tables import DeltaTable

# Catalog names and storage locations used throughout this notebook:
# the Silver database, the Bronze source table, the Silver target table,
# and the checkpoint directory for the streaming write.
SILVER_DB = "kardia_silver"
BRONZE_ENCOUNTERS_TABLE = "kardia_bronze.bronze_encounters"
SILVER_ENCOUNTERS_TABLE = SILVER_DB + ".silver_encounters"
CHECKPOINT_PATH = "dbfs:/kardia/_checkpoints/silver_encounters"

In [0]:
# Create the Silver database on first run; the IF NOT EXISTS clause makes
# re-running this cell safe.
ensure_db_sql = f"CREATE DATABASE IF NOT EXISTS {SILVER_DB}"
spark.sql(ensure_db_sql)

In [0]:
# 1. Open a streaming read over the Bronze table's Change Data Feed and keep
#    only the change rows that carry a record's current state (SCD Type 1):
#    fresh inserts and the post-update image. All other change types are dropped.
bronze_changes = (
    spark.readStream
        .format("delta")
        .option("readChangeFeed", "true")
        .table(BRONZE_ENCOUNTERS_TABLE)
)
is_current_image = F.col("_change_type").isin(["insert", "update_postimage"])
bronze_cdf = bronze_changes.where(is_current_image)

In [0]:
# 2. Shape Bronze change rows into the 8-column Silver schema.

# Stage A: give the Bronze columns their Silver names. The raw date string is
# parked under a staging name until it has been parsed.
encounters_renamed = (
    bronze_cdf
        .withColumnRenamed("ID", "EncounterID")
        .withColumnRenamed("PATIENT", "PatientID")
        .withColumnRenamed("DATE", "EVENT_DATE_STR")
)

# Stage B: parse the raw date string into two typed columns.
#   EVENT_DATE (DateType)      - useful for analytics
#   EVENT_TS   (TimestampType) - required for watermarking
# Note: the timestamp is parsed as midnight in the session time zone
# (UTC by default).
# START_DATE is a copy of EVENT_DATE and serves as the partition column.
raw_event_date = F.col("EVENT_DATE_STR")
encounters_dated = (
    encounters_renamed
        .withColumn("EVENT_DATE", F.to_date(raw_event_date, "yyyy-MM-dd"))
        .withColumn("EVENT_TS", F.to_timestamp(raw_event_date, "yyyy-MM-dd"))
        .withWatermark("EVENT_TS", "1 day")
        .withColumn("START_DATE", F.col("EVENT_DATE"))
)

# Stage C: project the final schema; staging columns (EVENT_DATE_STR,
# EVENT_DATE) are left out and EVENT_TS is exposed as START_TS.
silver_ready = encounters_dated.select(
    "EncounterID",
    "PatientID",
    F.col("EVENT_TS").alias("START_TS"),
    "START_DATE",
    "CODE",
    "DESCRIPTION",
    "REASONCODE",
    "REASONDESCRIPTION",
)

In [0]:
# 3. Bootstrap the Silver table when it is missing. An empty static DataFrame
#    that borrows the streaming schema is saved with mode "ignore", so this is
#    meant to be a no-op when the table already exists.
empty_silver = spark.createDataFrame([], silver_ready.schema)
table_writer = (
    empty_silver.write
        .format("delta")
        .partitionBy("START_DATE")
        .mode("ignore")
)
table_writer.saveAsTable(SILVER_ENCOUNTERS_TABLE)

# 4. foreachBatch with DeltaTable API
def upsert_to_silver(batch_df, _):
    """Merge one micro-batch of Silver-shaped rows into the Silver table.

    Rows are matched on ``EncounterID``: matching target rows are overwritten
    with the source values, and unmatched source rows are inserted.

    Args:
        batch_df: The micro-batch DataFrame handed over by ``foreachBatch``.
        _: The micro-batch id supplied by ``foreachBatch``; unused.

    Returns:
        None. An empty batch returns immediately without touching the table.
    """
    if batch_df.isEmpty():
        return

    # One micro-batch can carry several change rows for the same encounter
    # (e.g. an insert followed by an update). Delta MERGE fails when more than
    # one source row matches a single target row, so collapse the batch to one
    # row per merge key before merging.
    # NOTE(review): which duplicate survives is arbitrary because the CDF
    # ordering columns are not present in this batch; if last-write-wins within
    # a batch matters, carry an ordering column through and pick the latest.
    deduped = batch_df.dropDuplicates(["EncounterID"])

    target = DeltaTable.forName(batch_df.sparkSession, SILVER_ENCOUNTERS_TABLE)
    (
        target.alias("t")
           .merge(deduped.alias("s"), "t.EncounterID = s.EncounterID")
           .whenMatchedUpdateAll()
           .whenNotMatchedInsertAll()
           .execute()
    )

# Start the streaming write: every micro-batch is passed to upsert_to_silver,
# progress is tracked under CHECKPOINT_PATH, and a new batch is attempted
# every 30 seconds.
stream_writer = silver_ready.writeStream.foreachBatch(upsert_to_silver)
stream_writer = stream_writer.option("checkpointLocation", CHECKPOINT_PATH)
stream_writer = stream_writer.trigger(processingTime="30 seconds")
query = stream_writer.start()

# Report that the stream is up, followed by its current status.
print("Silver-encounters continuous MERGE stream started.")
print(query.status)

In [0]:
# To stop the stream manually, uncomment and run the line below.
# query.stop()