In [0]:
# join_silver_encounters_enriched.ipynb
# SOURCE:  Joins streaming Silver Encounters (CDF) with static Silver Patients to enrich demographics.
# OUTPUT:  `kardia_silver.silver_encounters_enriched`, upserted via MERGE.
# PATTERN: Stream–static left join (all encounters retained, even if patient is missing).
# TRIGGER: Selected at runtime by the "mode" widget / job parameter (default: "batch").
#   batch  - reads all available data once, then exits.
#   stream - runs continuous 30s micro-batches.

# Install kflow from local wheel for use during job execution
%pip install -q --no-deps --no-index --find-links=/dbfs/Shared/libs kflow

from pyspark.sql import functions as F
from delta.tables import DeltaTable

from kflow.config import silver_paths

# Set catalog to Hive Metastore since Delta tables are stored in DBFS.
# This statement runs before any table name is resolved below, so those
# lookups happen with hive_metastore as the current catalog.
spark.sql("USE CATALOG hive_metastore")

# Load the Silver config entry registered under the "encounters_enriched" key.
# This notebook reads two attributes from it: `S.table` (the MERGE target)
# and `S.checkpoint` (base path for the streaming checkpoint).
S = silver_paths("encounters_enriched")
TGT_TABLE = S.table

In [None]:
# Mode widget & flags
# The "mode" widget doubles as a job parameter: "batch" drains the available
# data once and exits, "stream" keeps the query running.
VALID_MODES = ("batch", "stream")

try:
    dbutils.widgets.dropdown("mode", "batch", list(VALID_MODES))
except Exception:
    # Best effort: creating the widget can fail (e.g. `dbutils` is not defined
    # outside Databricks). Catch Exception rather than using a bare `except`
    # so KeyboardInterrupt / SystemExit are never swallowed.
    pass

# Fall back to "batch" when dbutils is not available at all.
MODE = dbutils.widgets.get("mode") if "dbutils" in globals() else "batch"

# A job parameter can carry any string. Without this check an unknown value
# would silently take the continuous-stream branch and get its own checkpoint
# directory, so fail fast instead.
if MODE not in VALID_MODES:
    raise ValueError(f"Unsupported mode {MODE!r}; expected one of {VALID_MODES}")

IS_BATCH = (MODE == "batch")

# Each mode keeps its own checkpoint directory under the dataset's base path.
CHECKPOINT = f"{S.checkpoint}/{MODE}"

In [0]:
# 1. Make sure the MERGE target exists before the stream starts.
#    IF NOT EXISTS makes this a no-op when the table is already present.
create_target_ddl = f"""
    CREATE TABLE IF NOT EXISTS {TGT_TABLE} (
        encounter_id       STRING,
        patient_id         STRING,
        START_TS           TIMESTAMP,
        CODE               STRING,
        DESCRIPTION        STRING,
        REASONCODE         STRING,
        REASONDESCRIPTION  STRING,
        gender             STRING,
        birth_year         INT
    ) USING DELTA
    """
spark.sql(create_target_ddl)

In [0]:
# 2. Read the streaming and static inputs
# enc_stream is a streaming read of the Silver encounters table; it is the
# side of the join that drives the micro-batches written further down.
# patients_df is a regular (non-streaming) read of the Silver patients table,
# used as the static lookup side of the join.
# NOTE(review): the notebook header describes the encounters source as "CDF",
# but no change-data-feed reader option is set on this read — confirm which
# behaviour is intended.
enc_stream  = spark.readStream.table("kardia_silver.silver_encounters")
patients_df = spark.table("kardia_silver.silver_patients")

In [0]:
# 3. Enrich each encounter with patient demographics.
#    Left join: an encounter is kept even when no patient row matches, in
#    which case gender / birth_year come through as NULL.
_encounters = enc_stream.alias("e")
_patients = patients_df.alias("p")
_same_patient = F.col("e.patient_id") == F.col("p.id")

# Columns carried over unchanged from the encounter side.
_encounter_cols = [
    F.col("e.encounter_id"),
    F.col("e.patient_id"),
    F.col("e.START_TS"),
    F.col("e.CODE"),
    F.col("e.DESCRIPTION"),
    F.col("e.REASONCODE"),
    F.col("e.REASONDESCRIPTION"),
]

# Demographic columns contributed by the patient side.
_demographic_cols = [
    F.col("p.gender").alias("gender"),
    F.col("p.birth_year").alias("birth_year"),
]

joined = _encounters.join(_patients, _same_patient, "left").select(
    *_encounter_cols,
    *_demographic_cols,
)

In [0]:
# 4. Define upsert logic for each micro-batch
def upsert_to_encounters(batch_df, batch_id):
    """MERGE one micro-batch of enriched encounters into the target Delta table.

    Used as the ``foreachBatch`` callback of the enrichment stream.

    Args:
        batch_df: DataFrame with the rows of the current micro-batch. It must
            contain an ``encounter_id`` column, which is the merge key.
        batch_id: Micro-batch identifier passed in by Structured Streaming.
            Not used here.

    Returns:
        None. Rows whose ``encounter_id`` already exists in ``TGT_TABLE`` are
        updated in place; all other rows are inserted.
    """
    if batch_df.isEmpty():
        return

    # Delta aborts a MERGE when several source rows match the same target row
    # (for instance when the static patient side holds a duplicated id), so
    # keep exactly one source row per merge key. Which duplicate survives is
    # not defined by dropDuplicates.
    source = batch_df.dropDuplicates(["encounter_id"])

    # Use the session attached to the micro-batch instead of the notebook-level
    # `spark` global, so the callback does not depend on driver globals.
    target = DeltaTable.forName(batch_df.sparkSession, TGT_TABLE)

    (target.alias("t")
           .merge(source.alias("s"), "t.encounter_id = s.encounter_id")
           .whenMatchedUpdateAll()
           .whenNotMatchedInsertAll()
           .execute())

In [0]:
# 5. Start the query; the trigger depends on the "mode" job parameter
#    (set with dbutils.widgets, default: "batch").
writer = joined.writeStream.foreachBatch(upsert_to_encounters)  # MERGE into enriched table
writer = writer.option("checkpointLocation", CHECKPOINT)

if not IS_BATCH:
    # Streaming mode: fire a micro-batch every 30s and leave the query running.
    q = writer.trigger(processingTime="30 seconds").start()
    print(f"[live] Continuous 30s enrich to {TGT_TABLE} (checkpoint={CHECKPOINT})")
else:
    # Batch mode: process everything currently available, then block until
    # the query has finished.
    q = writer.trigger(availableNow=True).start()
    print(f"[demo] Draining stream(encounters)+static(patients) to {TGT_TABLE} (checkpoint={CHECKPOINT}) …")
    q.awaitTermination()