In [0]:
# join_silver_encounters_enriched.ipynb
# SOURCE:  Joins streaming Silver encounters with static Silver patients for enrichment.
# OUTPUT:  `kardia_silver.silver_encounters_enriched`, continuously upserted.
# PATTERN: Stream–static left join (all encounters retained, even when patient is missing).
# TRIGGER: Micro-batches every 30 seconds (processing-time trigger); each batch is upserted into the Delta table via MERGE.

from kflow.config import silver_paths

from delta.tables import DeltaTable
from pyspark.sql import functions as F

# Resolve the Silver-layer settings for the "encounters_enriched" dataset.
# Within this notebook only two attributes of `S` are used:
#   - S.table      -> target table name (CREATE TABLE and the MERGE target)
#   - S.checkpoint -> checkpoint location handed to the streaming writer
S = silver_paths("encounters_enriched")
TGT_TABLE = S.table  # interpolated into the DDL and passed to DeltaTable.forName

In [0]:
# 1. Bootstrap the target table.
#    CREATE TABLE IF NOT EXISTS is a no-op when the table is already there,
#    so this cell can be re-run safely. The column list mirrors the columns
#    selected by the join further down in this notebook.
create_target_ddl = f"""
    CREATE TABLE IF NOT EXISTS {TGT_TABLE} (
        encounter_id       STRING,
        patient_id         STRING,
        START_TS           TIMESTAMP,
        CODE               STRING,
        DESCRIPTION        STRING,
        REASONCODE         STRING,
        REASONDESCRIPTION  STRING,
        gender             STRING,
        birth_year         INT
    ) USING DELTA
    """
spark.sql(create_target_ddl)

In [0]:
# 2. Open the two inputs of the stream-static join.
#    Static side: patients, read as an ordinary (batch) DataFrame.
patients_df = spark.table("kardia_silver.silver_patients")
#    Streaming side: encounters, read incrementally as a stream.
enc_stream = spark.readStream.table("kardia_silver.silver_encounters")

In [0]:
# 3. Enrich every encounter with patient demographics.
#    Left join: an encounter with no matching patient is still emitted,
#    with NULL gender / birth_year.
enc_aliased = enc_stream.alias("e")
pat_aliased = patients_df.alias("p")

# Encounter columns are passed through unchanged, in target-table order.
encounter_cols = [
    F.col(f"e.{col_name}")
    for col_name in (
        "encounter_id",
        "patient_id",
        "START_TS",
        "CODE",
        "DESCRIPTION",
        "REASONCODE",
        "REASONDESCRIPTION",
    )
]

# Demographic attributes taken from the patient side.
patient_cols = [
    F.col("p.gender").alias("gender"),
    F.col("p.birth_year").alias("birth_year"),
]

joined = (
    enc_aliased
    .join(pat_aliased, on=F.col("e.patient_id") == F.col("p.id"), how="left")
    .select(*encounter_cols, *patient_cols)
)

In [0]:
# 4. Define upsert logic for each micro-batch
def upsert_to_encounters(batch_df, batch_id):
    """Upsert one micro-batch of enriched encounters into the target Delta table.

    Intended to be passed to ``writeStream.foreachBatch``.

    Args:
        batch_df: DataFrame holding the rows of the current micro-batch.
        batch_id: Micro-batch identifier supplied by ``foreachBatch``; unused
            here, but required by the callback signature.

    Rows are matched on ``encounter_id``: matched target rows are overwritten
    with the batch values, unmatched rows are inserted.
    """
    if batch_df.isEmpty():
        return

    # Delta MERGE raises an error when more than one source row matches the
    # same target row, which would stop the stream. Keep a single row per merge
    # key. NOTE(review): which duplicate survives is arbitrary; if a specific
    # row must win, order explicitly before de-duplicating.
    source_df = batch_df.dropDuplicates(["encounter_id"])

    # Use the session that owns the micro-batch rather than the notebook-global
    # `spark`, so the callback does not depend on driver-side notebook state.
    session = batch_df.sparkSession

    (DeltaTable.forName(session, TGT_TABLE)
               .alias("t")
               .merge(source_df.alias("s"), "t.encounter_id = s.encounter_id")
               .whenMatchedUpdateAll()
               .whenNotMatchedInsertAll()
               .execute())

In [0]:
# 5. Launch the streaming query.
#    Each micro-batch is handed to upsert_to_encounters; progress is tracked
#    at S.checkpoint, and a new micro-batch is triggered every 30 seconds.
stream_writer = joined.writeStream.foreachBatch(upsert_to_encounters)
stream_writer = stream_writer.option("checkpointLocation", S.checkpoint)
stream_writer = stream_writer.trigger(processingTime="30 seconds")

query = stream_writer.start()
print(f"Stream silver_encounters_enriched started. Source: stream(encounters) + static(patients), Sink: {TGT_TABLE}")