In [0]:
# 03_silver_enc_demographics_join.ipynb
# SOURCE:  Joins streaming Silver encounters with static Silver patients for enrichment.
# OUTPUT:  `kardia_silver.silver_encounters_demographics`, written as an incremental batch.
# PATTERN: Stream–static left join (keeps all encounters even when the patient is missing).
# TRIGGER: Incremental batch; append to Delta table with fixed schema.

from pyspark.sql import functions as F

# Fully qualified name of the Delta table that receives the enriched encounter rows.
SILVER_ENCOUNTERS_DEMOGRAPHICS = "kardia_silver.silver_encounters_demographics"

# Structured Streaming checkpoint directory used by this notebook's write stream.
CHECKPOINT_PATH = "dbfs:/kardia/_checkpoints/silver_encounters_demographics"

In [0]:
# 1. Open the two existing Silver Delta tables that feed the join.
encounters_table = "kardia_silver.silver_encounters"
patients_table = "kardia_silver.silver_patients"

# Streaming side: `readStream` makes Spark consume encounters incrementally,
# as an unbounded source.
enc_stream = spark.readStream.table(encounters_table)

# Static side: `spark.table` returns an ordinary bounded (batch) DataFrame.
patients_df = spark.table(patients_table)

In [0]:
# 2. Enrich every encounter with patient demographics.
#    A LEFT join is used so that encounters with no matching patient are kept;
#    their demographic columns come back as NULL. Those unmatched rows are
#    tracked for QA purposes in the Gold layer.
encounters = enc_stream.alias("e")
patients = patients_df.alias("p")

# Join key: encounter's PatientID against the patient's ID.
patient_match = F.col("e.PatientID") == F.col("p.ID")

# Columns carried over from the encounter side.
encounter_cols = [
    "e.EncounterID",
    "e.PatientID",
    "e.START_TS",
    "e.CODE",
    "e.DESCRIPTION",
    "e.REASONCODE",
    "e.REASONDESCRIPTION",
]

# Columns added from the patient side.
demographic_cols = [
    "p.GENDER",
    "p.BIRTH_YEAR",
]

joined = (
    encounters
    .join(patients, patient_match, "left")
    .select(*encounter_cols, *demographic_cols)
)

In [0]:
# 3. Write incremental batch output.

#    Execution:
#    1. Each micro-batch of new encounter rows is joined to patients_df (stateless join).
#    2. Spark may broadcast patients_df when its estimated size is below
#       spark.sql.autoBroadcastJoinThreshold (10 MB by default in open-source Spark;
#       the effective value depends on the cluster configuration).
#    3. Joined rows are appended to silver_encounters_demographics.

# Keep the StreamingQuery handle in its own variable: awaitTermination() called
# without a timeout returns None, so chaining it onto the builder would throw the
# handle away and bind None instead.
# `toTable` is the documented DataStreamWriter API for starting a stream into a table.
query = (
    joined.writeStream
          .outputMode("append")
          .option("checkpointLocation", CHECKPOINT_PATH)
          .trigger(availableNow=True)
          .toTable(SILVER_ENCOUNTERS_DEMOGRAPHICS)
)

# Block until all currently available data has been processed. If the query fails,
# awaitTermination() raises, so the success message below is only printed on success.
query.awaitTermination()

print("silver_encounters_demographics refreshed (stream‑static join)")

# NOTE:
# - `availableNow=True` tells Spark to process all available data in micro-batches, then stop.
# - The Encounters side is read incrementally, micro-batch by micro-batch.
# - The Patients side is a static (batch) DataFrame. Per the Databricks stream-static
#   join documentation, when the static side is a Delta table each micro-batch joins
#   against the latest valid version of that table, not a snapshot frozen at job start.
#   NOTE(review): confirm this holds for the runtime in use.
# - The join is stateless: an encounter processed before its patient row exists is
#   written with NULL demographics and is not re-matched by later runs.