In [None]:
# 03_silver_patient_encounters_join.ipynb
# Joins the Silver encounters stream with the static Silver patients table to build an enriched fact table.

from pyspark.sql import SparkSession, functions as F

# Checkpoint directory passed to the streaming writer as its "checkpointLocation" option.
CHECKPOINT_PATH = "dbfs:/kardia/_checkpoints/silver_patient_encounters"
# Fully qualified name of the target table that the joined stream is written to.
SILVER_PATIENTS_ENCOUNTERS = "kardia_silver.silver_patient_encounters"

# Minimize shuffle overhead for small test datasets
# NOTE(review): `spark` is not created in this notebook; presumably the runtime
# provides the session as a global — confirm before running elsewhere.
# NOTE(review): a single shuffle partition is presumably intended for test-sized
# data only; revisit this value before running against larger volumes.
spark.conf.set("spark.sql.shuffle.partitions", "1")

In [None]:
# 1. Load inputs
# Encounters are read as a streaming source so each run consumes only new rows.
enc_stream = (
    spark.readStream
         .table("kardia_silver.silver_encounters")
)
# Patients are read as an ordinary (batch) DataFrame and serve as the lookup side of the join.
patients_df = (
    spark.read
         .table("kardia_silver.silver_patients")
)

In [0]:
# 2. Join and project columns
# Stream-static LEFT join: every encounter row from the stream is kept; the
# patient attributes are NULL when no patients row matches on e.PatientID == p.ID.
joined = (
    enc_stream.alias("e")
        .join(
            # Broadcast hint: marks the patients DataFrame as small enough for a broadcast join.
            F.broadcast(patients_df).alias("p"),
            F.col("e.PatientID") == F.col("p.ID"),
            "left",
        )
        .select(
            # NOTE(review): these names are unqualified; presumably they exist only on
            # the encounters side — confirm silver_patients has no column with the same
            # name, otherwise the reference becomes ambiguous.
            "EncounterID", "PatientID",
            "START_TS", "STOP",
            "CODE", "BASE_ENCOUNTER_COST", "TOTAL_CLAIM_COST",
            # Patient attributes are taken as-is. The previous coalesce(col, NULL)
            # wrapper was a no-op (it returns col unchanged), so it has been removed;
            # unmatched encounters still yield NULL here because of the left join.
            F.col("p.GENDER").alias("GENDER"),
            F.col("p.BIRTH_YEAR").alias("BIRTH_YEAR"),
        )
)

In [None]:
# 3. Write result to Delta
# Configure the sink first: append-only Delta output, with progress tracked in
# CHECKPOINT_PATH so a later run resumes where the previous one stopped.
delta_writer = (
    joined.writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", CHECKPOINT_PATH)
    # availableNow: process the data available at start, then stop the query.
    .trigger(availableNow=True)
)
# Start the query against the target table; the returned query handle is the cell's output.
delta_writer.toTable(SILVER_PATIENTS_ENCOUNTERS)