In [0]:
# 02_silver_transform_encounters.ipynb
# -------------------------------------------------------
# Streams new rows from bronze_encounters into silver_encounters via CDF.

from pyspark.sql import SparkSession, functions as F

# Pipeline configuration: source table, target table, and checkpoint location
BRONZE_TBL = "kardia_bronze.bronze_encounters"  # table the change feed is read from
SILVER_TBL = "kardia_silver.silver_encounters"  # table the stream is written to
CHKPT_LOC = "dbfs:/kardia/_checkpoints/silver_encounters"  # streaming checkpoint directory

In [0]:
# Spark session for this notebook, configured step by step on the builder.
_session_builder = SparkSession.builder.appName("silver_encounters_stream")
# Shuffle partition count is pinned to 1
# (presumably sized for a small/dev workload; confirm before running at scale).
_session_builder = _session_builder.config("spark.sql.shuffle.partitions", "1")
spark = _session_builder.getOrCreate()

In [0]:
# 1. Read the Bronze table's change data feed as a stream

# Keep only change rows that carry a current row image: fresh inserts and the
# post-update image of updated rows.
_is_wanted_change = F.col("_change_type").isin("insert", "update_postimage")

# NOTE(review): dropDuplicates(["ID"]) runs without a watermark, so the
# deduplication state is not bounded. It also looks like a later
# update_postimage row for an ID that was already seen would be discarded,
# so updates may never reach Silver. Confirm this is intended.
bronze_cdf = (
    spark.readStream
    .format("delta")
    .option("readChangeFeed", "true")
    .table(BRONZE_TBL)
    .where(_is_wanted_change)
    .dropDuplicates(["ID"])
)

In [0]:
# 2. Shape the Bronze change rows into the 10-column Silver layout

# Final Silver columns, in output order. The select below pins the schema to
# exactly these, which also drops every other column carried by the stream.
_SILVER_COLUMNS = [
    "EncounterID", "PatientID", "START", "STOP",
    "CODE", "DESCRIPTION",
    "BASE_ENCOUNTER_COST", "TOTAL_CLAIM_COST",
    "REASONCODE", "REASONDESCRIPTION",
]

# Rename the Bronze identifiers and date to the Silver column names.
_encounters_renamed = (
    bronze_cdf
    .withColumnRenamed("ID", "EncounterID")
    .withColumnRenamed("PATIENT", "PatientID")
    .withColumnRenamed("DATE", "START")
)

# Add the columns the Silver schema requires, filled with placeholder values:
# a null timestamp for STOP and 0.0 for both cost columns.
_encounters_with_defaults = (
    _encounters_renamed
    .withColumn("STOP", F.lit(None).cast("timestamp"))
    .withColumn("BASE_ENCOUNTER_COST", F.lit(0.0).cast("double"))
    .withColumn("TOTAL_CLAIM_COST", F.lit(0.0).cast("double"))
)

silver_ready = _encounters_with_defaults.select(*_SILVER_COLUMNS)

In [0]:
# 3. Write to Silver (append-only)
# The availableNow trigger processes the data available when the query starts
# and then stops the query on its own. The checkpoint at CHKPT_LOC records
# progress between runs.
silver_query = (silver_ready.writeStream
     .format("delta")
     .option("checkpointLocation", CHKPT_LOC)
     .outputMode("append")
     .trigger(availableNow=True)
     .toTable(SILVER_TBL))

# toTable() starts the query and returns immediately. Block here until the
# run has finished so later cells/tasks never read a half-written Silver
# table, and so a failed stream raises in this cell instead of going unnoticed.
silver_query.awaitTermination()