In [0]:
# 02_silver_transform_encounters.ipynb
# Stream new rows from Bronze (CDF) into Silver.

from pyspark.sql import SparkSession, functions as F

# Source / target identifiers and the streaming checkpoint location
SILVER_DB  = "kardia_silver"
BRONZE_TBL = "kardia_bronze.bronze_encounters"
SILVER_TBL = SILVER_DB + ".silver_encounters"
CHKPT_LOC  = "dbfs:/kardia/_checkpoints/silver_encounters"

# Small test datasets: a single shuffle partition avoids needless shuffle overhead
spark.conf.set("spark.sql.shuffle.partitions", "1")

In [0]:
# 1A. Create the Silver database when it is not there yet (no-op otherwise)
create_silver_db_sql = f"CREATE DATABASE IF NOT EXISTS {SILVER_DB}"
spark.sql(create_silver_db_sql)

In [0]:
# 1B. Read from Bronze CDF (SCD Type 1)
# Open the Bronze table's change data feed as a streaming source.
bronze_changes = (
    spark.readStream
        .format("delta")
        .option("readChangeFeed", "true")
        .table(BRONZE_TBL)
)

# Only inserts and post-update row images are kept; every other change type is filtered out.
is_current_image = F.col("_change_type").isin("insert", "update_postimage")

# NOTE(review): the watermark is declared on DATE, but deduplication keys on ID
# only. Spark's streaming-deduplication guidance says the event-time column has
# to be part of the dedup keys for old state to be evicted -- confirm whether
# unbounded dedup state is acceptable here.
# NOTE(review): once an ID has been seen, a later update_postimage row for the
# same ID looks like it would be dropped as a duplicate -- confirm this matches
# the intended SCD Type 1 behaviour.
bronze_cdf = (
    bronze_changes
        .where(is_current_image)
        .withWatermark("DATE", "1 day")
        .dropDuplicates(["ID"])
)


In [0]:
# 2. Rename and enrich into 10-column Silver schema

# Bronze column -> Silver column (modelling-friendly names that match the join)
BRONZE_TO_SILVER_NAMES = {
    "ID":      "EncounterID",
    "PATIENT": "PatientID",
    "DATE":    "START",
}

# Columns the Silver schema requires but this step does not read from Bronze;
# each is filled with a constant default (null timestamp / zero cost).
SILVER_DEFAULTS = {
    "STOP":                F.lit(None).cast("timestamp"),
    "BASE_ENCOUNTER_COST": F.lit(0.0).cast("double"),
    "TOTAL_CLAIM_COST":    F.lit(0.0).cast("double"),
}

# Final Silver column order
SILVER_COLUMNS = [
    "EncounterID", "PatientID", "START", "STOP",
    "CODE", "DESCRIPTION",
    "BASE_ENCOUNTER_COST", "TOTAL_CLAIM_COST",
    "REASONCODE", "REASONDESCRIPTION",
]

encounters_renamed = bronze_cdf
for bronze_name, silver_name in BRONZE_TO_SILVER_NAMES.items():
    encounters_renamed = encounters_renamed.withColumnRenamed(bronze_name, silver_name)

encounters_enriched = encounters_renamed
for default_name, default_value in SILVER_DEFAULTS.items():
    encounters_enriched = encounters_enriched.withColumn(default_name, default_value)

# Lock the schema: keep exactly the Silver columns, in this order.
silver_ready = encounters_enriched.select(*SILVER_COLUMNS)


In [0]:
# 3. Write to Silver table (append mode)
# `toTable` only *starts* the streaming query and immediately returns a
# StreamingQuery handle. Keep the handle and block on it, otherwise the
# confirmation below would print while the run is still in progress (or even
# though it failed).
silver_query = (
    silver_ready.writeStream
    .format("delta")
    # NOTE(review): START is the renamed DATE column; its type is not visible
    # here. If it carries a full timestamp (or many distinct dates), partitioning
    # on it yields a very large number of tiny partitions -- confirm this is intended.
    .partitionBy("START")
    .option("checkpointLocation", CHKPT_LOC)
    .outputMode("append")
    # availableNow: process the data that is available right now, then stop.
    .trigger(availableNow=True)
    .toTable(SILVER_TBL)
)

# Blocks until the availableNow run has finished; if the query terminated with
# an exception, it is raised here instead of being silently ignored.
silver_query.awaitTermination()

print(f"Streamed new rows from {BRONZE_TBL} to {SILVER_TBL}")