In [None]:
# 02_silver_claims_scd1_batch.ipynb
# SOURCE: kardia_bronze.bronze_claims (CDF ON)
# TARGET: kardia_silver.silver_claims_current (SCD‑1)
# TRIGGER: availableNow (one‑shot incremental batch)

from pyspark.sql import functions as F, Window
from delta.tables import DeltaTable

# Database / table identifiers. The Silver table name is derived from
# SILVER_DB so the two cannot drift apart.
SILVER_DB = "kardia_silver"
SILVER_CLAIMS_TBL = f"{SILVER_DB}.silver_claims_current"
BRONZE_CLAIMS_TBL = "kardia_bronze.bronze_claims"

# Location handed to the stream writer's checkpointLocation option.
CHECKPOINT_PATH = "dbfs:/kardia/_checkpoints/silver_claims"

# Change-feed row types that get upserted: inserts and update post-images.
CHANGE_TYPES = (
    "insert",
    "update_postimage",
)

In [None]:
# 1. Ensure Silver DB & target table exist.
#    We declare the Silver schema explicitly (including _ingest_ts) so downstream
#    code is stable even if Bronze evolves.

spark.sql(f"CREATE DATABASE IF NOT EXISTS {SILVER_DB}")

spark.sql(
    f"""
    CREATE TABLE IF NOT EXISTS {SILVER_CLAIMS_TBL} (
        ClaimID               STRING,
        PatientID             STRING,
        ProviderID            STRING,
        ClaimAmount           DOUBLE,
        ClaimDate             DATE,
        DiagnosisCode         STRING,
        ProcedureCode         STRING,
        ClaimStatus           STRING,
        ClaimType             STRING,
        ClaimSubmissionMethod STRING,
        _ingest_ts            TIMESTAMP
    ) USING DELTA
    """
)

# Data-quality CHECK constraints.
# Delta Lake documents CHECK constraints as managed through
# ALTER TABLE ... ADD CONSTRAINT; an inline CHECK clause inside CREATE TABLE is
# not accepted by every Delta/Databricks runtime, so they are attached here.
_silver_claims_checks = {
    "claimid_nn": "ClaimID IS NOT NULL",
    "claim_amount_nonneg": "ClaimAmount >= 0",
}

# ADD CONSTRAINT raises when the name already exists. Delta records each CHECK
# constraint as a `delta.constraints.<name>` table property, so look there first
# to keep this cell safe to re-run.
_silver_claims_props = {
    row["key"].lower()
    for row in spark.sql(f"SHOW TBLPROPERTIES {SILVER_CLAIMS_TBL}").collect()
}

for _check_name, _check_expr in _silver_claims_checks.items():
    if f"delta.constraints.{_check_name}" not in _silver_claims_props:
        spark.sql(
            f"ALTER TABLE {SILVER_CLAIMS_TBL} "
            f"ADD CONSTRAINT {_check_name} CHECK ({_check_expr})"
        )


In [None]:
# 2. foreachBatch upsert
def upsert_to_silver_claims(batch_df, _):
    """Merge one micro-batch of Bronze change-feed rows into the Silver table.

    Steps, in order:
      1. Keep only rows whose ``_change_type`` is listed in ``CHANGE_TYPES``.
      2. For each ``ClaimID`` keep the single row with the highest
         ``_commit_version``.
      3. Drop the helper rank column and the change-feed metadata columns.
      4. Convert ``ClaimDate`` with ``to_date`` and set ``_ingest_ts`` to the
         current timestamp.
      5. MERGE into ``SILVER_CLAIMS_TBL`` on ``ClaimID``: update every column
         when the key matches, insert the row otherwise.

    Args:
        batch_df: DataFrame passed in by ``foreachBatch`` for this micro-batch.
        _: Second positional argument supplied by ``foreachBatch``; unused.
    """
    # Newest commit first within each claim, so rank 1 is the latest image.
    newest_first = Window.partitionBy("ClaimID").orderBy(F.desc("_commit_version"))

    wanted_changes = batch_df.where(F.col("_change_type").isin(*CHANGE_TYPES))
    ranked = wanted_changes.withColumn("rn", F.row_number().over(newest_first))
    latest_per_claim = ranked.where(F.col("rn") == 1).drop(
        "rn", "_change_type", "_commit_version", "_commit_timestamp"
    )

    silver_rows = latest_per_claim.withColumn(
        "ClaimDate", F.to_date("ClaimDate")
    ).withColumn("_ingest_ts", F.current_timestamp())

    # SCD-1 upsert: overwrite matching claims, insert new ones.
    silver_tbl = DeltaTable.forName(spark, SILVER_CLAIMS_TBL)
    merge_builder = silver_tbl.alias("t").merge(
        silver_rows.alias("s"), "t.ClaimID = s.ClaimID"
    )
    merge_builder.whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()

In [None]:
# 3. Run incremental batch stream

# Source: streaming read of the Bronze table with the change feed turned on.
bronze_changes = (
    spark.readStream
    .format("delta")
    .option("readChangeFeed", "true")
    .table(BRONZE_CLAIMS_TBL)
)

# Sink: every micro-batch is handed to upsert_to_silver_claims; progress is
# tracked at CHECKPOINT_PATH. The availableNow trigger makes this a one-shot
# incremental run rather than a continuously running stream.
silver_claims_query = (
    bronze_changes.writeStream
    .foreachBatch(upsert_to_silver_claims)
    .option("checkpointLocation", CHECKPOINT_PATH)
    .trigger(availableNow=True)
    .start()
)

# Block until the run has finished before reporting the row count.
silver_claims_query.awaitTermination()

print(f"Silver claims row count: {spark.table(SILVER_CLAIMS_TBL).count()}")