In [0]:
# 02_silver_claims_current_min.py
# SOURCE : kardia_bronze.bronze_claims   (CDF ON)
# TARGET : kardia_silver.silver_claims_current  (SCD‑1)
# TRIGGER: availableNow (one‑shot incremental batch)

from pyspark.sql import functions as F, Window
from delta.tables import DeltaTable

# Fully-qualified Delta table that receives the merged (current-state) claims.
SILVER_CLAIMS_TBL = "kardia_silver.silver_claims_current"
# Fully-qualified Bronze table whose change data feed is streamed from.
BRONZE_CLAIMS_TBL = "kardia_bronze.bronze_claims"

# Checkpoint directory handed to the stream writer as `checkpointLocation`.
CHECKPOINT_PATH = "dbfs:/kardia/_checkpoints/silver_claims"

In [0]:
# 1. Ensure Silver DB & target table exist.
#    We declare the Silver schema explicitly (adds _ingest_ts) so downstream
#    code is stable even if Bronze evolves.

# The database must exist before the schema-qualified table can be created.
spark.sql("CREATE DATABASE IF NOT EXISTS kardia_silver")

# Explicit column list for the Silver target. IF NOT EXISTS makes re-running
# this cell a no-op once the table is in place.
silver_claims_ddl = f"""
CREATE TABLE IF NOT EXISTS {SILVER_CLAIMS_TBL} (
  ClaimID               STRING,
  PatientID             STRING,
  ProviderID            STRING,
  ClaimAmount           DOUBLE,
  ClaimDate             DATE,
  DiagnosisCode         STRING,
  ProcedureCode         STRING,
  ClaimStatus           STRING,
  ClaimType             STRING,
  ClaimSubmissionMethod STRING,
  _ingest_ts            TIMESTAMP
) USING DELTA
"""
spark.sql(silver_claims_ddl)


In [0]:
# 2. foreachBatch upsert
def upsert(batch_df, _):
    """Merge one micro-batch of Bronze change-feed rows into the Silver table.

    Intended for use as a ``foreachBatch`` sink. For each ``ClaimID`` in the
    batch, the row with the highest ``_commit_version`` among inserts and
    update post-images is upserted into ``SILVER_CLAIMS_TBL`` (update when the
    key already exists, insert otherwise).

    Args:
        batch_df: Micro-batch DataFrame read from the Bronze change data feed.
            Must carry ``ClaimID``, ``ClaimDate`` and the change-feed columns
            ``_change_type``, ``_commit_version`` and ``_commit_timestamp``.
        _: Batch id supplied by Structured Streaming; unused.

    Notes:
        * ``delete`` and ``update_preimage`` rows are filtered out, so deletes
          in Bronze are not propagated to Silver.
        * If several qualifying rows for one ``ClaimID`` share the same
          ``_commit_version``, ``row_number`` picks one of them arbitrarily.
    """
    # Use the session attached to the micro-batch instead of the notebook-global
    # `spark`: inside foreachBatch the global may not be usable (e.g. Spark
    # Connect / shared-access compute), while the batch's own session always is.
    session = batch_df.sparkSession

    # Newest commit first within each claim, so rank 1 is the latest image.
    newest_first = Window.partitionBy("ClaimID").orderBy(F.col("_commit_version").desc())

    latest = (batch_df
              # Keep only rows that represent a new current state.
              .filter("_change_type IN ('insert','update_postimage')")
              .withColumn("rn", F.row_number().over(newest_first))
              .filter("rn = 1")
              # Drop the helper rank and the change-feed metadata columns.
              .drop("rn", "_change_type", "_commit_version", "_commit_timestamp")
              .withColumn("ClaimDate", F.to_date("ClaimDate"))
              # Stamp every row with the time this batch was processed.
              .withColumn("_ingest_ts", F.current_timestamp()))

    # SCD-1 upsert: overwrite matching claims, insert unseen ones.
    (DeltaTable.forName(session, SILVER_CLAIMS_TBL)
       .alias("t")
       .merge(latest.alias("s"), "t.ClaimID = s.ClaimID")
       .whenMatchedUpdateAll()
       .whenNotMatchedInsertAll()
       .execute())

In [0]:
# 3. Kick off incremental batch run
# Streaming reader over the Bronze table's change data feed.
bronze_changes = (
    spark.readStream
    .format("delta")
    .option("readChangeFeed", "true")
    .table(BRONZE_CLAIMS_TBL)
)

# availableNow processes whatever is pending and then stops, so the
# awaitTermination() call below returns once the backlog has been merged.
silver_query = (
    bronze_changes.writeStream
    .foreachBatch(upsert)
    .option("checkpointLocation", CHECKPOINT_PATH)
    .trigger(availableNow=True)
    .start()
)
silver_query.awaitTermination()

print(f"Silver row count: {spark.table(SILVER_CLAIMS_TBL).count()}")