In [0]:
# 02_silver_claims_scd1_batch.ipynb
# SOURCE: kardia_bronze.bronze_claims (CDF ON)
# TARGET: kardia_silver.silver_claims (SCD‑1)
# TRIGGER: availableNow (one‑shot incremental batch)

from pyspark.sql import functions as F, Window
from delta.tables import DeltaTable

# --- Databases / tables -----------------------------------------------------
SILVER_DB = "kardia_silver"
BRONZE_CLAIMS_TBL = "kardia_bronze.bronze_claims"          # source (change feed is read from here)
SILVER_CLAIMS_TBL = f"{SILVER_DB}.silver_claims"           # merge target

# --- Streaming state --------------------------------------------------------
CHECKPOINT_PATH = "dbfs:/kardia/_checkpoints/silver_claims"

# --- Change-feed filter -----------------------------------------------------
# Only these `_change_type` values are propagated to Silver.
CHANGE_TYPES = ("insert", "update_postimage")

In [0]:
# 1. Ensure Silver DB & target table exist.
#    We declare the Silver schema explicitly (including _ingest_ts) so downstream
#    code is stable even if Bronze evolves.
#    Both statements are IF NOT EXISTS, so re-running this cell is a no-op once
#    the database and table are in place.

spark.sql(f"CREATE DATABASE IF NOT EXISTS {SILVER_DB}")

# The table name is taken from SILVER_CLAIMS_TBL (not hard-coded) so this DDL
# and the MERGE that targets SILVER_CLAIMS_TBL always refer to the same table.
spark.sql(
    f"""
    CREATE TABLE IF NOT EXISTS {SILVER_CLAIMS_TBL} (
        claim_id                STRING  NOT NULL,
        patient_id              STRING,
        provider_id             STRING,
        claim_amount            DOUBLE,
        claim_date              DATE,
        diagnosis_code          STRING,
        procedure_code          STRING,
        claim_status            STRING,
        claim_type              STRING,
        claim_submission_method STRING,
        _ingest_ts              TIMESTAMP
    ) USING DELTA
    """
)

In [0]:
# 2. foreachBatch upsert (rename to snake_case and cast types)
def upsert_to_silver_claims(batch_df, _):
    """Upsert one micro-batch of Bronze change-feed rows into the Silver table.

    Processing steps:
      1. Keep only rows whose ``_change_type`` is listed in ``CHANGE_TYPES``.
      2. Rename the Bronze columns to snake_case, casting ``ClaimAmount`` to
         double and ``ClaimDate`` to a date.
      3. Keep a single row per ``claim_id``: the one with the highest
         ``_commit_version`` in this micro-batch.
      4. MERGE into ``SILVER_CLAIMS_TBL`` on ``claim_id`` (update all columns
         when matched, insert otherwise).

    Args:
        batch_df: Micro-batch DataFrame passed in by ``foreachBatch``. It must
            carry the change-feed columns ``_change_type`` and
            ``_commit_version`` plus the Bronze claim columns selected below.
        _: Batch id passed in by ``foreachBatch``; not used.

    Returns:
        None. The only effect is the MERGE into the Silver table.
    """
    # Use the session that owns this micro-batch rather than the notebook-global
    # `spark`; inside foreachBatch the global may not be the right session
    # (e.g. on shared / Spark Connect clusters).
    session = batch_df.sparkSession

    # Keep only inserts & update_postimage rows from the change feed
    changes = batch_df.filter(F.col("_change_type").isin(*CHANGE_TYPES))

    # Rename columns + cast. Only `_commit_version` is carried along from the
    # change-feed metadata, because it is the only one needed after the filter.
    renamed = changes.select(
        F.col("ClaimID").alias("claim_id"),
        F.col("PatientID").alias("patient_id"),
        F.col("ProviderID").alias("provider_id"),
        F.col("ClaimAmount").cast("double").alias("claim_amount"),
        F.to_date("ClaimDate").alias("claim_date"),
        F.col("DiagnosisCode").alias("diagnosis_code"),
        F.col("ProcedureCode").alias("procedure_code"),
        F.col("ClaimStatus").alias("claim_status"),
        F.col("ClaimType").alias("claim_type"),
        F.col("ClaimSubmissionMethod").alias("claim_submission_method"),
        F.col("_ingest_ts"),
        F.col("_commit_version"),
    )

    # Keep the latest post-image per claim_id in this micro-batch.
    # `_ingest_ts` is a secondary sort key so that, when a single commit holds
    # several rows for the same claim_id, the choice is not left to chance.
    # NOTE(review): assumes a larger _ingest_ts means a more recent record - confirm.
    w_latest = Window.partitionBy("claim_id").orderBy(
        F.col("_commit_version").desc(),
        F.col("_ingest_ts").desc(),
    )

    latest = (
        renamed
        .withColumn("rn", F.row_number().over(w_latest))
        .filter(F.col("rn") == 1)
        .drop("rn", "_commit_version")
    )

    # Merge into Silver: one source row per claim_id, so each target row is
    # matched at most once.
    (DeltaTable.forName(session, SILVER_CLAIMS_TBL)
        .alias("t")
        .merge(latest.alias("s"), "t.claim_id = s.claim_id")
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
        .execute())

In [0]:
# 3. Run incremental batch stream
# Streaming read of the Bronze table's change feed.
bronze_cdf = (
    spark.readStream
    .format("delta")
    .option("readChangeFeed", "true")
    .table(BRONZE_CLAIMS_TBL)
)

# Hand every micro-batch to the upsert function; progress is tracked in the
# checkpoint, and the availableNow trigger makes this a one-shot run.
silver_claims_query = (
    bronze_cdf.writeStream
    .foreachBatch(upsert_to_silver_claims)
    .option("checkpointLocation", CHECKPOINT_PATH)
    .trigger(availableNow=True)
    .start()
)

# Block the cell until the query has stopped.
silver_claims_query.awaitTermination()

In [None]:
# 4. Stream finished – Verify Silver Claims table row count and checkpoint path.

# Read the Silver table back and count its rows.
silver_df = spark.table(SILVER_CLAIMS_TBL)
count = silver_df.count()

# Two HTML banners: the row count, then the checkpoint location that was used.
for html in (
    f"<div style='color:green; font-weight:bold'>Silver row count: {count}</div>",
    f"<div style='color:gray'>Checkpoint: {CHECKPOINT_PATH}</div>",
):
    displayHTML(html)

# Small sample of rows for a visual sanity check.
display(silver_df.limit(5))