In [0]:
# 02_silver_claims_current.ipynb
# SOURCE:  kardia_bronze.bronze_claims  (Delta CDF ON)
# OUTPUT:  kardia_silver.silver_claims_current  (latest row per ClaimID)
# TRIGGER: Batch job; keeps progress in kardia_silver.__pipeline_metadata.

from delta.tables import DeltaTable
from pyspark.sql  import functions as F, Window

# Table paths
# Fully-qualified table names used by every cell below, plus the key that
# identifies this pipeline's row in the progress (metadata) table.

# Source table; read through Delta Change Data Feed.
BRONZE_TABLE = "kardia_bronze.bronze_claims"
# Target table; receives the MERGE upsert keyed on ClaimID.
SILVER_TABLE = "kardia_silver.silver_claims_current"
# Progress table with columns (key, last_version).
META_TABLE   = "kardia_silver.__pipeline_metadata"
# Value of `key` under which this notebook stores its last processed Bronze version.
META_KEY     = "claims_last_version"

In [0]:
# 1. Bootstrap the Silver database and the progress (metadata) table.
#    Both statements use IF NOT EXISTS, so re-running this cell is a no-op
#    once the objects are in place.
spark.sql("CREATE DATABASE IF NOT EXISTS kardia_silver")

# One row per pipeline key; last_version holds the last Bronze commit processed.
meta_table_ddl = f"""
    CREATE TABLE IF NOT EXISTS {META_TABLE} (
      key          STRING,
      last_version LONG
    ) USING DELTA
    """
spark.sql(meta_table_ddl)

In [0]:
# 2. Look up the last Bronze commit version this pipeline has processed.
#    COALESCE turns "no row for this key yet" into -1, so the first run
#    starts reading at version 0 (last_version + 1).
checkpoint_query = (
    f"SELECT COALESCE(MAX(last_version), -1) AS v "
    f"FROM {META_TABLE} WHERE key = '{META_KEY}'"
)
checkpoint_row = spark.sql(checkpoint_query).first()
last_version = checkpoint_row.v

In [0]:
# 3. Read new inserts/updates from Bronze via CDF
#
# A CDF read whose startingVersion lies beyond the table's latest commit raises
# an error by default instead of returning an empty frame. An idle run (no new
# Bronze commits since the checkpoint) therefore has to be detected up front,
# before the read is attempted.
bronze_latest_version = (
    DeltaTable.forName(spark, BRONZE_TABLE)
              .history(1)            # most recent commit only
              .select("version")
              .first()["version"]
)

if last_version >= bronze_latest_version:
    print("No new changes – exiting.")
    dbutils.notebook.exit("SKIPPED")

cdf_df = (
    spark.read.format("delta")
              .option("readChangeData", "true")
              .option("startingVersion", last_version + 1)
              # Pin the upper bound: cdf_df is lazily re-evaluated by several
              # later actions, and every one of them must see the same commits
              # even if Bronze receives new writes while this notebook runs.
              .option("endingVersion", bronze_latest_version)
              .table(BRONZE_TABLE)
              # Keep only rows that carry a current image of the record.
              .filter("_change_type IN ('insert', 'update_postimage')")
)

# The new commits may contain no insert/update rows at all (e.g. deletes only).
if not cdf_df.head(1):
    print("No new changes – exiting.")
    dbutils.notebook.exit("SKIPPED")

# new_version and claims_df are intentionally not computed here: the
# de-duplication step that follows derives both from the latest row per
# ClaimID, so computing them on the raw change feed was a wasted Spark job.

In [0]:
# 4. De-duplicate the change feed: one row per ClaimID, taken from the highest
#    _commit_version in which that claim appears.
# NOTE: rows for the same ClaimID inside a single commit tie on
# _commit_version; row_number() then picks one of them arbitrarily.
newest_commit_first = (
    Window.partitionBy("ClaimID")
          .orderBy(F.col("_commit_version").desc())
)

ranked_df = cdf_df.withColumn("rn", F.row_number().over(newest_commit_first))
latest_df = ranked_df.filter("rn = 1").drop("rn")

# Highest Bronze commit contained in this batch; persisted as the checkpoint later.
new_version = latest_df.agg(F.max("_commit_version").alias("v")).first().v

# Strip the CDF bookkeeping columns and stamp the rows with the load time.
without_cdf_cols_df = latest_df.drop(
    "_change_type", "_commit_version", "_commit_timestamp"
)
claims_df = without_cdf_cols_df.withColumn("_ingest_ts", F.current_timestamp())

In [0]:
# 5. First run only: create the Silver table as an empty Delta table that has
#    the schema of claims_df, so that the MERGE has a target to write into.
silver_exists = spark.catalog.tableExists(SILVER_TABLE)

if not silver_exists:
    (
        claims_df.limit(0)  # schema only, no rows
                 .write
                 .format("delta")
                 .option("mergeSchema", "true")
                 .saveAsTable(SILVER_TABLE)
    )

In [0]:
# 6. SCD-1 upsert into Silver: a ClaimID that already exists is overwritten
#    column-for-column, an unseen ClaimID is inserted.
silver_delta = DeltaTable.forName(spark, SILVER_TABLE)

claims_upsert = (
    silver_delta.alias("t")
                .merge(claims_df.alias("s"), "t.ClaimID = s.ClaimID")
                .whenMatchedUpdateAll()
                .whenNotMatchedInsertAll()
)
claims_upsert.execute()

In [0]:
# 7. Record the checkpoint: upsert new_version under META_KEY so that the next
#    run starts reading the Bronze change feed right after it.
checkpoint_merge_sql = f"""
    MERGE INTO {META_TABLE} t
    USING (SELECT '{META_KEY}' AS key, {new_version} AS last_version) s
    ON t.key = s.key
    WHEN MATCHED THEN UPDATE SET last_version = s.last_version
    WHEN NOT MATCHED THEN INSERT *
    """
spark.sql(checkpoint_merge_sql)

print(f"Processed Bronze through version {new_version}.")