In [None]:
%md
### Kardiaflow - Silver Claims (SCD1)

**Source:** `kardia_bronze.bronze_claims` (CDF)

**Target:** `kardia_silver.silver_claims` (SCD1 upsert)

**Pattern:** Read changed rows via CDF, keep the latest change per `claim_id`, then MERGE into Silver to update existing claims or insert new ones (SCD1)

**Trigger:** One-shot incremental batch (`availableNow`)

In [0]:
from delta.tables import DeltaTable
from pyspark.sql import functions as F
from pyspark.sql.window import Window

from kflow.config import bronze_table, CHANGE_TYPES, silver_paths
from kflow.notebook_utils import init, show_history

init()

# Resolve the Bronze source and Silver target for a single dataset key
DATASET   = "claims"
S         = silver_paths(DATASET)
SRC_TABLE = bronze_table(DATASET)
TGT_TABLE = S.table

In [0]:
# 1. Make sure the Silver database and the Claims table are in place.
#    The column list is spelled out here (including _ingest_ts) so the Silver
#    schema stays decoupled from Bronze schema drift.
spark.sql(f"CREATE DATABASE IF NOT EXISTS {S.db}")

create_claims_ddl = f"""
    CREATE TABLE IF NOT EXISTS {TGT_TABLE} (
        claim_id                STRING NOT NULL,
        patient_id              STRING,
        provider_id             STRING,
        claim_amount            DOUBLE,
        claim_date              DATE,
        diagnosis_code          STRING,
        procedure_code          STRING,
        claim_status            STRING,
        claim_type              STRING,
        claim_submission_method STRING,
        _ingest_ts              TIMESTAMP,
        _batch_id               STRING,
        _source_file            STRING
    ) USING DELTA
    LOCATION '{S.path}'
    """
spark.sql(create_claims_ddl)

In [0]:
# 2. Define upsert logic to apply SCD1 updates to Silver Claims
def upsert_to_silver(batch_df, _):
    """Apply one micro-batch of Bronze Claims change rows to Silver (SCD1).

    Args:
        batch_df: Micro-batch DataFrame passed in by ``foreachBatch``. It must
            carry the Bronze claim columns selected below plus the change-feed
            columns ``_change_type``, ``_commit_version`` and
            ``_commit_timestamp``.
        _: Micro-batch id passed in by ``foreachBatch``; not used.

    Returns:
        None. The effect is a MERGE into ``TGT_TABLE`` keyed on ``claim_id``:
        matched rows are overwritten, unmatched rows are inserted.
    """
    # Keep only the change types listed in CHANGE_TYPES and drop rows without
    # a claim key, since they cannot be matched in the MERGE.
    filtered = (
        batch_df
          .filter(F.col("_change_type").isin(*CHANGE_TYPES))
          .filter(F.col("ClaimID").isNotNull())
    )

    # Standardize column names and types to the Silver schema. The two commit
    # columns are carried along only to rank versions below.
    renamed = (
        filtered.select(
            F.col("ClaimID").alias("claim_id"),
            F.col("PatientID").alias("patient_id"),
            F.col("ProviderID").alias("provider_id"),
            F.col("ClaimAmount").cast("double").alias("claim_amount"),
            F.to_date("ClaimDate").alias("claim_date"),
            F.col("DiagnosisCode").alias("diagnosis_code"),
            F.col("ProcedureCode").alias("procedure_code"),
            F.col("ClaimStatus").alias("claim_status"),
            F.col("ClaimType").alias("claim_type"),
            F.col("ClaimSubmissionMethod").alias("claim_submission_method"),
            F.col("_ingest_ts"),
            F.col("_batch_id"),
            F.col("_source_file"),
            F.col("_commit_version"),
            F.col("_commit_timestamp")
        )
    )

    # Retain only the latest version per claim_id in this micro-batch.
    # MERGE needs at most one source row per target row, so this step is
    # required, not just an optimization.
    # NOTE(review): rows for the same claim_id that tie on both ordering
    # columns are picked arbitrarily by row_number(); confirm Bronze cannot
    # emit such duplicates within a single commit.
    w_latest = (
        Window.partitionBy("claim_id")
              .orderBy(
                  F.col("_commit_version").desc(),
                  F.col("_commit_timestamp").desc()
              )
    )

    latest_df = (
        renamed
          .withColumn("rn", F.row_number().over(w_latest))
          .filter("rn = 1")
          .drop("rn", "_commit_version", "_commit_timestamp")
    )

    # Upsert into Silver (update existing Claims, insert new ones).
    # Use the session attached to the micro-batch instead of the notebook-global
    # `spark`: the callback then has no hidden dependency on notebook state and
    # keeps working where the function is serialized and run elsewhere.
    (
        DeltaTable.forName(batch_df.sparkSession, TGT_TABLE)
            .alias("t")
            .merge(latest_df.alias("s"), "t.claim_id = s.claim_id")
            .whenMatchedUpdateAll()
            .whenNotMatchedInsertAll()
            .execute()
    )

In [0]:
# 3. Drain all pending Bronze changes once and upsert them into Silver
bronze_changes = (
    spark.readStream
         .format("delta")
         .option("readChangeFeed", "true")
         # Begin at v1 (v0 is table creation/metadata); after the first run
         # the checkpoint tracks progress.
         .option("startingVersion", 1)
         .table(SRC_TABLE)
)

silver_query = (
    bronze_changes.writeStream
                  .foreachBatch(upsert_to_silver)
                  .option("checkpointLocation", S.checkpoint)
                  .trigger(availableNow=True)
                  .start()
)

# Block the cell until the one-shot run has finished.
silver_query.awaitTermination()

In [0]:
# 4. Sanity-check the Silver Claims table: row count, newest rows, Delta history.
df = spark.table(TGT_TABLE)

row_count = df.count()
print(f"Silver Claims row count: {row_count:,}")

# Most recently ingested rows first; rows with a NULL _ingest_ts sort last.
newest_first = F.col("_ingest_ts").desc_nulls_last()
display(df.orderBy(newest_first).limit(5))

show_history(S.path)