In [None]:
%md
### Kardiaflow - Silver Feedback (Append-only)

**Source:** `kardia_bronze.bronze_feedback` (JSONL with audit metadata)

**Target:** `kardia_silver.silver_feedback` (append-only)

**Pattern:** Deduplicate within batch by `feedback_id`; MERGE with insert-only semantics

**Trigger:** Incremental batch job

**Notes:** Feedback records are immutable, so slowly changing dimension (SCD) handling is not required.

In [0]:
from delta.tables import DeltaTable
from pyspark.sql import functions as F
from pyspark.sql.types import MapType, StringType
from pyspark.sql.window import Window

from kflow.config import bronze_table, silver_paths
from kflow.notebook_utils import init, show_history

# Run the kflow notebook init helper before anything else in this notebook.
init()

# Resolve the Bronze source table and the Silver target for the Feedback dataset.
DATASET = "feedback"
S = silver_paths(DATASET)          # exposes .db, .table and .path (all used in later cells)
SRC_TABLE = bronze_table(DATASET)
TGT_TABLE = S.table

In [0]:
# 1. Make sure the Silver database and the Feedback Delta table are present.
#    Both statements use IF NOT EXISTS, so re-running this cell is a no-op
#    once the objects exist.
create_db_sql = f"CREATE DATABASE IF NOT EXISTS {S.db}"

# Column layout of the Silver Feedback table; data files live at S.path.
create_table_sql = f"""
    CREATE TABLE IF NOT EXISTS {TGT_TABLE} (
        feedback_id        STRING  NOT NULL,
        provider_id        STRING,
        timestamp          TIMESTAMP,
        visit_id           STRING,
        satisfaction_score INT,
        comments           STRING,
        source             STRING,
        tags               ARRAY<STRING>,
        metadata           MAP<STRING,STRING>,
        _ingest_ts         TIMESTAMP,
        _source_file       STRING,
        _batch_id          STRING
    ) USING DELTA
    LOCATION '{S.path}'
    """

# The database must exist before the table that lives in it, so order matters.
for ddl_statement in (create_db_sql, create_table_sql):
    spark.sql(ddl_statement)

In [0]:
# 2. Load Bronze records and reduce them to a single row per feedback_id.
#    Note: spark.table() reads the whole Bronze table on every run; rows with
#    a NULL feedback_id are dropped because the Silver column is NOT NULL.
bronze_df = (
    spark.table(SRC_TABLE)
         .filter(F.col("feedback_id").isNotNull())
)

# Parse types and project the columns that make up the Silver schema:
#   - `timestamp` is converted with to_timestamp()
#   - `metadata_json` is parsed into a MAP<STRING,STRING> column named `metadata`
base_src = (
    bronze_df
      .withColumn("timestamp", F.to_timestamp("timestamp"))
      .withColumn("metadata", F.from_json("metadata_json", MapType(StringType(), StringType())))
      .select(
          "feedback_id",
          "provider_id",
          "timestamp",
          "visit_id",
          "satisfaction_score",
          "comments",
          "source",
          "tags",
          "metadata",
          "_ingest_ts",
          "_source_file",
          "_batch_id"
      )
)

# Rank the rows of each feedback_id so that the latest one gets row number 1.
# Primary order is event `timestamp`, then `_ingest_ts`. The audit columns
# `_source_file` and `_batch_id` are added as tie-breakers: row_number() over
# rows that tie on every ordering key picks an arbitrary winner, which could
# change from run to run.
w_latest = Window.partitionBy("feedback_id").orderBy(
      F.col("timestamp").desc_nulls_last(),
      F.col("_ingest_ts").desc_nulls_last(),
      F.col("_source_file").desc_nulls_last(),
      F.col("_batch_id").desc_nulls_last()
)

# Keep only the top-ranked row per feedback_id and drop the helper column.
deduped_df = (
    base_src
      .withColumn("_rn", F.row_number().over(w_latest))
      .filter(F.col("_rn") == 1)
      .drop("_rn")
)

# Final DataFrame used in MERGE
latest_df = deduped_df

In [0]:
# 3. Insert-only MERGE of the deduplicated feedback rows into the Silver table

# Refresh first so that a table created earlier in this run is visible to the engine
spark.sql(f"REFRESH TABLE {TGT_TABLE}")

# Target is addressed by storage path; "t" = target alias, "s" = source alias.
silver_target = DeltaTable.forPath(spark, S.path).alias("t")
merge_condition = "t.feedback_id = s.feedback_id"

# Only a when-not-matched clause is configured: source rows whose feedback_id
# is absent from the target are inserted, and matched rows are left untouched.
(
    silver_target
        .merge(latest_df.alias("s"), merge_condition)
        .whenNotMatchedInsertAll()
        .execute()
)

In [0]:
# 4. Sanity-check the Silver Feedback table: total rows, then a small preview.
df = spark.table(TGT_TABLE)

row_count = df.count()
print(f"Silver Feedback row count: {row_count:,}")

# Preview the five rows with the most recent _ingest_ts.
newest_first = F.col("_ingest_ts").desc()
display(df.orderBy(newest_first).limit(5))

# Hand the table path to the kflow show_history helper.
show_history(S.path)