In [0]:
# silver_feedback_batch_append.ipynb
# SOURCE: `kardia_bronze.bronze_feedback` (JSONL with audit metadata)
# OUTPUT: `kardia_silver.silver_feedback` (append-only, deduplicated)
# PATTERN: Batch MERGE on `feedback_id` to prevent duplicates
# TRIGGER: Incremental batch job (no SCD needed; feedback is immutable)

# Install kflow from local wheel for use during job execution
%pip install -q --no-deps --no-index --find-links=/dbfs/Shared/libs kflow

from delta.tables import DeltaTable
from pyspark.sql import Window
from pyspark.sql import functions as F
from pyspark.sql.types import MapType, StringType

from kflow.config import bronze_table, silver_paths

# Resolve the Feedback dataset's table identifiers from the kflow config helpers.
# `S` is consumed below through its `.db`, `.path`, and `.table` attributes.
S = silver_paths("feedback")
SRC_TABLE = bronze_table("feedback")  # Bronze table read by the transform cell
TGT_TABLE = S.table  # Silver table created and merged into by the cells below

In [0]:
# 1. Make sure the Silver database and the Feedback Delta table both exist.
#    Both statements use IF NOT EXISTS, so re-running this cell is a no-op
#    once the objects are in place.
create_db_stmt = f"CREATE DATABASE IF NOT EXISTS {S.db}"

# Target schema: the feedback fields plus the audit columns
# (_ingest_ts, _source_file, _batch_id), stored as Delta at S.path.
create_table_stmt = f"""
    CREATE TABLE IF NOT EXISTS {TGT_TABLE} (
        feedback_id        STRING  NOT NULL,
        provider_id        STRING,
        timestamp          TIMESTAMP,
        visit_id           STRING,
        satisfaction_score INT,
        comments           STRING,
        source             STRING,
        tags               ARRAY<STRING>,
        metadata           MAP<STRING,STRING>,
        _ingest_ts         TIMESTAMP,
        _source_file       STRING,
        _batch_id          STRING
    ) USING DELTA
    LOCATION '{S.path}'
    """

# The database must be created before the table that lives in it.
for ddl in (create_db_stmt, create_table_stmt):
    spark.sql(ddl)

In [0]:
# 2. Load Bronze records and prepare for deduplicated append
# Rows without a feedback_id cannot participate in the key-based MERGE,
# so they are filtered out up front.
bronze_df = (
    spark.table(SRC_TABLE)
         .filter(F.col("feedback_id").isNotNull())
)

# Rank the copies of each feedback_id so the survivor is chosen by an explicit
# rule instead of dropDuplicates(), which keeps an arbitrary row per key.
# The earliest-ingested copy wins, which matches the insert-only MERGE below
# (an id already in Silver is never overwritten by a later arrival).
# _source_file and _batch_id only break ties between equal _ingest_ts values.
# NOTE(review): rows that tie on all three audit columns are still ordered
# arbitrarily — presumably those are byte-identical replays; confirm.
dedup_window = (
    Window.partitionBy("feedback_id")
          .orderBy(
              F.col("_ingest_ts").asc_nulls_last(),
              F.col("_source_file").asc_nulls_last(),
              F.col("_batch_id").asc_nulls_last(),
          )
)

silver_src = (
    bronze_df
        # Cast the Bronze timestamp to TIMESTAMP to match the Silver column type.
        .withColumn("timestamp", F.to_timestamp("timestamp"))
        # Parse the raw JSON string into MAP<STRING,STRING>; from_json yields
        # NULL for strings it cannot parse rather than failing the job.
        .withColumn("metadata", F.from_json("metadata_json", MapType(StringType(), StringType())))
        .withColumn("_dedup_rank", F.row_number().over(dedup_window))
        .filter(F.col("_dedup_rank") == 1)
        # Project exactly the Silver columns, in table order; this also drops
        # the helper rank column and the raw metadata_json.
        .select(
            "feedback_id",
            "provider_id",
            "timestamp",
            "visit_id",
            "satisfaction_score",
            "comments",
            "source",
            "tags",
            "metadata",
            "_ingest_ts",
            "_source_file",
            "_batch_id"
        )
)

In [0]:
# 3. Insert-only MERGE into Silver: a source row is written only when its
#    feedback_id has no match in the target; matched rows are left untouched.
silver_table = DeltaTable.forName(spark, TGT_TABLE)

insert_new_only = (
    silver_table.alias("t")
    .merge(silver_src.alias("s"), "t.feedback_id = s.feedback_id")
    .whenNotMatchedInsertAll()
)

insert_new_only.execute()

In [0]:
# 4. Sanity check: report the Silver row count and show the five most
#    recently ingested records (ordered by _ingest_ts, newest first).
df = spark.table(TGT_TABLE)

row_count = df.count()
print(f"Silver Feedback row count: {row_count:,}")

latest_rows = df.orderBy(F.desc("_ingest_ts")).limit(5)
display(latest_rows)