In [0]:
# 02_silver_feedback_batch_append.ipynb
# SOURCE: kardia_bronze.bronze_feedback (batch JSONL ingest with audit metadata)
# OUTPUT: kardia_silver.silver_feedback (append-only deduplicated)
# PATTERN: Batch MERGE to prevent duplicate feedback_id entries
# NOTE: Feedback is append-only; users don’t update prior records, so SCD1 isn’t needed.

from pyspark.sql import functions as F
from delta.tables import DeltaTable

# Database (schema) that holds this notebook's Silver output.
SILVER_DB = "kardia_silver"

# Fully qualified Bronze input table read by this notebook.
BRONZE_FEEDBACK_TBL = "kardia_bronze.bronze_feedback"

# Fully qualified Silver output table, derived from SILVER_DB so the two stay in sync.
SILVER_FEEDBACK_TBL = f"{SILVER_DB}.silver_feedback"

In [0]:
# 1. Bootstrap the Silver layer.
# Both statements use IF NOT EXISTS, so re-running this cell does not fail
# when the database or the table is already present.
create_db_sql = f"CREATE DATABASE IF NOT EXISTS {SILVER_DB}"

# Target schema: feedback payload columns followed by the audit columns
# (_ingest_ts, _source_file, _batch_id). feedback_id is the only NOT NULL column.
create_table_sql = f"""
CREATE TABLE IF NOT EXISTS {SILVER_FEEDBACK_TBL} (
  feedback_id        STRING  NOT NULL,
  provider_id        STRING,
  timestamp          TIMESTAMP,
  visit_id           STRING,
  satisfaction_score INT,
  comments           STRING,
  source             STRING,
  tags               ARRAY<STRING>,
  metadata           MAP<STRING,STRING>,
  _ingest_ts         TIMESTAMP,
  _source_file       STRING,
  _batch_id          STRING
) USING DELTA
"""

spark.sql(create_db_sql)
spark.sql(create_table_sql)

In [0]:
# 2. Load Bronze feedback records and prepare for Silver
bronze_df = spark.table(BRONZE_FEEDBACK_TBL)

silver_src = (
  bronze_df
    # parse ISO timestamp into proper TIMESTAMP type
    .withColumn("timestamp", F.to_timestamp("timestamp"))
    # select only the columns needed in Silver
    .select(
      "feedback_id",
      "provider_id",
      "timestamp",
      "visit_id",
      "satisfaction_score",
      "comments",
      "source",
      "tags",
      "metadata",
      "_ingest_ts",
      "_source_file",
      "_batch_id"
    )
    # A NULL key can never satisfy the merge condition `t.feedback_id = s.feedback_id`,
    # so such a row would always be treated as "new" and then violate the
    # NOT NULL constraint on silver_feedback.feedback_id, failing the whole batch.
    .filter(F.col("feedback_id").isNotNull())
    # An insert-only MERGE only compares source rows against the target; duplicates
    # *within* the source are all inserted. Collapse them here so Silver keeps one
    # row per feedback_id. Which of several duplicate rows survives is arbitrary.
    .dropDuplicates(["feedback_id"])
)

In [0]:
# 3. Insert-only MERGE from the prepared Bronze rows into Silver
delta_tbl = DeltaTable.forName(spark, SILVER_FEEDBACK_TBL)

# A source row ("s") matches when its feedback_id already exists in the target ("t").
merge_condition = "t.feedback_id = s.feedback_id"

# Only a not-matched clause is registered: existing Silver rows are never
# updated or deleted, and unmatched source rows are inserted with all columns.
insert_only_merge = (
  delta_tbl.alias("t")
    .merge(silver_src.alias("s"), merge_condition)
    .whenNotMatchedInsertAll()
)

insert_only_merge.execute()

In [0]:
# 4. Validation: report how many rows the Silver table holds after the merge
silver_feedback_df = spark.table(SILVER_FEEDBACK_TBL)
silver_count = silver_feedback_df.count()
print(f"Silver feedback record count: {silver_count}")