In [0]:
# bronze_feedback_copy_into.ipynb
# SOURCE: Raw JSON-lines files in ADLS
# TARGET: `kardia_bronze.bronze_feedback` (CDF)
# TRIGGER: Incremental batch via COPY INTO; append to Bronze Feedback table
# Feedback arrives in small, asynchronous batches - COPY INTO is simple and needs no checkpoint location
# (it keeps track of already-loaded files itself, so re-runs skip them)
# Patients and Providers may arrive continuously or in date partitions, making Auto Loader’s checkpointing a better fit

from pyspark.sql.types import (StructType, StructField, StringType, IntegerType,
                               ArrayType, MapType)

from kflow.auth_adls import ensure_adls_oauth
from kflow.config import BRONZE_DB, bronze_paths, current_batch_id
from kflow.display_utils import show_history

# Configure Spark with ADLS OAuth credentials and return base ABFS path
# NOTE(review): abfss_base is not referenced again in the cells shown in this
# notebook — confirm whether the return value is still needed.
abfss_base = ensure_adls_oauth()

# Set catalog to Hive Metastore (required when not using Unity Catalog)
spark.sql("USE CATALOG hive_metastore")

# Load table paths and names for the Feedback dataset (paths, table, schema, etc.)
# The cells below read P.table (target table name), P.bronze (Delta table
# LOCATION, also passed to show_history) and P.raw (COPY INTO source path).
P            = bronze_paths("feedback")
BRONZE_TABLE = P.table
# BATCH_ID is written into the _batch_id column of every row loaded by this run.
BATCH_ID     = current_batch_id()

In [0]:
# Explicit schema for the Feedback JSONL records.
# JSONL files carry no schema metadata, and inferred types are unreliable.
# Every field is declared nullable.
# NOTE(review): the COPY INTO statement in this notebook does not reference
# feedback_schema — confirm whether it is still needed here.
feedback_schema = (
    StructType()
    .add("feedback_id", StringType(), True)
    .add("provider_id", StringType(), True)
    .add("timestamp", StringType(), True)
    .add("visit_id", StringType(), True)
    .add("satisfaction_score", IntegerType(), True)
    .add("comments", StringType(), True)
    .add("source", StringType(), True)
    .add("tags", ArrayType(StringType()), True)
    .add("metadata", MapType(StringType(), StringType()), True)
)

In [0]:
# 1. Make sure the Bronze database and the Feedback table are in place
# - COPY INTO can only load into a Delta table that already exists
create_db_sql = f"CREATE DATABASE IF NOT EXISTS {BRONZE_DB}"
spark.sql(create_db_sql)

# Delta table stored at P.bronze with Change Data Feed enabled.
# The table keeps metadata as a JSON string (metadata_json) and carries three
# audit columns (_ingest_ts, _source_file, _batch_id).
create_table_sql = f"""
    CREATE TABLE IF NOT EXISTS {BRONZE_TABLE} (
      feedback_id        STRING NOT NULL,
      provider_id        STRING,
      timestamp          STRING,
      visit_id           STRING,
      satisfaction_score INT,
      comments           STRING,
      source             STRING,
      tags               ARRAY<STRING>,
      metadata_json      STRING,
      _ingest_ts         TIMESTAMP,
      _source_file       STRING,
      _batch_id          STRING
    )
    USING DELTA
    COMMENT 'Bronze JSONL ingest of Feedback records.'
    LOCATION '{P.bronze}'
    TBLPROPERTIES (delta.enableChangeDataFeed = true)
    """
spark.sql(create_table_sql)

In [0]:
# 2. Run batch operation
#    COPY INTO scans the entire source path each run, but files it has already
#    loaded into the target table are skipped, so re-running this cell does not
#    duplicate rows.
#    The SELECT casts each source field to the Bronze column type, serializes
#    `metadata` to a JSON string, and stamps every row with the ingest time,
#    the source file name and this run's BATCH_ID.
copy_result = spark.sql(
    f"""
    COPY INTO {BRONZE_TABLE}
    FROM (
      SELECT
        CAST(feedback_id        AS STRING)            AS feedback_id,
        CAST(provider_id        AS STRING)            AS provider_id,
        CAST(timestamp          AS STRING)            AS timestamp,
        CAST(visit_id           AS STRING)            AS visit_id,
        CAST(satisfaction_score AS INT)               AS satisfaction_score,
        CAST(comments           AS STRING)            AS comments,
        CAST(source             AS STRING)            AS source,
        CAST(tags               AS ARRAY<STRING>)     AS tags,
        to_json(metadata)                             AS metadata_json,
        current_timestamp()                           AS _ingest_ts,
        input_file_name()                             AS _source_file,
        '{BATCH_ID}'                                  AS _batch_id
      FROM '{P.raw}'
    )
    FILEFORMAT = JSON
    FORMAT_OPTIONS ('multiLine' = 'false')
    COPY_OPTIONS ('mergeSchema' = 'false')
    """
)

# Show the load summary returned by COPY INTO instead of discarding it, so a
# run that picked up no new files is visible here rather than only inferable
# from the table's total row count.
display(copy_result)

In [0]:
# 3. Post-load check: total row count, a five-row sample, then show_history
#    on the Bronze table path
df = spark.table(BRONZE_TABLE)

row_count = df.count()
print(f"Bronze Feedback row count: {row_count:,}")

sample_rows = df.limit(5)
display(sample_rows)

show_history(P.bronze)