In [None]:
# bronze_feedback_copy_into.ipynb
# SOURCE:  JSON‑lines feedback files in ADLS at abfss://raw@kardiaadlsdemo.dfs.core.windows.net/feedback/  
# OUTPUT: `kardia_bronze.bronze_feedback` with Change Data Feed enabled  
# TRIGGER: Incremental batch; append to Delta table via COPY INTO (mergeSchema is disabled, so the table schema does not evolve)

# NOTE: Uses COPY INTO (not Auto Loader) for JSONL; columns are cast explicitly in the SELECT and the target schema is fixed by the CREATE TABLE statement.

from kflow.config import bronze_paths, raw_path, current_batch_id, ensure_adls_auth
from kflow.display_utils import show_history

from pyspark.sql.types import (StructType, StructField, StringType, IntegerType,
                               ArrayType, MapType)

# Authenticate before any storage path is touched
# (presumably sets ADLS credentials on the Spark session — confirm in kflow.config).
ensure_adls_auth()

# Resolve everything the later cells need for the "feedback" dataset:
# the Bronze path bundle, the target table name, the raw landing path,
# and the identifier stamped on every row ingested by this run.
P = bronze_paths("feedback")
BRONZE_TABLE = P.table
RAW_PATH = raw_path("feedback")
BATCH_ID = current_batch_id()

In [None]:
# 1. Define explicit JSON schema for performance & type safety.
#    Every field is nullable; `tags` is an array of strings and `metadata`
#    is a string-to-string map.
feedback_schema = StructType([
    StructField("feedback_id",        StringType(), True),
    StructField("provider_id",        StringType(), True),
    StructField("timestamp",          StringType(), True),
    StructField("visit_id",           StringType(), True),
    StructField("satisfaction_score", IntegerType(), True),
    StructField("comments",           StringType(), True),
    StructField("source",             StringType(), True),
    StructField("tags",               ArrayType(StringType()), True),
    StructField("metadata",           MapType(StringType(), StringType()), True),
])

# Schema as a DDL column list ("name type, name type, ...") for use in SQL.
# Built field by field instead of trimming feedback_schema.simpleString():
# that string has the form "struct<name:type,...>", so stripping every
# trailing ">" also removed the closing ">" of the final map<string,string>
# type, and the remaining "name:type" pairs are not DDL column syntax.
schema_ddl = ", ".join(
    f"{field.name} {field.dataType.simpleString()}"
    for field in feedback_schema.fields
)

In [None]:
# 2. Ensure the Bronze Feedback table exists.
#    IF NOT EXISTS leaves an existing table untouched. The table is an
#    external Delta table at P.bronze with Change Data Feed turned on.
#    `metadata` is stored as a JSON string (metadata_json); the underscore-
#    prefixed columns hold per-row ingestion lineage.
create_table_sql = f"""
CREATE TABLE IF NOT EXISTS {BRONZE_TABLE} (
  feedback_id        STRING,
  provider_id        STRING,
  timestamp          STRING,
  visit_id           STRING,
  satisfaction_score INT,
  comments           STRING,
  source             STRING,
  tags               ARRAY<STRING>,
  metadata_json      STRING,
  _ingest_ts         TIMESTAMP,
  _source_file       STRING,
  _batch_id          STRING
)
USING DELTA
LOCATION '{P.bronze}'
TBLPROPERTIES (delta.enableChangeDataFeed = true)
"""
spark.sql(create_table_sql)

In [None]:
# 3. Run the batch load.
#    COPY INTO lists the source path on every run but is idempotent: files
#    it has already loaded are skipped, so re-running only appends new files.
#    The SELECT casts each column to the Bronze type, serialises `metadata`
#    to a JSON string, and stamps ingest time, source file and batch id.
#    Input is single-line JSON (multiLine=false) and the target schema is
#    not merged/evolved (mergeSchema=false).
#    NOTE(review): input_file_name() is reported as deprecated on newer
#    Databricks runtimes in favour of _metadata.file_path — confirm the
#    cluster runtime before switching.
copy_into_sql = f"""
COPY INTO {BRONZE_TABLE}
FROM (
  SELECT
    CAST(feedback_id        AS STRING)            AS feedback_id,
    CAST(provider_id        AS STRING)            AS provider_id,
    CAST(timestamp          AS STRING)            AS timestamp,
    CAST(visit_id           AS STRING)            AS visit_id,
    CAST(satisfaction_score AS INT)               AS satisfaction_score,
    CAST(comments           AS STRING)            AS comments,
    CAST(source             AS STRING)            AS source,
    CAST(tags               AS ARRAY<STRING>)     AS tags,
    to_json(metadata)                              AS metadata_json,
    current_timestamp()                            AS _ingest_ts,
    input_file_name()                              AS _source_file,
    '{BATCH_ID}'                                    AS _batch_id
  FROM '{RAW_PATH}'
)
FILEFORMAT = JSON
FORMAT_OPTIONS ('multiLine' = 'false')
COPY_OPTIONS   ('mergeSchema' = 'false')
"""
spark.sql(copy_into_sql)

In [None]:
# 4. Batch finished. Sanity-check the Bronze Feedback table:
#    print the total row count, preview five rows, then call show_history
#    on the Bronze path (presumably renders the Delta table history —
#    confirm in kflow.display_utils).
df = spark.table(BRONZE_TABLE)
row_count = df.count()
print(f"Bronze Feedback row count: {row_count:,}")
display(df.limit(5))
show_history(P.bronze)