In [0]:
# 01_bronze_feedback_copy_into.ipynb
# SOURCE:  JSON-lines feedback files in ADLS at abfss://raw@kardiaadlsdemo.dfs.core.windows.net/feedback/
# OUTPUT:  `kardia_bronze.bronze_feedback` with Change Data Feed enabled
# TRIGGER: Incremental batch (availableNow trigger); appends to the Delta table with mergeSchema enabled

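# NOTE: Uses Auto Loader (cloudFiles) to read the JSONL files with an explicit schema.
# NOTE(review): Auto Loader's documented default for cloudFiles.schemaEvolutionMode is "none"
#   when a schema is supplied, so new source columns may not evolve automatically even though
#   schemaLocation and mergeSchema are set — confirm whether evolution is actually intended.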

from kflow.adls import set_sas
from kflow.config import bronze_paths, adls_raw_path
from kflow.display_utils import banner, show_history, show_head
from kflow.etl_utils import add_audit_cols

import pyspark.sql.functions as F
from pyspark.sql.types import (StructType, StructField, StringType, IntegerType,
                               ArrayType, MapType)

# Authenticate against the raw ADLS account: fetch the SAS token from the
# "kardia" secret scope and hand it straight to kflow's set_sas, so the
# secret is never bound to a notebook-level variable.
ACCOUNT = "kardiaadlsdemo"
set_sas(ACCOUNT, dbutils.secrets.get("kardia", "adls_raw_sas"))

# Resolve the locations used by the rest of the notebook.
# P bundles the Bronze-side paths for the "feedback" dataset (the cells below
# read P.table, P.bronze, P.schema, P.bad and P.checkpoint).
P = bronze_paths("feedback")
BRONZE_TABLE = P.table
RAW_PATH = adls_raw_path("feedback")

In [0]:
# 2. Explicit schema for the feedback JSON records (performance & type safety).
# Each entry is (column name, Spark type); every column is declared nullable.
_FEEDBACK_COLUMNS = [
    ("feedback_id",        StringType()),
    ("provider_id",        StringType()),
    ("timestamp",          StringType()),
    ("visit_id",           StringType()),
    ("satisfaction_score", IntegerType()),
    ("comments",           StringType()),
    ("source",             StringType()),
    ("tags",               ArrayType(StringType())),
    ("metadata",           MapType(StringType(), StringType())),
]

feedback_schema = StructType(
    [StructField(col_name, col_type, True) for col_name, col_type in _FEEDBACK_COLUMNS]
)

In [0]:
# 3. Create the Bronze Feedback Delta table if it is not there yet.
# The column list is the nine feedback fields followed by three audit columns
# (_ingest_ts, _source_file, _batch_id); Change Data Feed is switched on via
# TBLPROPERTIES and the table data lives at P.bronze.
create_bronze_ddl = f"""
    CREATE TABLE IF NOT EXISTS {BRONZE_TABLE} (
      feedback_id        STRING,
      provider_id        STRING,
      timestamp          STRING,
      visit_id           STRING,
      satisfaction_score INT,
      comments           STRING,
      source             STRING,
      tags               ARRAY<STRING>,
      metadata           MAP<STRING,STRING>,
      _ingest_ts         TIMESTAMP,
      _source_file       STRING,
      _batch_id          STRING
    )
    USING DELTA
    LOCATION '{P.bronze}'
    TBLPROPERTIES (delta.enableChangeDataFeed = true)
    """
spark.sql(create_bronze_ddl)

In [0]:
# 4. Incremental Auto Loader ingest of the JSONL files into the Bronze table.

# Reader-side options for the cloudFiles source.
autoloader_options = {
    "cloudFiles.format": "json",
    "cloudFiles.schemaLocation": P.schema,
    "cloudFiles.includeExistingFiles": "true",
    "badRecordsPath": P.bad,
}

# Streaming read of RAW_PATH using the explicit feedback schema.
feedback_raw = (
    spark.readStream
         .format("cloudFiles")
         .options(**autoloader_options)
         .schema(feedback_schema)
         .load(RAW_PATH)
)

# Apply kflow's add_audit_cols transform
# (presumably adds _ingest_ts / _source_file / _batch_id — confirm in kflow.etl_utils).
feedback_audited = feedback_raw.transform(add_audit_cols)

# availableNow: process whatever is currently pending, then stop, so the
# notebook behaves like an incremental batch job.
stream = (
    feedback_audited.writeStream
                    .option("checkpointLocation", P.checkpoint)
                    .option("mergeSchema", "true")
                    .trigger(availableNow=True)
                    .toTable(BRONZE_TABLE)
)

# Block this cell until the streaming query has terminated.
stream.awaitTermination()

In [0]:
# 5. Ingest done – sanity-check the Bronze Feedback table and its history.
bronze_df = spark.table(BRONZE_TABLE)
row_count = bronze_df.count()

banner(f"Bronze Feedback row count: {row_count}", ok=True)
show_head(bronze_df, 5)       # preview of the ingested rows
show_history(P.bronze, 5)     # history for the table location