In [0]:
# bronze_claims_autoloader.ipynb
# SOURCE: Raw Parquet files in ADLS
# TARGET: `kardia_bronze.bronze_claims` (Delta table with Change Data Feed enabled)
# TRIGGER: Incremental batch via Auto Loader; append to Bronze Claims table
# NOTE: Parquet is a self-describing format, so Bronze relies on schema inference; types are enforced in Silver.

# On ephemeral Jobs clusters we install kflow per-notebook.
# Uncomment the next line if running the notebook as a Job.
# %pip install -q --no-deps --no-index --find-links=/dbfs/Shared/libs kflow

import pyspark.sql.functions as F

from kflow.auth_adls import ensure_adls_oauth
from kflow.config import BRONZE_DB, bronze_paths
from kflow.display_utils import show_history
from kflow.etl_utils import add_audit_cols

# Session setup for the Claims Bronze ingest.
# ensure_adls_oauth() is a kflow helper; per the original note it configures Spark
# with ADLS OAuth credentials and returns the base ABFS path (helper body is not
# shown here). `abfss_base` is not referenced again in the cells visible in this notebook.
abfss_base = ensure_adls_oauth()

# Make hive_metastore the current catalog so names without a catalog qualifier
# resolve there (original note: required when not using Unity Catalog).
spark.sql("USE CATALOG hive_metastore")

# Resolve the Claims dataset locations once. Later cells read P.raw, P.schema,
# P.bad, P.checkpoint, P.bronze and P.table from this object.
P = bronze_paths("claims")
# Target table name, used by the CREATE TABLE, writeStream.toTable and verification cells.
BRONZE_TABLE = P.table

In [0]:
# 2. Make sure the Bronze database and the Claims Delta table are present.
#    Both statements use IF NOT EXISTS, so re-running this cell is a no-op
#    once the objects have been created.
create_db_sql = f"CREATE DATABASE IF NOT EXISTS {BRONZE_DB}"
spark.sql(create_db_sql)

# The table is declared without columns at an explicit storage location and
# has Change Data Feed switched on through its table properties.
create_table_sql = f"""
    CREATE TABLE IF NOT EXISTS {BRONZE_TABLE}
    USING DELTA
    COMMENT 'Bronze Parquet ingest of claim records.'
    LOCATION '{P.bronze}'
    TBLPROPERTIES (delta.enableChangeDataFeed = true)
    """
spark.sql(create_table_sql)

In [0]:
# 3. Incremental batch ingest with Auto Loader.
#    Auto Loader picks up new files under the raw path and the result is
#    appended to the Bronze Delta table.

# Reader settings: Parquet input, existing files included, inferred schema
# tracked at P.schema, bad-records output directed to P.bad.
autoloader_options = {
    "cloudFiles.format": "parquet",
    "cloudFiles.includeExistingFiles": "true",
    "cloudFiles.schemaLocation": P.schema,
    "badRecordsPath": P.bad,
}

raw_claims = (
    spark.readStream
         .format("cloudFiles")
         .options(**autoloader_options)
         .load(P.raw)
)

# Rows with a null ClaimID have no usable primary key and are dropped here.
has_claim_id = F.col("ClaimID").isNotNull()

# add_audit_cols is a kflow helper; per the original note it adds the ingest
# timestamp, source file and batch ID columns (helper body is not shown here).
bronze_claims = raw_claims.filter(has_claim_id).transform(add_audit_cols)

# availableNow=True runs the stream as a bounded batch: it processes what is
# currently available and then stops. mergeSchema is enabled on the Delta sink.
stream = (
    bronze_claims.writeStream
                 .option("checkpointLocation", P.checkpoint)
                 .option("mergeSchema", "true")
                 .trigger(availableNow=True)
                 .toTable(BRONZE_TABLE)
)

# Block until the run has finished so the next cell sees the new rows.
stream.awaitTermination()

In [0]:
# 4. Ingest run is complete - inspect the Bronze Claims table and its history.
df = spark.table(BRONZE_TABLE)

# Total rows currently in the table, printed with thousands separators.
row_count = df.count()
print(f"Bronze Claims row count: {row_count:,}")

# Small sample only; avoid rendering the full table in the notebook.
display(df.limit(5))

# kflow helper, called with the table's storage path.
show_history(P.bronze)