In [0]:
# 01_bronze_jdbc_providers.ipynb
# SOURCE:  Docker‑local Postgres table `providers`
# OUTPUT:  `kardia_bronze.bronze_providers` with Change Data Feed enabled
# PATTERN: Snapshot batch read of the full source table; append to the Delta table with `mergeSchema` enabled (schema is not fixed).

import os
from pyspark.sql import functions as F

# Bronze target: database, fully qualified table name, and Delta storage path.
BRONZE_DB = "kardia_bronze"
BRONZE_PROV_TABLE = f"{BRONZE_DB}.bronze_providers"
BRONZE_PATH = "dbfs:/kardia/bronze/bronze_providers"

# Postgres source reached over JDBC (local container started by init script).
JDBC_URL = "jdbc:postgresql://localhost:5432/postgres"
PG_USER = "postgres"
# The password is fetched from the "kardia" secret scope instead of being
# written into the notebook.
PG_PW = dbutils.secrets.get(scope="kardia", key="pg_pw")

In [0]:
# 1. Ensure Bronze DB and Providers table exist.
#    Both statements use IF NOT EXISTS, so they only create objects that are missing.
create_db_sql = f"CREATE DATABASE IF NOT EXISTS {BRONZE_DB}"

#    The DDL declares no columns; it registers a Delta table at BRONZE_PATH
#    and turns on Change Data Feed through a table property.
create_table_sql = f"""
    CREATE TABLE IF NOT EXISTS {BRONZE_PROV_TABLE}
    USING DELTA
    COMMENT 'Bronze JDBC ingest of provider reference data.'
    LOCATION '{BRONZE_PATH}'
    TBLPROPERTIES ('delta.enableChangeDataFeed' = 'true')
    """

# The database must be created before the table that lives in it.
for ddl_statement in (create_db_sql, create_table_sql):
    spark.sql(ddl_statement)

In [0]:
# 2. Read Postgres snapshot and append into Bronze Providers table.

#    The whole `providers` table is read on every run (snapshot read).
#    In production, use incremental logic (WHERE updated_at > last_ingested)
jdbc_reader = spark.read.format("jdbc").options(
    url=JDBC_URL,
    dbtable="providers",
    user=PG_USER,
    password=PG_PW,
)

#    Stamp every row with the ingest time before it is written.
provider_df = jdbc_reader.load().withColumn("_ingest_ts", F.current_timestamp())

#    Append to the Bronze table; mergeSchema is enabled on the write.
delta_writer = (
    provider_df.write
    .format("delta")
    .option("mergeSchema", "true")
    .mode("append")
)
delta_writer.saveAsTable(BRONZE_PROV_TABLE)

print(f"Bronze ingest complete from Postgres to {BRONZE_PROV_TABLE}")

# NOTE: `.write` could be chained directly after `.withColumn(...)`; the DataFrame is held in `provider_df` so the read/transform and the write stay separate statements.

In [0]:
# 3. Quick sanity check.
#    Total row count of the Bronze table, followed by a 10-row preview.
row_count = spark.table(BRONZE_PROV_TABLE).count()
print(f"Row count: {row_count}")

preview_df = spark.table(BRONZE_PROV_TABLE).limit(10)
display(preview_df)

#    Show version, timestamp and operation for up to three history entries.
history_columns = ["version", "timestamp", "operation"]
history_df = spark.sql(f"DESCRIBE HISTORY {BRONZE_PROV_TABLE}").select(*history_columns)
display(history_df.limit(3))