In [0]:
# 01_bronze_jdbc_providers.ipynb
# SOURCE:  Docker-local Postgres table `providers`
# OUTPUT:  `kardia_bronze.bronze_providers` with Change Data Feed enabled
# PATTERN: Incremental batch; each run appends a full snapshot of `providers` to the Delta table (fixed schema).

import os
from pyspark.sql import functions as F

# --- Bronze target -----------------------------------------------------------
# Database, fully-qualified table name, and storage location of the Bronze table.
BRONZE_DB = "kardia_bronze"
BRONZE_PROV_TABLE = BRONZE_DB + ".bronze_providers"
BRONZE_PATH = "dbfs:/kardia/bronze/bronze_providers"

# --- Postgres source ---------------------------------------------------------
# JDBC endpoint of the local container (started by the init script).
JDBC_URL = "jdbc:postgresql://localhost:5432/postgres"
PG_USER = "postgres"
# The password is taken from the environment, never hard-coded; a missing
# POSTGRES_PW raises KeyError right here instead of failing later at read time.
PG_PW = os.environ["POSTGRES_PW"]

In [0]:
# 1. Ensure database / table exist
# Both statements are guarded with IF NOT EXISTS, so objects that are already
# present are left untouched when this cell is re-run.
create_db_sql = f"CREATE DATABASE IF NOT EXISTS {BRONZE_DB}"

# External Delta table at BRONZE_PATH with Change Data Feed switched on.
# No column list is given in the DDL.
create_table_sql = f"""
    CREATE TABLE IF NOT EXISTS {BRONZE_PROV_TABLE}
    USING DELTA
    COMMENT 'Bronze JDBC ingest of provider reference data.'
    LOCATION '{BRONZE_PATH}'
    TBLPROPERTIES ('delta.enableChangeDataFeed' = 'true')
    """

spark.sql(create_db_sql)
spark.sql(create_table_sql)

In [0]:
# 2. Read Postgres snapshot
# Pull the `providers` table over JDBC (no predicate or partitioning options
# are set) and stamp every row with the time of this ingest run.
provider_df = (
    spark.read.format("jdbc")
    .options(
        url=JDBC_URL,
        dbtable="providers",
        user=PG_USER,
        password=PG_PW,
    )
    .load()
    .withColumn("_ingest_ts", F.current_timestamp())
)

In [0]:
# 3. Append into Bronze
# Append-only write: the rows of provider_df are added to the existing table.
# No schema option is passed on purpose: Delta honours `overwriteSchema` only
# together with mode("overwrite"), so on an append it had no effect and merely
# suggested that the table schema could be replaced here.
# NOTE(review): provider_df is appended as-is; if it holds a full snapshot of
# the source table, re-running this notebook appends the same rows again —
# confirm that downstream layers de-duplicate.
(provider_df.write
            .format("delta")
            .mode("append")
            .saveAsTable(BRONZE_PROV_TABLE))

print(f"Bronze ingest complete: Postgres → {BRONZE_PROV_TABLE}")
# Counts the whole Bronze table, not just the rows written by this run.
print(f"Row count: {spark.table(BRONZE_PROV_TABLE).count()}")

# Quick audit: show three rows of the table history (version, timestamp, operation).
history_df = (spark.sql(f"DESCRIBE HISTORY {BRONZE_PROV_TABLE}")
                   .select("version","timestamp","operation"))
display(history_df.limit(3))