In [0]:
# Bronze Ingest: Patients -> Bronze Delta Table  
# Reads `patients_10.csv` into a Delta Lake Bronze table with Change Data Feed enabled.

from pyspark.sql import SparkSession

In [0]:
# Storage locations used throughout this notebook.
# Target folder that holds the Bronze Delta table.
BRONZE_PATH = "/mnt/kardia/bronze/bronze_patients"
# Source CSV file located under the DBFS FileStore uploads folder.
RAW_PATH = "dbfs:/FileStore/shared_uploads/matthew.databrickslab2@outlook.com/patients_10.csv"

In [0]:
# Build (or reuse) the Spark session for this notebook, one setting at a time:
#   - application name "bronze_patients"
#   - a single shuffle partition
#   - the Delta Lake SQL extension and Delta catalog
session_builder = SparkSession.builder.appName("bronze_patients")
session_builder = session_builder.config("spark.sql.shuffle.partitions", "1")
session_builder = session_builder.config(
    "spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension"
)
session_builder = session_builder.config(
    "spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog"
)
spark = session_builder.getOrCreate()

# Restrict Spark logging to ERROR level.
spark.sparkContext.setLogLevel("ERROR")

In [0]:
# Load the raw CSV. The first line is treated as the header and schema
# inference is switched off, so every column comes back as STRING.
df = spark.read.options(header=True, inferSchema=False).csv(RAW_PATH)

# Count the rows that were read and report the total.
row_cnt = df.count()
print(f"Read {row_cnt} rows from {RAW_PATH}")

In [0]:
# Write to Bronze Delta and make sure Change Data Feed (CDF) is enabled.
#
# The `delta.enableChangeDataFeed` writer option is kept exactly as before.
# NOTE(review): `delta.*` writer options are applied as table properties when
# the write creates the table; when this overwrite hits a table that already
# exists without the property, the existing properties may be kept and CDF
# would silently stay off. Confirm against the Delta version in use. The
# check below makes the outcome explicit either way.
(
    df.write
      .format("delta")
      .mode("overwrite")
      .option("delta.enableChangeDataFeed", "true")
      .save(BRONZE_PATH)
)
print(f"▶ Wrote Bronze Delta table to {BRONZE_PATH}")

# Read the table properties back from the Delta log. `properties` is a map
# column; fall back to an empty dict if it comes back as NULL.
table_props = (
    spark.sql(f"DESCRIBE DETAIL delta.`{BRONZE_PATH}`")
         .select("properties")
         .first()[0]
) or {}

# Only alter the table when the flag is missing or not "true", so a table that
# already has CDF enabled gets no additional commit in its history.
if str(table_props.get("delta.enableChangeDataFeed", "false")).lower() != "true":
    spark.sql(
        f"ALTER TABLE delta.`{BRONZE_PATH}` "
        "SET TBLPROPERTIES (delta.enableChangeDataFeed = true)"
    )
    print(f"Enabled Change Data Feed on {BRONZE_PATH}")

In [0]:
# Load the Bronze Delta table back from storage, list its columns and
# show the first three rows.
df_bronze = spark.read.load(BRONZE_PATH, format="delta")
print("Columns:", df_bronze.columns)
display(df_bronze.limit(3))

In [0]:
# Sanity checks on the Bronze table: schema, stored files and Delta history.
print(f"Verifying Bronze Delta Table at: {BRONZE_PATH}\n")

# Step 1: print the schema of the `df_bronze` DataFrame.
print("Schema:")
df_bronze.printSchema()
print()

# Step 2: list the entries stored directly under the table path.
print("Underlying files in Delta folder:")
bronze_files = dbutils.fs.ls(BRONZE_PATH)
display(bronze_files)

# Step 3: show four columns of the first five rows returned by DESCRIBE HISTORY.
print("Recent Delta history:")
history_df = spark.sql(f"DESCRIBE HISTORY delta.`{BRONZE_PATH}`")
display(history_df.select("version", "timestamp", "operation", "userName").limit(5))

In [0]:
### Bronze ingest complete  
### Proceed to `make_cdf_view` to build the Change Data Feed temp view.