# Bronze Ingestion Notebook

This notebook performs deterministic ingestion of raw source data into the Bronze layer of the Lakehouse. It enforces schema typing, applies minimal transformations, and writes append‑only Delta outputs suitable for downstream validation and enrichment. The Bronze layer preserves raw fidelity while ensuring consistent structure for all subsequent processing.

In [None]:
# ============================================================
# Bronze Ingestion Notebook
# Deterministic raw → bronze ingestion with schema enforcement
# ============================================================

# ------------------------------------------------------------
# 1. Imports
# ------------------------------------------------------------
from pyspark.sql import SparkSession
from pyspark.sql.functions import current_timestamp, input_file_name
from pyspark.sql.types import (
    StructType, StructField, StringType, IntegerType, DoubleType, TimestampType
)

# getOrCreate() returns the already-active SparkSession when one exists and
# only builds a new one otherwise, so this line is safe to re-run.
spark = SparkSession.builder.getOrCreate()

# ------------------------------------------------------------
# 2. Define canonical Bronze schema
#    (Replace fields as needed for your actual source)
# ------------------------------------------------------------
# Every column is declared nullable (third argument True), so a missing source
# field does not by itself violate the schema.
bronze_schema = (
    StructType()
    .add("order_id", StringType(), True)
    .add("customer_id", StringType(), True)
    .add("product_id", StringType(), True)
    .add("quantity", IntegerType(), True)
    .add("unit_price", DoubleType(), True)
    .add("order_timestamp", TimestampType(), True)
)

# ------------------------------------------------------------
# 3. Define input and output paths
#    (ASCII-safe placeholders; replace with real paths later)
# ------------------------------------------------------------
# Location the raw source files are loaded from.
raw_input_path = "/mnt/raw/sales/"
# Location the Bronze output is written to.
bronze_output_path = "/mnt/bronze/sales/"

# ------------------------------------------------------------
# 4. Read raw data with enforced schema
# ------------------------------------------------------------
# Read the raw JSON with the explicit Bronze schema instead of letting Spark
# infer column types from the data.
df_raw = spark.read.load(
    raw_input_path,
    format="json",
    schema=bronze_schema,
)

# ------------------------------------------------------------
# 5. Add ingestion metadata
# ------------------------------------------------------------
# Lineage columns added on top of the source fields:
#   ingestion_timestamp - time at which this batch was ingested
#   source_file         - path of the input file each row was read from
# `source_file` has to come from input_file_name(); a stringified timestamp
# there would merely duplicate `ingestion_timestamp` and carry no file lineage.
# NOTE(review): if the target runtime does not support input_file_name(),
# the `_metadata.file_path` column is the usual substitute - confirm.
df_bronze = (
    df_raw.withColumn("ingestion_timestamp", current_timestamp())
          .withColumn("source_file", input_file_name())
)

# ------------------------------------------------------------
# 6. Write to Bronze as append‑only Delta
# ------------------------------------------------------------
# mode="append" adds this batch to whatever already exists at the target
# location rather than overwriting it.
df_bronze.write.save(bronze_output_path, format="delta", mode="append")

# ------------------------------------------------------------
# 7. Confirmation
# ------------------------------------------------------------
# Reached only when the write above did not raise.
print("Bronze ingestion completed successfully.")