In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, DoubleType
from pyspark.sql.functions import current_timestamp, col

def get_bronze_spark_session():
    """Return the SparkSession used by the bronze ingestion job.

    The builder names the application "UK_Property_Bronze_UC" and sets
    ``spark.sql.adaptive.enabled`` to "true"; ``getOrCreate`` then hands
    back an already-active session if one exists, or builds a new one.
    """
    session_builder = SparkSession.builder.appName("UK_Property_Bronze_UC")
    session_builder = session_builder.config("spark.sql.adaptive.enabled", "true")
    return session_builder.getOrCreate()

def get_land_registry_schema():
    """Build the explicit 16-column schema for the Land Registry CSV.

    Every column is a nullable string except ``Price``, which is a nullable
    double. The schema is passed to the CSV reader explicitly (the original
    author's note: this is to prevent OOM on the 6GB input file).
    """
    # Column order matters: the CSV is read without a header, so fields are
    # matched to columns purely by position.
    ordered_columns = (
        "TUID",
        "Price",
        "Transfer_Date",
        "Postcode",
        "Property_Type",
        "Old_New",
        "Duration",
        "PAON",
        "SAON",
        "Street",
        "Locality",
        "Town_City",
        "District",
        "County",
        "PPD_Category",
        "Record_Status",
    )
    double_columns = {"Price"}

    fields = [
        StructField(
            column_name,
            DoubleType() if column_name in double_columns else StringType(),
            True,
        )
        for column_name in ordered_columns
    ]
    return StructType(fields)

def run_bronze_ingestion():
    """Ingest the raw Land Registry CSV into the bronze Parquet location.

    Steps:
      1. Read the headerless CSV using the explicit schema from
         ``get_land_registry_schema``.
      2. Add two lineage columns: ``ingest_timestamp`` (current timestamp)
         and ``source_file`` (taken from ``_metadata.file_path``; per the
         original note this is used instead of ``input_file_name`` for
         Unity Catalog compliance).
      3. Overwrite the Parquet output directory with the result.

    Returns:
        A DataFrame read back from the Parquet output that was just written.
        Returning the persisted data (rather than the lazy CSV-backed plan)
        means downstream actions such as ``count()`` describe what was
        actually stored and do not re-scan the source CSV or re-evaluate
        ``current_timestamp()``.
    """
    spark = get_bronze_spark_session()

    input_path = "/Volumes/workspace/default/uk_land_registry/uk_property_full.csv"
    output_path = "/Volumes/workspace/default/uk_land_registry/bronze_parquet"

    # 1. Read with explicit schema (file has no header row)
    df_raw = spark.read.csv(
        input_path,
        header=False,
        schema=get_land_registry_schema()
    )

    # 2. Add lineage (using _metadata.file_path for Unity Catalog compliance)
    df_bronze = df_raw.withColumn("ingest_timestamp", current_timestamp()) \
                      .withColumn("source_file", col("_metadata.file_path"))

    # 3. Save as Parquet.
    # NOTE: no coalesce(1) is applied here, so the output may consist of
    # several part files; the writer keeps whatever partitioning the read
    # produced rather than funnelling the whole dataset through one task.
    df_bronze.write.mode("overwrite").parquet(output_path)

    # Hand back the persisted bronze data, not the lazy CSV plan.
    return spark.read.parquet(output_path)

# Run the bronze ingestion, then report how many records the result holds.
df_bronze_final = run_bronze_ingestion()
bronze_record_count = df_bronze_final.count()
print(f"Bronze layer deployed to UC. Record count: {bronze_record_count}")

Bronze layer deployed to UC. Record count: 30906560
