In [0]:
import os
import time
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# --- 1. PERFORMANCE PROFILING SETUP ---
# Defining the profiler directly in the notebook for local access
class PipelineProfiler:
    """Minimal per-stage timer for notebook pipelines.

    Call ``start_timer(name)`` before a stage and ``end_timer(name)`` after
    it; ``end_timer`` prints and returns the elapsed seconds.

    Attributes:
        stats: Maps stage name -> wall-clock (``time.time()``) start time.
            Retained for backward compatibility; it is informational only.
    """

    def __init__(self):
        # Wall-clock start time per stage (epoch seconds).
        self.stats = {}
        # Monotonic start marks used for the duration arithmetic, so an
        # NTP/system clock adjustment mid-run cannot skew or negate a result.
        self._started = {}

    def start_timer(self, stage_name):
        """Record the start of ``stage_name`` (restarting it if already started)."""
        self.stats[stage_name] = time.time()
        self._started[stage_name] = time.perf_counter()
        print(f"Starting {stage_name}...")

    def end_timer(self, stage_name):
        """Print and return the seconds elapsed since ``start_timer(stage_name)``.

        The start mark is kept, so calling this again reports the time since
        the same start.

        Raises:
            KeyError: If ``start_timer`` was never called for ``stage_name``.
        """
        if stage_name not in self._started:
            raise KeyError(
                f"end_timer({stage_name!r}) called before start_timer({stage_name!r})"
            )
        duration = time.perf_counter() - self._started[stage_name]
        print(f"{stage_name} completed in {duration:.2f} seconds.")
        return duration

# Single notebook-level profiler instance. The "Data Ingestion" stage is timed
# from here until the matching profiler.end_timer("Data Ingestion") call later
# in this cell, so the measurement also covers everything executed in between.
profiler = PipelineProfiler()
profiler.start_timer("Data Ingestion")

# --- 2. SCHEMA DEFINITION (Requirement 1a: Data Validation at Ingestion) ---
# Defining strict schemas prevents "Schema Drift" and ensures 6GB file integrity.
# Explicit schema for the source CSV, which is read with header=false, so the
# field order below is what assigns names to columns. Every column is a
# nullable string except "Price", which is a nullable integer.
schema = StructType([
    StructField(
        column_name,
        IntegerType() if column_name == "Price" else StringType(),
        True,
    )
    for column_name in (
        "Transaction_ID",
        "Price",
        "Date",
        "Postcode",
        "Property_Type",
        "Old_New",
        "Duration",
        "PAON",
        "SAON",
        "Street",
        "Locality",
        "Town_City",
        "District",
        "County",
        "PPD_Category",
        "Record_Status",
    )
])

# --- 3. INGESTION WITH FAILFAST (Requirement 1a & 1c) ---
volume_path = "/Volumes/workspace/default/uk_land_registry/"
source_csv = f"{volume_path}uk_property_full.csv"
bronze_path = f"{volume_path}bronze_parquet"

# --- 4. STORAGE DESIGN: PARQUET + PARTITIONING (Requirement 1a & 1c) ---
# Justification for Report:
# 1. Parquet: Columnar storage reduces the 5GB CSV to a smaller footprint with Snappy compression.
# 2. Partitioning by County: Aligns with geographic query patterns for Dashboard 3.

# Spark reads are lazy: .load() only builds a plan and parses no rows, so
# 'failFast' cannot raise until an action runs. The Parquet write is that
# action, which is why it lives inside the same try block -- otherwise a
# malformed record would escape this handler entirely.
try:
    raw_df = (
        spark.read.format("csv")
        .option("header", "false")
        .option("mode", "failFast")
        .schema(schema)
        .load(source_csv)
    )

    print("Writing Bronze Layer to Parquet (Partitioned by County)...")

    raw_df.write.mode("overwrite").partitionBy("County").parquet(bronze_path)

except Exception as e:
    print(f"FATAL ERROR: Ingestion failed during CSV validation or Parquet write. {str(e)}")
    # Bare raise preserves the original traceback.
    raise

# Only claim success once every row has actually been parsed and written.
print("Schema Enforcement Applied: all rows parsed under failFast and written to Parquet.")

# --- 5. PERFORMANCE LOGGING ---
ingestion_duration = profiler.end_timer("Data Ingestion")

# Measure the row count from the written Bronze layer instead of hard-coding
# it, so the report stays correct when the source file changes. Done after the
# timer is stopped so it does not inflate the reported ingestion time.
total_rows = spark.read.parquet(bronze_path).count()

print("-" * 30)
print("FINAL STATS FOR REPORT:")
print(f"Ingestion Time: {ingestion_duration:.2f} seconds")
print(f"Total Rows Ingested: {total_rows:,}")
print("-" * 30)

Starting Data Ingestion...
Schema Enforcement Applied: 30.9M rows successfully loaded into Spark.
Writing Bronze Layer to Parquet (Partitioned by County)...
Data Ingestion completed in 210.64 seconds.
------------------------------
FINAL STATS FOR REPORT:
Ingestion Time: 210.64 seconds
Total Rows Ingested: 30,906,560
------------------------------
