In [0]:
import time
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, year, month, to_date, current_timestamp, lit

# ======================================================================================
# 1. Spark Session Configuration
# ======================================================================================
def get_silver_spark_session():
    """
    Build (or reuse) the Spark session used by the Silver-layer pipeline.

    The session is named "UK_Property_Silver_Production" and two SQL options
    are applied before `getOrCreate()` is called:
      * spark.sql.adaptive.enabled   -> "true"  (Adaptive Query Execution)
      * spark.sql.shuffle.partitions -> "auto"

    No explicit persist/cache is configured here; the pipeline relies on AQE.
    NOTE(review): "auto" for shuffle partitions is presumably a
    Databricks-specific value -- confirm it is accepted by the target runtime.
    """
    session_options = {
        "spark.sql.adaptive.enabled": "true",
        "spark.sql.shuffle.partitions": "auto",
    }

    builder = SparkSession.builder.appName("UK_Property_Silver_Production")
    # dicts keep insertion order, so options are applied in the order listed.
    for option_key, option_value in session_options.items():
        builder = builder.config(option_key, option_value)

    return builder.getOrCreate()

# ======================================================================================
# 2. Performance Profiling & Data Lineage
# ======================================================================================
def profile_and_validate(step_name, df_operation, *args, **kwargs):
    """
    Run a DataFrame-producing operation, time it, and tag the result.

    The elapsed time covers both building the DataFrame and a `count()`
    action on it, so the figure reflects actual execution rather than just
    lazy plan construction. One log line is printed per call.

    Args:
        step_name: Label used in the printed log line and written to the
            ``pipeline_step`` column of the returned DataFrame.
        df_operation: Callable that returns a DataFrame (it must expose
            ``count()`` and ``withColumn()``).
        *args: Positional arguments forwarded to ``df_operation``.
        **kwargs: Keyword arguments forwarded to ``df_operation``.

    Returns:
        A ``(df_final, duration, row_count)`` tuple where ``df_final`` is the
        operation's result with ``pipeline_step`` and ``processed_timestamp``
        columns added, ``duration`` is the elapsed time in seconds (float),
        and ``row_count`` is the value returned by ``count()``.
    """
    # perf_counter() is a monotonic, high-resolution clock: unlike time.time()
    # the measured interval cannot be skewed by wall-clock adjustments.
    started_at = time.perf_counter()

    # Execute transformation
    df = df_operation(*args, **kwargs)

    # count() is an action, so it forces the plan to run and makes the timing
    # meaningful. Nothing is cached here, so later actions on the returned
    # DataFrame will evaluate the plan again.
    row_count = df.count()

    duration = time.perf_counter() - started_at
    print(f"STEP: {step_name} | Duration: {duration:.2f}s | Records: {row_count}")

    # Metadata for ingestion audit and lineage. withColumn() replaces a column
    # of the same name, so repeated profiling keeps only the latest step label.
    df_final = df.withColumn("pipeline_step", lit(step_name)) \
                 .withColumn("processed_timestamp", current_timestamp())

    return df_final, duration, row_count

# ======================================================================================
# 3. Distributed Processing (mapInPandas)
# ======================================================================================
def apply_serverless_parallel_cleaning(iterator):
    """
    Filter each incoming pandas batch down to rows that pass quality checks.

    This is a generator over pandas DataFrames (the shape `mapInPandas`
    expects): exactly one filtered frame is yielded per input frame, with the
    original columns and index labels preserved.

    A row is kept only when its 'Price' is greater than zero AND its 'County'
    is not null.
    """
    for batch in iterator:
        # Partition-level validation rules, expressed as boolean masks.
        price_is_positive = batch['Price'].gt(0)
        county_is_present = batch['County'].notna()
        yield batch.loc[price_is_positive & county_is_present]

# ======================================================================================
# 4. Main Integrated Execution
# ======================================================================================
def run_full_silver_pipeline(
    *,
    input_path="/Volumes/workspace/default/uk_land_registry/bronze_parquet",
    output_path="/Volumes/workspace/default/uk_land_registry/silver_engineered_parquet",
    tableau_output_dir="/Volumes/workspace/default/uk_land_registry/gold_tableau_data",
    sample_fraction=0.0033,
    sample_seed=42,
):
    """
    Run the Bronze -> Silver pipeline end to end and export Tableau inputs.

    Steps, in order:
      A. Read the Bronze Parquet dataset from ``input_path``.
      B. Parse ``Transfer_Date`` into a date and derive ``Year`` / ``Month``.
      C. Filter rows per batch via ``mapInPandas`` using
         ``apply_serverless_parallel_cleaning``.
      D. Draw a sample (without replacement) of the cleaned data for Tableau.
      E. Write the cleaned data as Parquet partitioned by ``County``, then
         export the per-step timing metrics and the sample as CSV.

    Steps A, C and D go through ``profile_and_validate``, which prints one
    timing line per step; a completion banner is printed at the end.

    Args:
        input_path: Location of the Bronze Parquet data.
        output_path: Destination of the partitioned Silver Parquet data
            (written with mode "overwrite").
        tableau_output_dir: Directory under which ``performance_log.csv`` and
            ``silver_sample_tableau.csv`` are written (mode "overwrite").
        sample_fraction: Fraction of rows to sample for Tableau. The default
            0.0033 reduces ~30.9M rows to roughly 100k.
        sample_seed: Seed passed to ``DataFrame.sample``.

    Raises:
        ValueError: If ``sample_fraction`` is outside ``[0.0, 1.0]``.

    All parameters are keyword-only and default to the values the pipeline
    was originally hard-coded with, so a bare call behaves as before.
    """
    # Fail fast on an invalid fraction instead of after the expensive
    # ingest/cleaning stages have already run.
    if not 0.0 <= sample_fraction <= 1.0:
        raise ValueError(
            f"sample_fraction must be within [0.0, 1.0], got {sample_fraction!r}"
        )

    spark = get_silver_spark_session()
    perf_metrics = []

    # --- A. Data Ingestion (Storage Design) ---
    df_bronze, t_load, c_load = profile_and_validate(
        "Ingest_Bronze_Parquet", spark.read.parquet, input_path
    )
    perf_metrics.append(("Ingestion", t_load, c_load))

    # --- B. Feature Engineering (Temporal Extraction) ---
    # Essential for 'Dashboard 3: Business insights' 30-year trend line.
    # Year/Month are derived from the already-parsed Transfer_Date column.
    df_temp = (
        df_bronze
        .withColumn("Transfer_Date", to_date(col("Transfer_Date"), "yyyy-MM-dd HH:mm"))
        .withColumn("Year", year(col("Transfer_Date")))
        .withColumn("Month", month(col("Transfer_Date")))
    )

    # --- C. Parallel Processing (Computational Complexity) ---
    # The cleaning function only filters rows, so the output schema is the
    # same as the input schema and df_temp.schema can be reused.
    df_silver, t_clean, c_clean = profile_and_validate(
        "Parallel_Cleaning_mapInPandas",
        df_temp.mapInPandas,
        apply_serverless_parallel_cleaning,
        df_temp.schema,
    )
    perf_metrics.append(("Cleaning_O_n_p", t_clean, c_clean))

    # --- D. Tableau Strategy (Sampling) ---
    # Positional arguments map to sample(withReplacement, fraction, seed).
    df_silver_sample, t_samp, c_samp = profile_and_validate(
        "Tableau_Sampling",
        df_silver.sample,
        False, sample_fraction, sample_seed,
    )
    perf_metrics.append(("Sampling_for_Tableau", t_samp, c_samp))

    # --- E. Storage & Partitioning Strategy ---
    # Partitioning by County aligns with query patterns in Gold layer modeling
    df_silver.write.mode("overwrite").partitionBy("County").parquet(output_path)

    # Tolerate a trailing slash on the export directory.
    tableau_dir = tableau_output_dir.rstrip("/")

    # Export Scalability Metrics for Dashboard 4.
    # NOTE: Spark's CSV writer creates a directory at the given path holding
    # part files; coalesce(1) keeps that to a single part file.
    metrics_schema = ["step_name", "execution_time_sec", "record_count"]
    spark.createDataFrame(perf_metrics, metrics_schema).coalesce(1).write.mode("overwrite") \
         .option("header", "true").csv(f"{tableau_dir}/performance_log.csv")

    # Export Sample for Dashboard 3
    df_silver_sample.coalesce(1).write.mode("overwrite").option("header", "true") \
                    .csv(f"{tableau_dir}/silver_sample_tableau.csv")

    print("--- SERVERLESS SILVER PIPELINE COMPLETE (METRICS LOGGED) ---")

# Trigger: execute the full Silver pipeline when this cell runs. The call
# prints one "STEP: ..." timing line per profiled stage and a final banner.
run_full_silver_pipeline()

STEP: Ingest_Bronze_Parquet | Duration: 2.50s | Records: 30906560
STEP: Parallel_Cleaning_mapInPandas | Duration: 80.04s | Records: 30906560
STEP: Tableau_Sampling | Duration: 62.47s | Records: 101976
--- SERVERLESS SILVER PIPELINE COMPLETE (METRICS LOGGED) ---
