In [0]:
# ==========================================
# 1. IMPORTS & PROFILER SETUP
# ==========================================
import sys
import os
import time
import pandas as pd
from pyspark.ml import Transformer
from pyspark.ml.util import DefaultParamsWritable, DefaultParamsReadable
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler
from pyspark.sql import functions as F
from pyspark.sql.functions import col, to_timestamp, year, lit, broadcast

class PipelineProfiler:
    """Elapsed-time stopwatch for named pipeline stages.

    A stage is opened with :meth:`start_timer` and closed with
    :meth:`end_timer`; both report progress on stdout.
    """

    def __init__(self):
        # Maps stage name -> clock reading taken when the stage was started.
        self.stats = {}

    def start_timer(self, stage_name):
        """Record the start of *stage_name* and announce it.

        Calling this again for the same stage restarts its timer.
        """
        # perf_counter() is monotonic, so the measured duration cannot be
        # skewed (or turn negative) when the system clock is adjusted.
        self.stats[stage_name] = time.perf_counter()
        print(f"Starting {stage_name}...")

    def end_timer(self, stage_name):
        """Print and return the seconds elapsed since *stage_name* started.

        Raises:
            KeyError: if ``start_timer`` was never called for *stage_name*.
        """
        try:
            started_at = self.stats[stage_name]
        except KeyError:
            # Same exception type as a plain dict lookup, but with a message
            # that tells the caller what went wrong.
            raise KeyError(
                f"Stage {stage_name!r} was never started; call start_timer() first."
            ) from None
        duration = time.perf_counter() - started_at
        print(f"{stage_name} completed in {duration:.2f} seconds.")
        return duration

# Module-level profiler instance; it is used again at the end of this cell.
profiler = PipelineProfiler()
# Start the "Feature Engineering" stage; the matching end_timer() call uses
# the same stage name once the exports have been written.
profiler.start_timer("Feature Engineering")

# ==========================================
# 2. CUSTOM TRANSFORMER (Requirement 2a)
# ==========================================
class PriceSegmenter(Transformer, DefaultParamsWritable, DefaultParamsReadable):
    """Requirement 2a: Adds domain-specific Market Segmentation.

    Appends a string column ``Market_Segment`` derived from ``Price``:

    * ``Price`` < 150000            -> "Budget"
    * ``Price`` < 450000            -> "Standard"
    * any other non-null ``Price``  -> "Premium"
    * null ``Price``                -> null
    """

    def _transform(self, dataset):
        """Return *dataset* with the ``Market_Segment`` column added."""
        price = F.col("Price")
        # A comparison against null evaluates to null, so a catch-all
        # otherwise("Premium") would label rows with a missing price as
        # "Premium". Guarding the last branch with isNotNull() and leaving
        # out otherwise() makes unmatched (null-price) rows null instead.
        segment = (
            F.when(price < 150000, "Budget")
            .when(price < 450000, "Standard")
            .when(price.isNotNull(), "Premium")
        )
        return dataset.withColumn("Market_Segment", segment)

# ==========================================
# 3. DATA INGESTION & LINEAGE (Requirement 1b)
# ==========================================
try:
    # Read the bronze-layer Parquet dataset.
    df = spark.read.parquet("/Volumes/workspace/default/uk_land_registry/bronze_parquet")

    # Tag every row with lineage metadata, then project and cast the columns
    # used downstream. Both lineage columns have to be listed in select():
    # a column added with withColumn() but omitted from the projection is
    # dropped, which is what previously happened to "ingestion_layer".
    silver_df = (
        df.withColumn("source_file", lit("uk_property_full.csv"))
        .withColumn("ingestion_layer", lit("Bronze"))
        .select(
            col("Price").cast("double"),
            to_timestamp(col("Date"), "yyyy-MM-dd HH:mm").alias("Sale_Date"),
            "Property_Type", "Old_New", "Town_City",
            "source_file", "ingestion_layer",
        )
        # Drops every row in which any selected column is null (including
        # values that became null in the cast/parse above).
        .dropna()
    )

    silver_df = silver_df.withColumn("Sale_Year", year(col("Sale_Date")))
    print("Bronze data loaded with lineage.")

except Exception as e:
    # Top-level boundary: report the failure, then re-raise the original
    # exception so the notebook run still fails.
    print(f"PIPELINE ERROR: {e}")
    raise

# ==========================================
# 4. DISTRIBUTED PROCESSING (Requirement 1b)
# ==========================================
# Lookup table from the Property_Type code to a human-readable label.
# HM Land Registry's Price Paid Data uses "F" for Flats/Maisonettes; without
# that entry the left join below leaves Type_Description null for every flat.
# NOTE(review): "P" is not a documented Price Paid code. It is kept so that
# any data relying on it keeps its label - confirm whether it can be removed.
mapping_data = [
    ("D", "Detached"),
    ("S", "Semi-Detached"),
    ("T", "Terraced"),
    ("F", "Flats/Maisonettes"),
    ("P", "Flats/Maisonettes"),
    ("O", "Other"),
]
type_mapping_df = spark.createDataFrame(mapping_data, ["Property_Type", "Type_Description"])

# The lookup table is tiny, so it is marked for broadcast. The left join keeps
# rows whose code has no mapping (their Type_Description is null).
silver_df_with_labels = silver_df.join(broadcast(type_mapping_df), on="Property_Type", how="left")

# ==========================================
# 5. ML PREPARATION & GEOGRAPHIC ENCODING
# ==========================================
# Target indexing: encode Property_Type as the numeric column "type_label".
type_indexer = (
    StringIndexer()
    .setInputCol("Property_Type")
    .setOutputCol("type_label")
)
indexed_df = type_indexer.fit(silver_df_with_labels).transform(silver_df_with_labels)

# Geographic indexing (Requirement 2a): encode Town_City as "city_label".
# handleInvalid="skip" filters out rows the indexer treats as invalid.
city_indexer = (
    StringIndexer()
    .setInputCol("Town_City")
    .setOutputCol("city_label")
    .setHandleInvalid("skip")
)
indexed_df = city_indexer.fit(indexed_df).transform(indexed_df)

# Custom segmentation: add the Market_Segment column.
segmented_df = PriceSegmenter().transform(indexed_df)

# Scaling: StandardScaler works on vector columns, so Price is first wrapped
# in a one-element vector and then centred and scaled to unit std.
price_assembler = (
    VectorAssembler()
    .setInputCols(["Price"])
    .setOutputCol("unscaled_price")
)
price_assembled_df = price_assembler.transform(segmented_df)

scaler = (
    StandardScaler()
    .setInputCol("unscaled_price")
    .setOutputCol("scaled_features")
    .setWithStd(True)
    .setWithMean(True)
)
scaled_df = scaler.fit(price_assembled_df).transform(price_assembled_df)

# Final feature vector: scaled price followed by the city index.
final_assembler = (
    VectorAssembler()
    .setInputCols(["scaled_features", "city_label"])
    .setOutputCol("final_features")
)
final_engineered_df = final_assembler.transform(scaled_df)

# ==========================================
# 6. STORAGE (Requirement 1a & 1c)
# ==========================================
# Persist the engineered dataset, vector columns included, as Parquet for
# Notebooks 3 & 4; any previous output at this path is replaced.
output_path = "/Volumes/workspace/default/uk_land_registry/silver_engineered_parquet"
final_engineered_df.write.parquet(output_path, mode="overwrite")

# ==========================================
# 7. EXPORT FIXED 100K SAMPLE (For Tableau)
# ==========================================
# The CSV data source rejects vector columns
# ([UNSUPPORTED_DATA_TYPE_FOR_DATASOURCE]), so each vector column is replaced
# by a string rendering carrying a "_str" suffix.
print("Exporting clean 100k sample for Tableau...")
tableau_export_df = final_engineered_df.select(
    "*",
    col("unscaled_price").cast("string").alias("unscaled_price_str"),
    col("scaled_features").cast("string").alias("scaled_features_str"),
    col("final_features").cast("string").alias("final_features_str")
).drop("unscaled_price", "scaled_features", "final_features")

# coalesce(1) puts the (at most) 100,000 rows into a single partition so that
# Spark writes one CSV part file.
# NOTE(review): limit() without an orderBy() does not define which rows are
# taken, so the sample may differ between runs - confirm whether a
# reproducible sample is required.
tableau_export_df.limit(100000).coalesce(1).write.mode("overwrite").option("header", "true") \
    .csv("/Volumes/workspace/default/uk_land_registry/gold_tableau_data")

# GITHUB SAMPLE (1,000 rows)
sample_path = "/Volumes/workspace/default/uk_land_registry/github_samples"
# pandas' to_csv() does not create missing parent directories, so make sure
# the target directory exists before writing (no-op when it already does).
os.makedirs(sample_path, exist_ok=True)
tableau_export_df.limit(1000).toPandas().to_csv(f"{sample_path}/silver_sample.csv", index=False)

# Close the stage opened at the top of the cell and keep its duration.
eng_duration = profiler.end_timer("Feature Engineering")
print("--- Notebook 2 Complete: 100k sample generated ---")

Starting Feature Engineering...
Bronze data loaded with lineage.
Exporting clean 100k sample for Tableau...
Feature Engineering completed in 88.02 seconds.
--- Notebook 2 Complete: 100k sample generated ---
