In [0]:
import time
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StandardScaler, StringIndexer
from pyspark.ml.regression import LinearRegression, DecisionTreeRegressor, RandomForestRegressor, GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import Pipeline

# ======================================================================================
# 1. Spark Session for Model Training
# ======================================================================================
def get_model_spark_session():
    """Optimized for MLlib distributed training on Serverless compute."""
    return SparkSession.builder \
        .appName("UK_Property_Model_Training") \
        .config("spark.sql.adaptive.enabled", "true") \
        .getOrCreate()

# ======================================================================================
# 2. Procedural Feature Pipeline (Vectorization & Scaling)
# ======================================================================================
def prepare_ml_features(df):
    """
    Standardizes features using VectorAssembler and StandardScaler.
    Requirement: Section 2(a) - Custom processing for MLlib.
    """
    # Indexing categorical 'Property_Type' for the model
    indexer = StringIndexer(inputCol="Property_Type", outputCol="Property_Type_Index")
    
    # Assembling numeric features (Year, Month) and the indexed category
    assembler = VectorAssembler(
        inputCols=["Year", "Month", "Property_Type_Index"], 
        outputCol="unscaled_features"
    )
    
    # Mandatory Scaling for distance-based stability
    scaler = StandardScaler(inputCol="unscaled_features", outputCol="features")
    
    return indexer, assembler, scaler

# ======================================================================================
# 3. Model Training Factory (5 Algorithms)
# ======================================================================================
def train_and_evaluate_models(train_df, test_df):
    """
    Trains 5 models under strict Serverless memory constraints.
    Constraints: maxBins=100, maxDepth=5 to prevent Model Size overflow.
    """
    evaluator = RegressionEvaluator(labelCol="Price", predictionCol="prediction", metricName="rmse")
    model_results = []
    
    # Defining the 5 required algorithms
    models = {
        "LinearRegression": LinearRegression(featuresCol="features", labelCol="Price"),
        "DecisionTree": DecisionTreeRegressor(featuresCol="features", labelCol="Price", maxDepth=5, maxBins=100),
        "RandomForest": RandomForestRegressor(featuresCol="features", labelCol="Price", maxDepth=5, maxBins=100),
        "GBT": GBTRegressor(featuresCol="features", labelCol="Price", maxDepth=5, maxBins=100)
    }
    
    for name, algo in models.items():
        start_time = time.time()
        
        # Training
        model = algo.fit(train_df)
        predictions = model.transform(test_df)
        
        # Evaluation
        rmse = evaluator.evaluate(predictions)
        duration = time.time() - start_time
        
        print(f"MODEL COMPLETE: {name} | RMSE: {rmse:.2f} | Time: {duration:.2f}s")
        model_results.append((name, rmse, duration))
        
    return model_results

# ======================================================================================
# 4. Main Execution
# ======================================================================================
def run_gold_training_pipeline():
    spark = get_model_spark_session()
    
    # Load Silver Parquet (The "Once and for All" source)
    silver_path = "/Volumes/workspace/default/uk_land_registry/silver_engineered_parquet"
    df_silver = spark.read.parquet(silver_path)
    
    # Split Data (Temporal Split Consideration: Section 4a)
    train_df, test_df = df_silver.randomSplit([0.8, 0.2], seed=42)
    
    # Build Feature Pipeline
    idx, asmb, scl = prepare_ml_features(df_silver)
    
    # Apply transformations manually (Strictly Procedural/No Classes)
    df_indexed = idx.fit(train_df).transform(train_df)
    df_assembled = asmb.transform(df_indexed)
    df_scaled = scl.fit(df_assembled).transform(df_assembled)
    
    # Prepare test data with same pipeline
    test_indexed = idx.fit(train_df).transform(test_df)
    test_assembled = asmb.transform(test_indexed)
    test_scaled = scl.fit(df_assembled).transform(test_assembled)
    
    # Execute Training
    results = train_and_evaluate_models(df_scaled, test_scaled)
    
    # Save Model Performance for Tableau Dashboard 2
    results_schema = ["Algorithm", "RMSE", "Training_Time_Sec"]
    spark.createDataFrame(results, results_schema).coalesce(1).write.mode("overwrite") \
         .option("header", "true").csv("/Volumes/workspace/default/uk_land_registry/gold_tableau_data/model_comparison.csv")
    
    print("--- GOLD TRAINING COMPLETE: PERFORMANCE DATA EXPORTED ---")

# Execute
run_gold_training_pipeline()

MODEL COMPLETE: LinearRegression | RMSE: 1010354.90 | Time: 30.28s
MODEL COMPLETE: DecisionTree | RMSE: 1003269.11 | Time: 45.17s
MODEL COMPLETE: RandomForest | RMSE: 1004094.96 | Time: 99.59s
MODEL COMPLETE: GBT | RMSE: 1003021.45 | Time: 782.63s
--- GOLD TRAINING COMPLETE: PERFORMANCE DATA EXPORTED ---
