In [0]:
from pyspark.ml.classification import RandomForestClassificationModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import col
import time

# --- 1. PERFORMANCE PROFILER SETUP ---
class PipelineProfiler:
    """Tiny elapsed-time profiler for named pipeline stages.

    ``stats`` maps each stage name to the clock reading captured when the
    stage was started. The readings come from ``time.perf_counter()``, so
    they are only meaningful as differences, not as calendar timestamps.
    """

    def __init__(self):
        # stage name -> perf_counter reading taken by start_timer()
        self.stats = {}

    def start_timer(self, stage_name):
        """Record the start instant of *stage_name* and announce it.

        Starting the same stage again overwrites the earlier start instant.
        """
        # perf_counter is monotonic: unlike time.time() it cannot jump
        # backwards when the system clock is adjusted, so a measured
        # duration can never come out negative or inflated.
        self.stats[stage_name] = time.perf_counter()
        print(f"Starting {stage_name}...")

    def end_timer(self, stage_name):
        """Print and return the seconds elapsed since the stage was started.

        The start instant is left in ``stats``, so calling this repeatedly
        reports the running total since the one ``start_timer`` call.

        Raises:
            KeyError: if ``start_timer`` was never called for *stage_name*.
        """
        try:
            started_at = self.stats[stage_name]
        except KeyError:
            # Same exception type as before, but say what actually went wrong.
            raise KeyError(
                f"start_timer() was never called for stage {stage_name!r}"
            ) from None
        duration = time.perf_counter() - started_at
        print(f"{stage_name} completed in {duration:.2f} seconds.")
        return duration

# Begin timing the evaluation run; the matching end_timer() call happens in
# the export section, just before the pipeline-performance table is built.
profiler = PipelineProfiler()
profiler.start_timer("Model Evaluation")

# --- 2./3. INPUT LOCATIONS ---
# Silver layer from Notebook 2 (per the original note it already contains the
# 'final_features' column) and the Random Forest persisted as the winner by
# Notebook 3.
silver_path = "/Volumes/workspace/default/uk_land_registry/silver_engineered_parquet"
model_path = "/Volumes/workspace/default/uk_land_registry/models/rf_model"

# --- 2. LOAD ML-READY DATA ---
eval_data = spark.read.parquet(silver_path)

# --- 3. LOAD WINNER (Random Forest) & GENERATE PREDICTIONS ---
model = RandomForestClassificationModel.load(model_path)

# Score every row that was loaded.
# NOTE(review): eval_data is the whole Silver dataset as read above, not an
# explicit held-out split -- confirm this is the intended "test" data.
predictions = model.transform(eval_data)

# --- 4. CALCULATE METRICS (Requirement 4a) ---
# One evaluator is reused for all metrics; the metric is chosen per call by
# overriding 'metricName' through the param map passed to evaluate().
evaluator = MulticlassClassificationEvaluator(labelCol="type_label", predictionCol="prediction")

# Evaluated in this order: accuracy, weightedPrecision, weightedRecall.
# Each evaluate() call is a separate pass over 'predictions'.
metric_values = {
    metric: evaluator.evaluate(predictions, {evaluator.metricName: metric})
    for metric in ("accuracy", "weightedPrecision", "weightedRecall")
}
accuracy = metric_values["accuracy"]
weightedPrecision = metric_values["weightedPrecision"]
weightedRecall = metric_values["weightedRecall"]

# The printed "Precision" and "Recall" figures are the weighted variants.
separator = "-" * 30
print(separator)
print("WINNER MODEL: Random Forest (Optimized with City Context)")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {weightedPrecision:.4f}")
print(f"Recall: {weightedRecall:.4f}")
print(separator)

# --- 5. MODEL INTERPRETATION (Feature Importance) ---
# A Random Forest has no coefficients; the loaded model exposes
# 'featureImportances' instead.
try:
    importances = model.featureImportances
    print("--- MODEL INTERPRETATION (Feature Importance) ---")

    # Per the original note, slot 0 is Price and slot 1 is City_Label.
    # TODO confirm against the feature order used when the vector was assembled.
    price_weight = importances[0]
    print(f"Price Influence: {price_weight:.4f}")
    city_weight = importances[1]
    print(f"City/Location Influence: {city_weight:.4f}")

    # Price wins only when strictly larger, so a tie reports City/Location.
    if price_weight > city_weight:
        dominant = "Price"
    else:
        dominant = "City/Location"
    print(f"Analysis: {dominant} was the primary factor in determining property type.")
except Exception as err:
    # Best-effort section: a failure here is reported but must not stop the
    # gold-layer exports that follow.
    print(f"Interpretation error: {err}")

# --- 6. EXPORT GOLD LAYERS (Requirement 3) ---

def _write_single_csv(frame, target_dir):
    """Overwrite *target_dir* with *frame* written as CSV with a header row.

    The frame is coalesced to one partition first so the export is produced
    from a single partition rather than one part file per partition.
    """
    writer = frame.coalesce(1).write.mode("overwrite").option("header", "true")
    writer.csv(target_dir)


# A. Model Performance for Tableau Dashboard 2
# Capped at 100,000 rows. No ordering is applied before limit(), so which
# rows are kept is left to Spark.
gold_columns = ("Price", "type_label", "prediction")
gold_evaluation = predictions.select(*gold_columns).limit(100_000)
_write_single_csv(
    gold_evaluation,
    "/Volumes/workspace/default/uk_land_registry/gold_model_performance",
)

# B. Pipeline Performance for Tableau Dashboard 4
# The timer is stopped here, so the reported evaluation time includes the
# export above but not the pipeline-performance write below.
eval_duration = profiler.end_timer("Model Evaluation")

# Durations of the earlier stages are fixed figures copied from the Master
# Pipeline run; only the evaluation stage is measured in this notebook.
upstream_stages = [
    ("Ingestion", 170.05),
    ("Feature Engineering", 66.52),
    ("Model Training", 204.57),
]
performance_data = upstream_stages + [("Evaluation", eval_duration)]

perf_df = spark.createDataFrame(performance_data, ["Stage", "Duration_Sec"])
_write_single_csv(
    perf_df,
    "/Volumes/workspace/default/uk_land_registry/gold_pipeline_performance",
)

print("Notebook 4 Complete: Gold Layers exported with Winning RF Model results.")

Starting Model Evaluation...
------------------------------
WINNER MODEL: Random Forest (Optimized with City Context)
Accuracy: 0.4263
Precision: 0.4495
Recall: 0.4263
------------------------------
--- MODEL INTERPRETATION (Feature Importance) ---
Price Influence: 0.5202
City/Location Influence: 0.4798
Analysis: Price was the primary factor in determining property type.
Model Evaluation completed in 96.27 seconds.
Notebook 4 Complete: Gold Layers exported with Winning RF Model results.
