In [0]:
from pyspark.ml.classification import LogisticRegressionModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import col
import time

# --- 1. PERFORMANCE PROFILER SETUP ---
class PipelineProfiler:
    """Minimal stopwatch for timing named pipeline stages.

    ``stats`` maps each stage name to the clock reading captured by
    ``start_timer``; ``end_timer`` converts that reading into an elapsed
    duration in seconds.
    """

    def __init__(self):
        # stage name -> time.perf_counter() reading taken when the stage started
        self.stats = {}

    def start_timer(self, stage_name):
        """Record the start of *stage_name* and announce it on stdout.

        Calling this again for the same stage restarts its clock.
        """
        # perf_counter() is monotonic, so the measured duration cannot be
        # skewed (or go negative) if the system wall clock is adjusted mid-run.
        self.stats[stage_name] = time.perf_counter()
        print(f"Starting {stage_name}...")

    def end_timer(self, stage_name):
        """Print and return the seconds elapsed since ``start_timer(stage_name)``.

        The start reading is left in ``stats``, so the method may be called
        more than once for the same stage.

        Raises:
            KeyError: if ``start_timer`` was never called for *stage_name*.
        """
        try:
            started_at = self.stats[stage_name]
        except KeyError:
            # Same exception type as a plain dict lookup, but with a message
            # that says what actually went wrong.
            raise KeyError(
                f"Timer for stage '{stage_name}' was never started"
            ) from None
        duration = time.perf_counter() - started_at
        print(f"{stage_name} completed in {duration:.2f} seconds.")
        return duration

# Start timing before any model or data loading so the load is part of the
# "Model Evaluation" stage.
profiler = PipelineProfiler()
profiler.start_timer("Model Evaluation")

# --- 2. LOAD WINNER & DATA ---
# Location of the saved logistic-regression model.
model_path = "/Volumes/workspace/default/uk_land_registry/models/lr_model"
model = LogisticRegressionModel.load(path=model_path)

# Engineered silver-layer parquet that the model is scored against.
# NOTE(review): the whole table is read here with no visible train/test
# split — confirm this is really held-out data.
test_data = spark.read.parquet(
    "/Volumes/workspace/default/uk_land_registry/silver_engineered_parquet"
)

# 3. GENERATE PREDICTIONS
# Score every row; the resulting "prediction" column is what the steps below read.
predictions = model.transform(dataset=test_data)

# --- 4. CALCULATE METRICS (Requirement 4a) ---
# One evaluator is reused for every metric; the metric is selected per call by
# overriding ``metricName`` through the param map passed to evaluate().
evaluator = MulticlassClassificationEvaluator(
    labelCol="type_label",
    predictionCol="prediction",
)

# Evaluated strictly in this order: accuracy, weighted precision, weighted recall.
accuracy, weightedPrecision, weightedRecall = (
    evaluator.evaluate(predictions, {evaluator.metricName: metric})
    for metric in ("accuracy", "weightedPrecision", "weightedRecall")
)

# Summary banner; "Precision" and "Recall" are the class-weighted variants.
print(
    "\n".join(
        (
            "-" * 30,
            "WINNER MODEL: Logistic Regression",
            f"Accuracy: {accuracy:.4f}",
            f"Precision: {weightedPrecision:.4f}",
            f"Recall: {weightedRecall:.4f}",
            "-" * 30,
        )
    )
)

# --- 5. FEATURE INFLUENCE (Requirement 2a) ---
# Best-effort report: any failure while reading the coefficients is printed
# instead of aborting the notebook.
try:
    matrix = model.coefficientMatrix
    print("--- MODEL INTERPRETATION (Influence of Price on Type) ---")
    # One row per label; only column 0 is reported because, per the original
    # author, 'Price' is the only feature in the vector.
    for label_idx, price_weight in enumerate(matrix.toArray()[:, 0]):
        print(f"Label {label_idx} Weight: {price_weight:.4f}")
except Exception as err:
    print(f"Interpretation error: {err}")

# --- 6. EXPORT GOLD LAYERS (Requirement 3) ---

# A. Model Performance for Tableau Dashboard 2
# Capped at 100k rows so Dashboard 2 stays responsive.
# NOTE: limit() keeps the first 100,000 rows Spark returns; it is not a
# random sample of the predictions.
gold_evaluation = (
    predictions
    .select("Price", "type_label", "prediction")
    .limit(100_000)
)

# coalesce(1) so the export lands in a single CSV part file, with a header row.
(
    gold_evaluation.coalesce(1)
    .write.mode("overwrite")
    .option("header", "true")
    .csv("/Volumes/workspace/default/uk_land_registry/gold_model_performance")
)

# B. Pipeline Performance for Tableau Dashboard 4
# Stop the "Model Evaluation" timer here, before this table is written, so the
# recorded duration excludes the pipeline-performance export itself.
eval_duration = profiler.end_timer("Model Evaluation")

# !!! MANUAL STEP: the first three durations are hard-coded placeholders.
# Replace them with the ACTUAL seconds measured in Notebooks 1, 2 and 3. !!!
performance_data = [
    ("Ingestion", 88.5),             # placeholder -> Notebook 1 actual
    ("Feature Engineering", 212.3),  # placeholder -> Notebook 2 actual
    ("Model Training", 545.1),       # placeholder -> Notebook 3 actual
    ("Evaluation", eval_duration),   # measured above by the profiler
]

# Two-column table: stage name and its duration in seconds.
perf_df = spark.createDataFrame(
    performance_data,
    schema=["Stage", "Duration_Sec"],
)

# Single CSV part file with a header row, overwriting any previous export.
(
    perf_df.coalesce(1)
    .write.mode("overwrite")
    .option("header", "true")
    .csv("/Volumes/workspace/default/uk_land_registry/gold_pipeline_performance")
)

print("Notebook 4 Complete: Gold Layers exported for Tableau.")

Starting Model Evaluation...
------------------------------
WINNER MODEL: Logistic Regression
Accuracy: 0.3541
Precision: 0.2882
Recall: 0.3541
------------------------------
--- MODEL INTERPRETATION (Influence of Price on Type) ---
Label 0 Weight: -1.9833
Label 1 Weight: -1.1775
Label 2 Weight: 1.3484
Label 3 Weight: -0.0680
Label 4 Weight: 1.8805
Model Evaluation completed in 72.69 seconds.
Notebook 4 Complete: Gold Layers exported for Tableau.
