In [0]:
from pyspark.ml.classification import DecisionTreeClassificationModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# ---------------------------------------------------------------------------
# Notebook 4: score the persisted decision tree, print headline metrics,
# show feature importances, and export a prediction extract for the dashboard.
# Relies on the ambient `spark` session provided by the notebook runtime.
# ---------------------------------------------------------------------------

# Step 1 - restore the saved model and read the dataset to be scored.
# NOTE(review): the path below is named "silver_engineered_parquet"; nothing
# here shows it is a held-out split. Confirm it does not overlap the data the
# model was trained on, otherwise the metrics below are optimistic.
model = DecisionTreeClassificationModel.load(
    "/Volumes/workspace/default/uk_land_registry/models/best_dt_model"
)
test_data = spark.read.parquet(
    "/Volumes/workspace/default/uk_land_registry/silver_engineered_parquet"
)

# Step 2 - append the model's output columns (including "prediction").
predictions = model.transform(test_data)

# Step 3 - headline metrics (Requirement 4a).
# A single evaluator is reused; the metric is overridden per call through a
# param map so the evaluator's own configured metricName is left untouched.
evaluator = MulticlassClassificationEvaluator(
    labelCol="type_label",
    predictionCol="prediction",
)

accuracy, weightedPrecision, weightedRecall = (
    evaluator.evaluate(predictions, {evaluator.metricName: metric_name})
    for metric_name in ("accuracy", "weightedPrecision", "weightedRecall")
)

# One line per metric, each rounded to four decimal places.
print(
    *(
        f"{title}: {score:.4f}"
        for title, score in (
            ("Model Accuracy", accuracy),
            ("Weighted Precision", weightedPrecision),
            ("Weighted Recall", weightedRecall),
        )
    ),
    sep="\n",
)

# Step 4 - feature importance (Requirement 2a - 'Feature Importance Plots').
# featureImportances is a vector with one weight per input feature; only the
# raw vector is printed here, no plot is drawn in this cell.
# NOTE(review): whether the underlying gain is entropy- or gini-based depends
# on the impurity setting the model was trained with - confirm upstream.
importance = model.featureImportances
print(f"Feature Importance (Price vs Categories): {importance}")

# Step 5 - export the gold extract for Dashboard 2 (Model Performance).
# Only the price, the true label and the predicted label are kept so the
# dashboard can compare actual vs predicted classes.
# NOTE(review): limit() caps the row count but is not a random sample; confirm
# that is acceptable for the dashboard.
gold_evaluation = predictions.select(
    "Price",
    "type_label",
    "prediction",
).limit(100000)

# coalesce(1) collapses the output to a single partition before the CSV
# write; any existing export at the target path is overwritten.
(
    gold_evaluation
    .coalesce(1)
    .write
    .mode("overwrite")
    .option("header", "true")
    .csv("/Volumes/workspace/default/uk_land_registry/gold_model_performance")
)

print("Notebook 4 Complete: Metrics calculated and Gold Layer 2 exported.")

Model Accuracy: 0.3725
Weighted Precision: 0.3037
Weighted Recall: 0.3725
Feature Importance (Price vs Categories): (1,[0],[1.0])
Notebook 4 Complete: Metrics calculated and Gold Layer 2 exported.
