In [0]:
from pyspark.ml.classification import LogisticRegressionModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# 1. Load the winning model (Logistic Regression) and the data to score.
# Per the original note, the model folder was saved by Notebook 3.
LR_MODEL_PATH = "/Volumes/workspace/default/uk_land_registry/models/lr_model"
TEST_DATA_PATH = "/Volumes/workspace/default/uk_land_registry/silver_engineered_parquet"

model = LogisticRegressionModel.load(LR_MODEL_PATH)

# NOTE(review): the path name suggests this is the full engineered silver
# dataset rather than a held-out split -- confirm it is the intended test set,
# otherwise the metrics computed from it include rows the model was trained on.
test_data = spark.read.parquet(TEST_DATA_PATH)

# 2. Generate predictions by scoring every loaded row with the model.
predictions = model.transform(test_data)

# 3. Calculate key metrics (Requirement 4a)
evaluator = MulticlassClassificationEvaluator(
    labelCol="type_label",
    predictionCol="prediction",
)

# One evaluate() call per metric. The metric name is supplied as a per-call
# param override, so the evaluator object itself is not reconfigured.
metric_scores = {
    metric: evaluator.evaluate(predictions, {evaluator.metricName: metric})
    for metric in ("accuracy", "weightedPrecision", "weightedRecall")
}

# Keep the individual names available for the rest of the notebook.
accuracy = metric_scores["accuracy"]
weightedPrecision = metric_scores["weightedPrecision"]
weightedRecall = metric_scores["weightedRecall"]

print(f"Winner Model (Logistic Regression) Accuracy: {accuracy:.4f}")
print(f"Weighted Precision: {weightedPrecision:.4f}")
print(f"Weighted Recall: {weightedRecall:.4f}")

# --- 4. Feature Influence (Requirement 2a - Interpretation) ---
# A multiclass logistic regression exposes its weights as a coefficient
# matrix plus an intercept vector instead of a single coefficient vector.

try:
    # One matrix row per label and one column per feature; the report loop
    # below reads only column 0, which the original notes identify as Price.
    matrix = model.coefficientMatrix
    intercepts = model.interceptVector

    print("--- MODEL INTERPRETATION DATA ---")
    print(f"Coefficient Matrix (Rows=Labels, Cols=Features):\n{matrix}")
    print(f"Intercept Vector per Class:\n{intercepts}")

    # Report-ready summary: one line per label index with its first-feature
    # coefficient, rounded to four decimals.
    for label_idx, label_row in enumerate(matrix.toArray()):
        price_coeff = label_row[0]
        print(f"Property Type {label_idx} (Label {label_idx}) influence by Price: {price_coeff:.4f}")

except Exception as err:
    # Best-effort section: interpretation output is optional, so any failure
    # is reported and the notebook carries on to the export step.
    print(f"Could not extract coefficients directly: {err}")

# 5. Export the final Gold data for Dashboard 2 (Model Performance)
GOLD_OUTPUT_PATH = "/Volumes/workspace/default/uk_land_registry/gold_model_performance"
GOLD_ROW_CAP = 100000  # capped at 100k rows so Tableau doesn't lag

# Only the columns the dashboard needs. limit() is applied without an explicit
# ordering, so which rows are kept is whatever Spark returns first.
gold_evaluation = predictions.select("Price", "type_label", "prediction").limit(GOLD_ROW_CAP)

# coalesce(1) collapses the output to a single partition; any existing export
# at the target path is overwritten, and a header row is included.
(
    gold_evaluation.coalesce(1)
    .write.mode("overwrite")
    .option("header", "true")
    .csv(GOLD_OUTPUT_PATH)
)

print("Notebook 4 Complete: Winner metrics calculated and Gold Layer exported.")

Winner Model (Logistic Regression) Accuracy: 0.3541
Weighted Precision: 0.2882
Weighted Recall: 0.3541
--- MODEL INTERPRETATION DATA ---
Coefficient Matrix (Rows=Labels, Cols=Features):
DenseMatrix([[-1.98333005],
             [-1.17752791],
             [ 1.34835099],
             [-0.06797972],
             [ 1.88048669]])
Intercept Vector per Class:
[0.6915935342989182,0.6570068166322911,0.5469963971895659,0.2878638930163585,-2.183460641137134]
Property Type 0 (Label 0) influence by Price: -1.9833
Property Type 1 (Label 1) influence by Price: -1.1775
Property Type 2 (Label 2) influence by Price: 1.3484
Property Type 3 (Label 3) influence by Price: -0.0680
Property Type 4 (Label 4) influence by Price: 1.8805
Notebook 4 Complete: Winner metrics calculated and Gold Layer exported.
