In [0]:
import os

# --- TECHNICAL REQUIREMENT: Serverless ML Scratch Space ---
# Spark ML is given a UC Volume directory to use for its temporary files.
temp_ml_path = "/Volumes/workspace/default/uk_land_registry/ml_temp"

# Create the scratch directory first ...
dbutils.fs.mkdirs(temp_ml_path)

# ... then advertise it through the environment variable this notebook relies on
# so that Spark ML (e.g. CrossValidator) knows where to spill temporary data.
os.environ.update({'SPARKML_TEMP_DFS_PATH': temp_ml_path})

print(f"Serverless ML environment configured. Temp path set to: {temp_ml_path}")

Serverless ML environment configured. Temp path set to: /Volumes/workspace/default/uk_land_registry/ml_temp


In [0]:
import os
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier, RandomForestClassifier
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import col, round

# --- 0. Configuration ---
# Values a reader may want to tune are named once here instead of being
# repeated as string/number literals throughout the cell.
VOLUME_ROOT = "/Volumes/workspace/default/uk_land_registry"
LABEL_COL = "type_label"
FEATURES_COL = "scaled_features"
SPLIT_YEAR = 2023  # Sale_Year < SPLIT_YEAR -> training set, otherwise -> test set
SEED = 42          # fixed seed for the tree-based estimators so re-runs are reproducible

# --- TECHNICAL REQUIREMENT: Serverless ML Scratch Space ---
# Point Spark ML at a UC Volume directory for its temporary files.
temp_ml_path = f"{VOLUME_ROOT}/ml_temp"
os.environ['SPARKML_TEMP_DFS_PATH'] = temp_ml_path
dbutils.fs.mkdirs(temp_ml_path)

# --- 1. Load Data & Apply Temporal Split (Requirement 4a) ---
# We train on the past (pre-2023) and test on the future (2023+)
data = spark.read.parquet(f"{VOLUME_ROOT}/silver_engineered_parquet")

# count() runs one full scan and reports the row count. It does NOT cache or
# persist `data`: every later fit/transform re-reads the parquet source.
print(f"Materializing {data.count()} rows for stable processing...")

train_df = data.filter(col("Sale_Year") < SPLIT_YEAR)
test_df = data.filter(col("Sale_Year") >= SPLIT_YEAR)

print("Temporal Split Complete: Training on pre-2023 data, Testing on 2023-2024 data.")

# --- 2. Initialize the 4 Algorithm Families (Requirement 2a) ---
# All four estimators read the same label and feature columns.
shared_cols = {"labelCol": LABEL_COL, "featuresCol": FEATURES_COL}

# 1. Logistic Regression (Linear Classification)
lr = LogisticRegression(maxIter=10, **shared_cols)

# 2. Decision Tree (Non-linear Logic)
dt = DecisionTreeClassifier(seed=SEED, **shared_cols)

# 3. Random Forest (Ensemble Learning)
rf = RandomForestClassifier(numTrees=10, seed=SEED, **shared_cols)

# 4. Linear Regression (Regression-to-Classification via Rounding)
# NOTE(review): this regresses on the numeric value of the class label, which
# implicitly treats the label indices as ordered -- confirm that is intended.
lin_reg = LinearRegression(maxIter=10, **shared_cols)

# Keys double as the file-name prefix used when the models are saved below.
estimators = {"lr": lr, "dt": dt, "rf": rf, "lin": lin_reg}

# --- 3. The Training Factory ---
print("Training started for 4 distinct algorithm families...")
# dicts preserve insertion order, so models are fitted in the order listed above.
fitted_models = {key: estimator.fit(train_df) for key, estimator in estimators.items()}

# Keep the individual names bound for any code that refers to them directly.
lr_model = fitted_models["lr"]
dt_model = fitted_models["dt"]
rf_model = fitted_models["rf"]
lin_model = fitted_models["lin"]

# --- 4. Evaluation ---
evaluator = MulticlassClassificationEvaluator(labelCol=LABEL_COL, predictionCol="prediction", metricName="accuracy")

# Special handling for Algo 4: convert the continuous regression output to a
# discrete label. `round` here is pyspark.sql.functions.round (imported at the
# top of this cell), not Python's builtin round.
# NOTE(review): a rounded value that is not a valid label simply scores as a miss.
lin_predictions = lin_model.transform(test_df).withColumn("prediction", round(col("prediction")))

print("Evaluating all models...")
test_predictions = {
    "lr": lr_model.transform(test_df),
    "dt": dt_model.transform(test_df),
    "rf": rf_model.transform(test_df),
    "lin": lin_predictions,
}
accuracies = {key: evaluator.evaluate(preds) for key, preds in test_predictions.items()}

lr_acc = accuracies["lr"]
dt_acc = accuracies["dt"]
rf_acc = accuracies["rf"]
lin_acc = accuracies["lin"]

print("-" * 30)
print(f"1. Logistic Regression Accuracy: {lr_acc:.4f}")
print(f"2. Decision Tree Accuracy: {dt_acc:.4f}")
print(f"3. Random Forest Accuracy: {rf_acc:.4f}")
print(f"4. Linear Regression (Rounded) Accuracy: {lin_acc:.4f}")
print("-" * 30)

# --- 5. Serialization (Requirement 2a) ---
# Each model is written to <VOLUME_ROOT>/models/<key>_model, replacing any
# previous copy (lr_model, dt_model, rf_model, lin_model).
models_dir = f"{VOLUME_ROOT}/models"
for key, model in fitted_models.items():
    model.write().overwrite().save(f"{models_dir}/{key}_model")

print("Notebook 3 Complete: 4 Algorithms Serialized successfully.")

Materializing 30906560 rows for stable processing...
Temporal Split Complete: Training on pre-2023 data, Testing on 2023-2024 data.
Training started for 4 distinct algorithm families...
Evaluating all models...
------------------------------
1. Logistic Regression Accuracy: 0.3676
2. Decision Tree Accuracy: 0.3381
3. Random Forest Accuracy: 0.3394
4. Linear Regression (Rounded) Accuracy: 0.2743
------------------------------
Notebook 3 Complete: 4 Algorithms Serialized successfully.
