In [0]:
import os

# --- Serverless ML scratch space ---
# Spark ML is pointed at a Unity Catalog Volume folder for its temporary
# (checkpoint / spill) data; this is the folder used for that purpose.
temp_ml_path = "/Volumes/workspace/default/uk_land_registry/ml_temp"

# Create the folder up front so it is present before any training cell runs.
dbutils.fs.mkdirs(temp_ml_path)

# Publish the folder through the SPARKML_TEMP_DFS_PATH environment variable,
# which is where the CrossValidator is expected to look for its scratch path.
os.environ.update({'SPARKML_TEMP_DFS_PATH': temp_ml_path})

# Echo the configured location so it is visible in the cell output.
print(f"Serverless ML environment configured. Temp path set to: {temp_ml_path}")

Serverless ML environment configured. Temp path set to: /Volumes/workspace/default/uk_land_registry/ml_temp


In [0]:
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier, LogisticRegression
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# --- Configuration: reproducibility and storage locations ---
# A single seed is shared by every randomized step in this cell: the
# train/test split, the CrossValidator fold assignment and the Random Forest
# sampling. Without an explicit seed the library falls back to a default that
# is not guaranteed to be the same between sessions, so the tuned and saved
# models could differ from run to run.
SEED = 42

# Root of the Unity Catalog Volume used by this notebook, and the folder
# under it that receives the serialized models.
VOLUME_ROOT = "/Volumes/workspace/default/uk_land_registry"
MODELS_DIR = f"{VOLUME_ROOT}/models"

# 1. Load our Scaled Silver Data
data = spark.read.parquet(f"{VOLUME_ROOT}/silver_engineered_parquet")

# 2. Data Splitting Strategy (80% train / 20% test, seeded)
train_df, test_df = data.randomSplit([0.8, 0.2], seed=SEED)

# --- TECHNICAL REQUIREMENT: Define Evaluator ---
# This must be defined BEFORE the CrossValidator uses it.
# Accuracy is the metric the CrossValidator uses to pick the best maxDepth.
evaluator = MulticlassClassificationEvaluator(
    labelCol="type_label",
    predictionCol="prediction",
    metricName="accuracy",
)

# --- TECHNICAL REQUIREMENT 2a: Implement at least 3 MLlib algorithms ---

# Algorithm 1: Decision Tree (with Hyperparameter Tuning)
dt = DecisionTreeClassifier(labelCol="type_label", featuresCol="scaled_features")
paramGrid = ParamGridBuilder().addGrid(dt.maxDepth, [5, 10]).build()

# seed pins the assignment of rows to the 3 folds, so the winning maxDepth
# (and therefore the saved bestModel) is repeatable.
cv = CrossValidator(estimator=dt,
                    estimatorParamMaps=paramGrid,
                    evaluator=evaluator,
                    numFolds=3,
                    seed=SEED)

print("Training Algorithm 1: Decision Tree (with CV)...")
dt_model = cv.fit(train_df)

# Algorithm 2: Random Forest
# seed pins the forest's random sampling, so refitting reproduces the same
# ensemble.
print("Training Algorithm 2: Random Forest...")
rf = RandomForestClassifier(labelCol="type_label", featuresCol="scaled_features",
                            numTrees=10, seed=SEED)
rf_model = rf.fit(train_df)

# Algorithm 3: Logistic Regression
print("Training Algorithm 3: Logistic Regression...")
lr = LogisticRegression(labelCol="type_label", featuresCol="scaled_features", maxIter=10)
lr_model = lr.fit(train_df)

# --- TECHNICAL REQUIREMENT 2a: Model Serialization ---
# Persist each fitted model under MODELS_DIR; overwrite() keeps the cell
# re-runnable. For the decision tree only the CV winner (bestModel) is
# saved, not the CrossValidatorModel wrapper.
dt_model.bestModel.write().overwrite().save(f"{MODELS_DIR}/best_dt_model")
rf_model.write().overwrite().save(f"{MODELS_DIR}/rf_model")
lr_model.write().overwrite().save(f"{MODELS_DIR}/lr_model")

print("Notebook 3 Complete: 3 Algorithms Trained and Serialized.")

Training Algorithm 1: Decision Tree (with CV)...
Training Algorithm 2: Random Forest...
Training Algorithm 3: Logistic Regression...
Notebook 3 Complete: 3 Algorithms Trained and Serialized.
