In [0]:
import os

# Scratch location for Spark ML, kept on the project's existing Volume
# (Requirement 1b: memory management and persist strategy).
temp_path = "/Volumes/workspace/default/uk_land_registry/ml_temp"

# Publish the location through the SPARKML_TEMP_DFS_PATH environment variable.
# NOTE(review): presumably Spark ML reads this when it needs temporary
# storage - confirm against the Databricks documentation for this runtime.
os.environ["SPARKML_TEMP_DFS_PATH"] = temp_path

# Make sure the directory is present before any training cell runs.
dbutils.fs.mkdirs(temp_path)

print(f"Environment variable set. Spark ML will now use: {temp_path}")

Environment variable set. Spark ML will now use: /Volumes/workspace/default/uk_land_registry/ml_temp


In [0]:
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Single seed shared by every randomised step below, so the train/test split
# AND the cross-validation folds are identical on every run.
SEED = 42

# 1. Load our Scaled Silver Data
data = spark.read.parquet("/Volumes/workspace/default/uk_land_registry/silver_engineered_parquet")

# 2. Data Splitting Strategy (Requirement 4a)
# 80/20 train/test split; the fixed seed makes the split reproducible.
train_df, test_df = data.randomSplit([0.8, 0.2], seed=SEED)

# 3. Model Selection: Decision Tree (Requirement 2a)
dt = DecisionTreeClassifier(labelCol="type_label", featuresCol="scaled_features")

# 4. Hyperparameter Tuning (Requirement 2b)
# Candidate tree depths; every candidate is fitted once per fold, so this grid
# of 2 values with 3 folds trains 6 models plus one final refit.
paramGrid = ParamGridBuilder() \
    .addGrid(dt.maxDepth, [5, 10]) \
    .build()

# 5. Distributed Training with Cross-Validation (Requirement 2b)
# NOTE: CrossValidator assigns rows to folds at random - it does NOT stratify
# by label. If stratified folds are required, a per-class fold index column
# would have to be built and supplied via CrossValidator's `foldCol` parameter.
evaluator = MulticlassClassificationEvaluator(labelCol="type_label", predictionCol="prediction", metricName="accuracy")

cv = CrossValidator(estimator=dt,
                    estimatorParamMaps=paramGrid,
                    evaluator=evaluator,
                    numFolds=3,  # 3 folds keeps tuning time manageable on large data
                    seed=SEED)   # without a seed the folds (and the winning depth) can change between runs

print("Starting Distributed Training & Tuning... This may take a few minutes.")
cv_model = cv.fit(train_df)

# 6. Model Serialization (Requirement 2a)
# Persist only the best model (refit on the full training split) so it can be
# loaded in the Evaluation notebook; an existing model at this path is replaced.
cv_model.bestModel.write().overwrite().save("/Volumes/workspace/default/uk_land_registry/models/best_dt_model")

print("Notebook 3 Complete: Model Tuned, Trained, and Serialized.")

Starting Distributed Training & Tuning... This may take a few minutes.
Notebook 3 Complete: Model Tuned, Trained, and Serialized.
