In [0]:
import os
import time
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier, RandomForestClassifier
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.sql.functions import col, round

# --- 1. PERFORMANCE PROFILER SETUP ---
class PipelineProfiler:
    """Minimal elapsed-time profiler for named pipeline stages.

    A stage is identified by a free-form name. ``start_timer`` stores a
    clock reading for the stage and ``end_timer`` reports how many seconds
    have passed since that reading.
    """

    def __init__(self):
        # Maps stage name -> clock reading taken when the stage was started.
        self.stats = {}

    def start_timer(self, stage_name):
        """Record the start of ``stage_name`` and announce it on stdout.

        Starting the same stage again overwrites the earlier reading.
        """
        # perf_counter() is monotonic: unlike time.time(), the measured
        # duration cannot be skewed (or go negative) when the system wall
        # clock is adjusted while a long-running stage is in flight.
        self.stats[stage_name] = time.perf_counter()
        print(f"Starting {stage_name}...")

    def end_timer(self, stage_name):
        """Print and return the seconds elapsed since ``stage_name`` started.

        Raises:
            KeyError: if ``start_timer`` was never called for ``stage_name``.
        """
        duration = time.perf_counter() - self.stats[stage_name]
        print(f"{stage_name} completed in {duration:.2f} seconds.")
        return duration

# The stage timer is started before any Spark work, so the reported duration
# covers everything between this point and the matching end_timer() call.
profiler = PipelineProfiler()
profiler.start_timer("Model Training & Serialization")

# --- 2. ENVIRONMENT & DATA PREP ---
# The scratch directory is exported through SPARKML_TEMP_DFS_PATH and created
# up front. NOTE(review): presumably Spark ML uses it as temporary storage
# when fitting/saving models in this workspace -- confirm.
temp_ml_path = "/Volumes/workspace/default/uk_land_registry/ml_temp"
os.environ["SPARKML_TEMP_DFS_PATH"] = temp_ml_path
dbutils.fs.mkdirs(temp_ml_path)

silver_parquet_path = "/Volumes/workspace/default/uk_land_registry/silver_engineered_parquet"
data = spark.read.parquet(silver_parquet_path)

# --- 3. GEOGRAPHIC FEATURE ENCODING ---
# Town_City is mapped to a numeric index (city_label) to give the models
# geographic context; handleInvalid="skip" drops rows that cannot be indexed.
# NOTE(review): the indexer is fitted on the full dataset, i.e. before the
# train/test split below -- confirm that is intended.
city_indexer = StringIndexer(
    inputCol="Town_City",
    outputCol="city_label",
    handleInvalid="skip",
)
city_indexer_model = city_indexer.fit(data)
data_with_city = city_indexer_model.transform(data)

# Final feature vector = upstream scaled_features + the city index
# (described by the original author as "Price + City").
assembler = VectorAssembler(
    inputCols=["scaled_features", "city_label"],
    outputCol="final_features",
)
final_data = assembler.transform(data_with_city)

# Temporal Split (Requirement 4a): sales before 2023 train the models,
# sales from 2023 onwards are held out for testing.
sale_year = col("Sale_Year")
train_df = final_data.filter(sale_year < 2023)
test_df = final_data.filter(sale_year >= 2023)

# --- 4. ALGORITHM INITIALIZATION (Fixing maxBins Error) ---
# Every estimator learns type_label from the assembled final_features column.
shared_cols = {"labelCol": "type_label", "featuresCol": "final_features"}

# The tree-based learners get maxBins=1200 because, per the original note,
# Town_City has 1,173 distinct values.
tree_max_bins = 1200

lr = LogisticRegression(maxIter=20, **shared_cols)
dt = DecisionTreeClassifier(maxBins=tree_max_bins, **shared_cols)
rf = RandomForestClassifier(numTrees=20, maxBins=tree_max_bins, **shared_cols)
lin_reg = LinearRegression(maxIter=20, **shared_cols)

# --- 5. THE TRAINING FACTORY ---
print("Training models with Geographic Context and adjusted maxBins...")
# Fit order is LR, DT, RF, LinReg -- each on the same training split.
lr_model, dt_model, rf_model, lin_model = [
    estimator.fit(train_df) for estimator in (lr, dt, rf, lin_reg)
]

# --- 6. EVALUATION ---
# A single accuracy evaluator scores every model's "prediction" column
# against type_label on the held-out test split.
evaluator = MulticlassClassificationEvaluator(
    labelCol="type_label",
    predictionCol="prediction",
    metricName="accuracy",
)

# Algo 4 conversion: LinearRegression emits continuous values, so they are
# rounded before being compared with type_label. `round` here is
# pyspark.sql.functions.round (imported at the top), not the Python builtin.
lin_raw_predictions = lin_model.transform(test_df)
lin_predictions = lin_raw_predictions.withColumn("prediction", round(col("prediction")))

lr_predictions = lr_model.transform(test_df)
lr_acc = evaluator.evaluate(lr_predictions)

dt_predictions = dt_model.transform(test_df)
dt_acc = evaluator.evaluate(dt_predictions)

rf_predictions = rf_model.transform(test_df)
rf_acc = evaluator.evaluate(rf_predictions)

lin_acc = evaluator.evaluate(lin_predictions)

# --- 7. SERIALIZATION ---
# Each fitted model is written under model_path, replacing any earlier copy.
# NOTE(review): the fitted Town_City StringIndexer is not saved in this cell;
# scoring new data with these models needs the same city -> index mapping,
# so confirm it is persisted elsewhere.
model_path = "/Volumes/workspace/default/uk_land_registry/models/"
fitted_models = (
    ("lr_model", lr_model),
    ("dt_model", dt_model),
    ("rf_model", rf_model),
    ("lin_model", lin_model),
)
for artifact_name, fitted_model in fitted_models:
    fitted_model.write().overwrite().save(f"{model_path}{artifact_name}")

# --- 8. PERFORMANCE LOGGING ---
# The timer was started before data loading, so this duration spans data
# prep, training, evaluation and serialization -- not the fit calls alone.
train_duration = profiler.end_timer("Model Training & Serialization")

separator = "-" * 30
summary_lines = [
    separator,
    "FINAL TRAINING STATS:",
    f"Total Training Time: {train_duration:.2f} seconds",
    f"LR Accuracy: {lr_acc:.4f} | DT Accuracy: {dt_acc:.4f}",
    f"RF Accuracy: {rf_acc:.4f} | LinReg Accuracy: {lin_acc:.4f}",
    separator,
]
print("\n".join(summary_lines))

Starting Model Training & Serialization...
Training models with Geographic Context and adjusted maxBins...
Model Training & Serialization completed in 289.96 seconds.
------------------------------
FINAL TRAINING STATS:
Total Training Time: 289.96 seconds
LR Accuracy: 0.3617 | DT Accuracy: 0.4021
RF Accuracy: 0.4112 | LinReg Accuracy: 0.2741
------------------------------
