# Task B

The steps performed will be:

- Model Training

- Model Management

    Use MLflow (or a similar model management framework) to track the entire pipeline, including the models, their hyperparameters, and their evaluation metrics, and to tag the best model for deployment.

In [1]:
import os

# Windows-only Spark setup: HADOOP_HOME must be the parent folder of the 'bin'
# directory that contains winutils.exe.
# NOTE(review): these are machine-specific absolute paths; adjust them per machine.
hadoop_home = r"C:\winutils-master\hadoop-3.0.0"
hadoop_bin = hadoop_home + r"\bin"
os.environ["HADOOP_HOME"] = hadoop_home

# Append the winutils 'bin' folder only once, so re-running this cell does not
# keep growing PATH with duplicate entries.
if hadoop_bin not in os.environ["PATH"].split(";"):
    os.environ["PATH"] += ";" + hadoop_bin

# (Optional) Set JAVA_HOME only if it is not already set, so an existing JDK
# configuration is not overwritten.
os.environ.setdefault("JAVA_HOME", r"C:\Program Files\Eclipse Adoptium\jdk-17.0.15.6-hotspot")

In [9]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, sqrt, abs as spark_abs
from pyspark.ml.regression import LinearRegression, RandomForestRegressor, GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
import time
import mlflow
import mlflow.spark

In [3]:
# Create (or reuse) the Spark session for this notebook, running in local mode.
spark_builder = SparkSession.builder.appName("PredictiveAnalysis").master("local[*]")
spark = spark_builder.getOrCreate()

In [4]:
# Root folder of the exploitation zone (a path relative to the working directory);
# later cells read the prepared datasets from it and save the best model into it.
exploitation_zone = "exploitation_zone"

In [5]:
# Read the preprocessed train/test splits (parquet) from the exploitation zone.
train_data_path = f"{exploitation_zone}/train_data"
test_data_path = f"{exploitation_zone}/test_data"

train_data = spark.read.parquet(train_data_path)
test_data = spark.read.parquet(test_data_path)

In [6]:
# Preview the first five rows of the training data.
train_data.show(n=5)

+-----------+--------------------+--------------------+
|price_clean|            features|        neighborhood|
+-----------+--------------------+--------------------+
|    39000.0|(91,[0,1,2,3,4,5,...|El Poble Sec - Pa...|
|    60000.0|(91,[0,1,2,3,4,5,...|          La Bordeta|
|    69500.0|(91,[0,1,2,3,4,5,...|               Sants|
|    70000.0|(91,[0,1,2,3,4,5,...|  La Marina del Port|
|    79900.0|(91,[0,2,3,4,5,6,...|El Poble Sec - Pa...|
+-----------+--------------------+--------------------+
only showing top 5 rows


## Model Training and Management

In [8]:
# Three candidate regressors: Linear Regression, Random Forest and Gradient
# Boosted Trees. They all read the same feature/label columns and write to the
# same prediction column.
column_args = {
    "featuresCol": "features",
    "labelCol": "price_clean",
    "predictionCol": "prediction",
}

# Linear Regression model (library defaults for everything but the columns)
linear_regression = LinearRegression(**column_args)

# Random Forest model
random_forest = RandomForestRegressor(numTrees=50, maxDepth=10, seed=42, **column_args)

# Gradient Boosted Trees model
gradient_boosting = GBTRegressor(maxIter=50, maxDepth=8, seed=42, **column_args)

# Display name -> untrained estimator; the same names key the parameter grids.
models = {
    "Linear Regression": linear_regression,
    "Random Forest": random_forest,
    "Gradient Boosting": gradient_boosting,
}

In [10]:
# Hyperparameter search spaces, one grid per model (3 x 3 = 9 combinations each).

# Linear Regression: regularisation strength and elastic-net mixing parameter
lr_param_grid = (
    ParamGridBuilder()
    .addGrid(linear_regression.regParam, [0.01, 0.1, 1.0])
    .addGrid(linear_regression.elasticNetParam, [0.0, 0.5, 1.0])
    .build()
)

# Random Forest: number of trees and maximum tree depth
rf_param_grid = (
    ParamGridBuilder()
    .addGrid(random_forest.numTrees, [30, 50, 100])
    .addGrid(random_forest.maxDepth, [5, 10, 15])
    .build()
)

# Gradient Boosting: number of boosting iterations and maximum tree depth
gb_param_grid = (
    ParamGridBuilder()
    .addGrid(gradient_boosting.maxIter, [30, 50, 100])
    .addGrid(gradient_boosting.maxDepth, [5, 8, 10])
    .build()
)

# Display name -> grid; the keys match those of `models`.
param_grids = {
    "Linear Regression": lr_param_grid,
    "Random Forest": rf_param_grid,
    "Gradient Boosting": gb_param_grid,
}

## MLflow tracking

In [11]:
# Tune every candidate model with 3-fold cross-validation, score the winning
# configuration on the test set, and track each model in its own MLflow run.

# RMSE evaluator, used both for cross-validation and for the test-set score
evaluator = RegressionEvaluator(labelCol="price_clean",
                                predictionCol="prediction",
                                metricName="rmse")

# Results consumed by the later evaluation / comparison / saving cells:
#   trained_models[name] -> {"model": best fitted model}
#   model_metrics[name]  -> {"cv_rmse": best mean cross-validated RMSE}
#   best_models[name]    -> (best fitted model, test RMSE)
trained_models = {}
model_metrics = {}
best_models = {}

for model_name, param_grid in param_grids.items():
    with mlflow.start_run(run_name=model_name):
        # Set up CrossValidator for each model
        cv = CrossValidator(
            estimator=models[model_name],
            estimatorParamMaps=param_grid,
            evaluator=evaluator,
            numFolds=3
        )
        cv_model = cv.fit(train_data)
        best_model = cv_model.bestModel

        # avgMetrics holds the mean cross-validation metric for each parameter
        # combination; the metric is RMSE (lower is better), so the minimum is
        # the score of the selected combination.
        cv_rmse = min(cv_model.avgMetrics)

        # Predict on the test set
        predictions = best_model.transform(test_data)
        rmse = evaluator.evaluate(predictions)

        # Store the results before writing anything to the artifact store, so a
        # failure while logging does not discard an already trained model.
        trained_models[model_name] = {"model": best_model}
        model_metrics[model_name] = {"cv_rmse": cv_rmse}
        best_models[model_name] = (best_model, rmse)

        # Log hyperparameters (from bestModel's params) under their plain names;
        # str(param) would prefix each name with the estimator's generated uid.
        for param, value in best_model.extractParamMap().items():
            mlflow.log_param(param.name, value)

        # Log evaluation metrics
        mlflow.log_metric("cv_rmse", cv_rmse)
        mlflow.log_metric("rmse", rmse)

        # Log model
        # NOTE(review): the error output recorded for this cell shows the
        # artifact copy failing with "[Errno 2] No such file or directory" on
        # very long OneDrive paths. This looks like the Windows 260-character
        # path limit (TODO confirm); a tracking directory with a shorter path
        # (mlflow.set_tracking_uri / MLFLOW_TRACKING_URI) should avoid it.
        mlflow.spark.log_model(best_model, "model")

        print(f"{model_name} best RMSE: {rmse:.4f}")

# After all runs, you can rank/select/deploy the best model as needed



Error: [('C:\\Users\\aleja\\AppData\\Local\\Temp\\tmprdychz7g\\model\\sparkml\\stages\\0_LinearRegression_5e75d3ba3256\\data\\.part-00000-4dc86174-acc1-4e8f-b532-85e7e5896670-c000.snappy.parquet.crc', 'C:\\Users\\aleja\\OneDrive\\Escritorio\\Term_3\\Big_data_management\\Lab_3\\Lab3-Assignment\\BDM-lab3\\mlruns\\0\\6571a2cc92684531be82a95b1dab2d72\\artifacts\\model\\sparkml\\stages\\0_LinearRegression_5e75d3ba3256\\data\\.part-00000-4dc86174-acc1-4e8f-b532-85e7e5896670-c000.snappy.parquet.crc', "[Errno 2] No such file or directory: 'C:\\\\Users\\\\aleja\\\\OneDrive\\\\Escritorio\\\\Term_3\\\\Big_data_management\\\\Lab_3\\\\Lab3-Assignment\\\\BDM-lab3\\\\mlruns\\\\0\\\\6571a2cc92684531be82a95b1dab2d72\\\\artifacts\\\\model\\\\sparkml\\\\stages\\\\0_LinearRegression_5e75d3ba3256\\\\data\\\\.part-00000-4dc86174-acc1-4e8f-b532-85e7e5896670-c000.snappy.parquet.crc'"), ('C:\\Users\\aleja\\AppData\\Local\\Temp\\tmprdychz7g\\model\\sparkml\\stages\\0_LinearRegression_5e75d3ba3256\\data\\part-00000-4dc86174-acc1-4e8f-b532-85e7e5896670-c000.snappy.parquet', 'C:\\Users\\aleja\\OneDrive\\Escritorio\\Term_3\\Big_data_management\\Lab_3\\Lab3-Assignment\\BDM-lab3\\mlruns\\0\\6571a2cc92684531be82a95b1dab2d72\\artifacts\\model\\sparkml\\stages\\0_LinearRegression_5e75d3ba3256\\data\\part-00000-4dc86174-acc1-4e8f-b532-85e7e5896670-c000.snappy.parquet', "[Errno 2] No such file or directory: 'C:\\\\Users\\\\aleja\\\\OneDrive\\\\Escritorio\\\\Term_3\\\\Big_data_management\\\\Lab_3\\\\Lab3-Assignment\\\\BDM-lab3\\\\mlruns\\\\0\\\\6571a2cc92684531be82a95b1dab2d72\\\\artifacts\\\\model\\\\sparkml\\\\stages\\\\0_LinearRegression_5e75d3ba3256\\\\data\\\\part-00000-4dc86174-acc1-4e8f-b532-85e7e5896670-c000.snappy.parquet'"), ('C:\\Users\\aleja\\AppData\\Local\\Temp\\tmprdychz7g\\model\\sparkml\\stages\\0_LinearRegression_5e75d3ba3256\\metadata\\.part-00000-84857947-49ab-48e6-963f-aa106e372a85-c000.txt.crc', 
'C:\\Users\\aleja\\OneDrive\\Escritorio\\Term_3\\Big_data_management\\Lab_3\\Lab3-Assignment\\BDM-lab3\\mlruns\\0\\6571a2cc92684531be82a95b1dab2d72\\artifacts\\model\\sparkml\\stages\\0_LinearRegression_5e75d3ba3256\\metadata\\.part-00000-84857947-49ab-48e6-963f-aa106e372a85-c000.txt.crc', "[Errno 2] No such file or directory: 'C:\\\\Users\\\\aleja\\\\OneDrive\\\\Escritorio\\\\Term_3\\\\Big_data_management\\\\Lab_3\\\\Lab3-Assignment\\\\BDM-lab3\\\\mlruns\\\\0\\\\6571a2cc92684531be82a95b1dab2d72\\\\artifacts\\\\model\\\\sparkml\\\\stages\\\\0_LinearRegression_5e75d3ba3256\\\\metadata\\\\.part-00000-84857947-49ab-48e6-963f-aa106e372a85-c000.txt.crc'"), ('C:\\Users\\aleja\\AppData\\Local\\Temp\\tmprdychz7g\\model\\sparkml\\stages\\0_LinearRegression_5e75d3ba3256\\metadata\\part-00000-84857947-49ab-48e6-963f-aa106e372a85-c000.txt', 'C:\\Users\\aleja\\OneDrive\\Escritorio\\Term_3\\Big_data_management\\Lab_3\\Lab3-Assignment\\BDM-lab3\\mlruns\\0\\6571a2cc92684531be82a95b1dab2d72\\artifacts\\model\\sparkml\\stages\\0_LinearRegression_5e75d3ba3256\\metadata\\part-00000-84857947-49ab-48e6-963f-aa106e372a85-c000.txt', "[Errno 2] No such file or directory: 'C:\\\\Users\\\\aleja\\\\OneDrive\\\\Escritorio\\\\Term_3\\\\Big_data_management\\\\Lab_3\\\\Lab3-Assignment\\\\BDM-lab3\\\\mlruns\\\\0\\\\6571a2cc92684531be82a95b1dab2d72\\\\artifacts\\\\model\\\\sparkml\\\\stages\\\\0_LinearRegression_5e75d3ba3256\\\\metadata\\\\part-00000-84857947-49ab-48e6-963f-aa106e372a85-c000.txt'")]

In [10]:
# Score every trained model on the test set with RMSE, MAE and R².

# One evaluator per metric; they differ only in `metricName`.
evaluator_columns = {"labelCol": "price_clean", "predictionCol": "prediction"}
rmse_evaluator = RegressionEvaluator(metricName="rmse", **evaluator_columns)
mae_evaluator = RegressionEvaluator(metricName="mae", **evaluator_columns)
r2_evaluator = RegressionEvaluator(metricName="r2", **evaluator_columns)

# model name -> {"rmse", "mae", "r2", "predictions"}
test_results = {}

for model_name, model_dict in trained_models.items():
    print(f"\nEvaluating {model_name} on test set")

    # Apply the fitted model to the test data
    model = model_dict["model"]
    predictions = model.transform(test_data)

    # Compute the three metrics on the same predictions, in a fixed order
    rmse, mae, r2 = (
        metric_evaluator.evaluate(predictions)
        for metric_evaluator in (rmse_evaluator, mae_evaluator, r2_evaluator)
    )

    # Keep the predictions next to the metrics for later inspection
    test_results[model_name] = {
        "rmse": rmse,
        "mae": mae,
        "r2": r2,
        "predictions": predictions,
    }

    print(f"  RMSE: €{rmse:,.0f}")
    print(f"  MAE: €{mae:,.0f}")
    print(f"  R²: {r2:.3f}")


Evaluating Linear Regression on test set
  RMSE: €191,385
  MAE: €116,727
  R²: 0.769

Evaluating Random Forest on test set
  RMSE: €158,171
  MAE: €85,683
  R²: 0.842

Evaluating Gradient Boosting on test set
  RMSE: €164,549
  MAE: €93,493
  R²: 0.829


In [11]:
# Print a side-by-side table of cross-validation and test metrics, and pick the
# model with the lowest test RMSE.
banner = "=" * 80
divider = "-" * 80

print("\n" + banner)
print("                    MODEL PERFORMANCE COMPARISON")
print(banner)

print(f"{'Model':<20} {'CV RMSE':<12} {'Test RMSE':<12} {'Test MAE':<12} {'Test R²':<10}")
print(divider)

# Running minimum over test RMSE; stays (None, inf) if there are no models.
best_model_name, best_rmse = None, float('inf')

for model_name, cv_summary in model_metrics.items():
    cv_rmse = cv_summary["cv_rmse"]
    test_summary = test_results[model_name]
    test_rmse, test_mae, test_r2 = (test_summary[key] for key in ("rmse", "mae", "r2"))

    print(f"{model_name:<20} {cv_rmse:<12,.0f} {test_rmse:<12,.0f} {test_mae:<12,.0f} {test_r2:<10.3f}")

    # Strict '<' keeps the earlier model when two test RMSEs tie
    if test_rmse < best_rmse:
        best_model_name, best_rmse = model_name, test_rmse

print(divider)
print(banner)


                    MODEL PERFORMANCE COMPARISON
Model                CV RMSE      Test RMSE    Test MAE     Test R²   
--------------------------------------------------------------------------------
Linear Regression    193,145      191,385      116,727      0.769     
Random Forest        165,725      158,171      85,683       0.842     
Gradient Boosting    184,602      164,549      93,493       0.829     
--------------------------------------------------------------------------------


In [12]:
# Persist the model selected in the comparison step into the exploitation zone,
# replacing any previously saved copy, then shut the Spark session down.
best_model_path = f"{exploitation_zone}/best_model"
best_model = trained_models[best_model_name]["model"]

best_model_writer = best_model.write().overwrite()
best_model_writer.save(best_model_path)

spark.stop()