# Task B

The steps performed will be:

- Model Training

- Model Management

    Use MLflow (or a similar model management framework) to track the entire pipeline (models, hyperparameters, and evaluation metrics) and to tag the best model for deployment.

In [1]:
import os

# Point Spark at the local winutils installation (Windows-specific paths).
# HADOOP_HOME must be the parent folder of the 'bin' directory with winutils.exe.
hadoop_home = r"C:\winutils-master\hadoop-3.0.0"
hadoop_bin = hadoop_home + r"\bin"
os.environ["HADOOP_HOME"] = hadoop_home

# Append the winutils 'bin' folder to PATH only when it is not there yet, so
# re-running this cell does not keep growing PATH; an unset PATH is tolerated.
current_path = os.environ.get("PATH", "")
if hadoop_bin not in current_path:
    os.environ["PATH"] = current_path + os.pathsep + hadoop_bin

# Optional fallback: JAVA_HOME is only set when it is not already defined,
# so an existing Java configuration is left untouched.
os.environ.setdefault("JAVA_HOME", r"C:\Program Files\Eclipse Adoptium\jdk-17.0.15.6-hotspot")

In [2]:
import mlflow

# Report the directory the kernel is running from
working_dir = os.getcwd()
print("Current working directory:", working_dir)

# Keep the MLflow file store on a local path OUTSIDE OneDrive,
# then echo the URI MLflow actually uses
tracking_uri = "file:///C:/BDM3/mlruns"
mlflow.set_tracking_uri(tracking_uri)
print("MLflow tracking URI:", mlflow.get_tracking_uri())


Current working directory: c:\BDM3
MLflow tracking URI: file:///C:/BDM3/mlruns


In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, sqrt, abs as spark_abs
from pyspark.ml.regression import LinearRegression, RandomForestRegressor, GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
import time

In [4]:
# Create (or reuse) the Spark session for this notebook, running in local mode
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("PredictiveAnalysis")
    .getOrCreate()
)

In [5]:
# Folder (relative path) holding the exploitation zone: the preprocessed
# datasets are read from it and the final model is written to it
exploitation_zone = "exploitation_zone"

In [6]:
# Read the preprocessed train/test splits stored as Parquet in the exploitation zone
train_path = f"{exploitation_zone}/train_data"
test_path = f"{exploitation_zone}/test_data"

train_data = spark.read.parquet(train_path)
test_data = spark.read.parquet(test_path)

In [7]:
# Preview the first five training rows (label, feature vector, neighborhood)
train_data.show(n=5)

+-----------+--------------------+--------------------+
|price_clean|            features|        neighborhood|
+-----------+--------------------+--------------------+
|    39000.0|(91,[0,1,2,3,4,5,...|El Poble Sec - Pa...|
|    60000.0|(91,[0,1,2,3,4,5,...|          La Bordeta|
|    69500.0|(91,[0,1,2,3,4,5,...|               Sants|
|    70000.0|(91,[0,1,2,3,4,5,...|  La Marina del Port|
|    79900.0|(91,[0,2,3,4,5,6,...|El Poble Sec - Pa...|
+-----------+--------------------+--------------------+
only showing top 5 rows


## Model Training and Management

In [8]:
# Three candidate regressors are compared: Linear Regression, Random Forest
# and Gradient Boosted Trees. They all read the same feature/label columns
# and write to the same prediction column.
shared_columns = dict(
    featuresCol="features",
    labelCol="price_clean",
    predictionCol="prediction",
)

# Linear Regression: library defaults (regularisation is tuned later)
linear_regression = LinearRegression(**shared_columns)

# Random Forest: starting configuration, fixed seed
random_forest = RandomForestRegressor(numTrees=50, maxDepth=10, seed=42, **shared_columns)

# Gradient Boosted Trees: starting configuration, fixed seed
gradient_boosting = GBTRegressor(maxIter=50, maxDepth=8, seed=42, **shared_columns)

# Display name -> estimator; the same keys are used for the parameter grids
models = {
    "Linear Regression": linear_regression,
    "Random Forest": random_forest,
    "Gradient Boosting": gradient_boosting,
}

In [9]:
# Hyperparameter search spaces: one 3 x 3 grid per estimator

# Linear Regression: regularisation strength and L1/L2 mix
lr_param_grid = (
    ParamGridBuilder()
    .addGrid(linear_regression.regParam, [0.01, 0.1, 1.0])
    .addGrid(linear_regression.elasticNetParam, [0.0, 0.5, 1.0])
    .build()
)

# Random Forest: number of trees and tree depth
rf_param_grid = (
    ParamGridBuilder()
    .addGrid(random_forest.numTrees, [30, 50, 100])
    .addGrid(random_forest.maxDepth, [5, 10, 15])
    .build()
)

# Gradient Boosting: number of boosting iterations and tree depth
gb_param_grid = (
    ParamGridBuilder()
    .addGrid(gradient_boosting.maxIter, [30, 50, 100])
    .addGrid(gradient_boosting.maxDepth, [5, 8, 10])
    .build()
)

# Display name -> grid; keys match the `models` dictionary
param_grids = {
    "Linear Regression": lr_param_grid,
    "Random Forest": rf_param_grid,
    "Gradient Boosting": gb_param_grid,
}

## MLflow tracking

In [10]:
# For every candidate model: tune it with 3-fold cross-validation on the
# training data, score the winning configuration on the test data, and record
# hyperparameters, metrics and the fitted model in one MLflow run per model.

# Define evaluators (all compare "price_clean" with "prediction")
evaluator_rmse = RegressionEvaluator(labelCol="price_clean", predictionCol="prediction", metricName="rmse")
evaluator_mae  = RegressionEvaluator(labelCol="price_clean", predictionCol="prediction", metricName="mae")
evaluator_r2   = RegressionEvaluator(labelCol="price_clean", predictionCol="prediction", metricName="r2")

# Fixed seed for the cross-validation fold split so that CV metrics and the
# selected hyperparameters are reproducible across kernel restarts.
CV_SEED = 42

# model name -> {"model", "cv_rmse", "test_rmse", "test_mae", "test_r2"}
best_models = {}

for model_name, param_grid in param_grids.items():
    with mlflow.start_run(run_name=model_name):
        cv = CrossValidator(
            estimator=models[model_name],
            estimatorParamMaps=param_grid,
            evaluator=evaluator_rmse,
            numFolds=3,
            seed=CV_SEED
        )
        cv_model = cv.fit(train_data)
        best_model = cv_model.bestModel

        # avgMetrics has one fold-averaged RMSE per grid point; RMSE is
        # lower-is-better, so the minimum is the CV RMSE for the best param set
        best_cv_rmse = min(cv_model.avgMetrics)

        # Predict on validation/test set
        predictions = best_model.transform(test_data)
        test_rmse = evaluator_rmse.evaluate(predictions)
        test_mae  = evaluator_mae.evaluate(predictions)
        test_r2   = evaluator_r2.evaluate(predictions)

        # Log hyperparameters (from bestModel's params) under their plain names.
        # str(param) would prefix each key with the estimator's randomly
        # generated uid, giving different keys on every execution and making
        # runs impossible to compare in MLflow.
        mlflow.log_params(
            {param.name: value for param, value in best_model.extractParamMap().items()}
        )

        # Log evaluation metrics
        mlflow.log_metric("cv_rmse", best_cv_rmse)
        mlflow.log_metric("test_rmse", test_rmse)
        mlflow.log_metric("test_mae", test_mae)
        mlflow.log_metric("test_r2", test_r2)

        # Log model
        mlflow.spark.log_model(best_model, "model")

        # Store everything for later comparison
        best_models[model_name] = {
            "model": best_model,
            "cv_rmse": best_cv_rmse,
            "test_rmse": test_rmse,
            "test_mae": test_mae,
            "test_r2": test_r2
        }
        print(f"{model_name} | CV RMSE: {best_cv_rmse:.2f} | Test RMSE: {test_rmse:.2f} | Test MAE: {test_mae:.2f} | Test R²: {test_r2:.3f}")




Linear Regression | CV RMSE: 195342.42 | Test RMSE: 182560.91 | Test MAE: 115873.27 | Test R²: 0.791




Random Forest | CV RMSE: 166668.23 | Test RMSE: 150323.47 | Test MAE: 86936.29 | Test R²: 0.858




Gradient Boosting | CV RMSE: 187190.23 | Test RMSE: 158117.66 | Test MAE: 96640.69 | Test R²: 0.843


In [13]:
# Print a side-by-side table of all tuned models and pick the one with the
# lowest test RMSE.
# NOTE(review): the winner is chosen on the test set, so its reported test
# metrics are optimistically biased; consider selecting on "cv_rmse" instead.
heavy_rule = "="*80
light_rule = "-"*80

print("\n" + heavy_rule)
print("                    MODEL PERFORMANCE COMPARISON")
print(heavy_rule)
print(f"{'Model':<20} {'CV RMSE':<12} {'Test RMSE':<12} {'Test MAE':<12} {'Test R²':<10}")
print(light_rule)

# Running minimum; ties keep the first model encountered
best_model_name, best_rmse = None, float('inf')

for name, scores in best_models.items():
    print(f"{name:<20} €{scores['cv_rmse']:<11,.0f} €{scores['test_rmse']:<11,.0f} €{scores['test_mae']:<11,.0f} {scores['test_r2']:<10.3f}")
    candidate_rmse = scores['test_rmse']
    if candidate_rmse < best_rmse:
        best_model_name, best_rmse = name, candidate_rmse

print(light_rule)
print(heavy_rule)
print(f"\nBest model by test RMSE: {best_model_name} (RMSE = €{best_rmse:,.0f})")



                    MODEL PERFORMANCE COMPARISON
Model                CV RMSE      Test RMSE    Test MAE     Test R²   
--------------------------------------------------------------------------------
Linear Regression    €195,342     €182,561     €115,873     0.791     
Random Forest        €166,668     €150,323     €86,936      0.858     
Gradient Boosting    €187,190     €158,118     €96,641      0.843     
--------------------------------------------------------------------------------

Best model by test RMSE: Random Forest (RMSE = €150,323)


In [18]:
# Store the best model in its own MLflow run and tag that run so the model
# chosen for deployment can be found without comparing the per-model runs.

# Get the best model from the best_models dict
best_model = best_models[best_model_name]["model"]
with mlflow.start_run(run_name="best_model_final"):
    # Tags record which algorithm won and mark this run as the best model
    mlflow.set_tags({"best_model": "true", "model_name": best_model_name})
    mlflow.spark.log_model(best_model, "model")
    mlflow.log_metric("rmse", best_models[best_model_name]["test_rmse"])



In [None]:
# Also save the best model in Spark's native format (overwriting any previous copy).
# The model is looked up from best_models by name rather than through the global
# `best_model`, which the training loop also rebinds: run out of order, that
# variable would hold the last model trained, not the selected one.
best_models[best_model_name]["model"].write().overwrite().save(f"{exploitation_zone}/best_model")

In [20]:
# Stop the Spark session at the end of the notebook
spark.stop()