In [1]:
import os

# Set HADOOP_HOME to the parent folder of the 'bin' with winutils.exe
os.environ["HADOOP_HOME"] = r"C:\winutils-master\hadoop-3.0.0"
os.environ["PATH"] += r";C:\winutils-master\hadoop-3.0.0\bin"

# (Optional: Set JAVA_HOME if not already set)
os.environ["JAVA_HOME"] = r"C:\Program Files\Eclipse Adoptium\jdk-17.0.15.6-hotspot"

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, sqrt, abs as spark_abs
from pyspark.ml.regression import LinearRegression, RandomForestRegressor, GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
import time

In [3]:
# set a Spark session
spark = SparkSession.builder \
    .appName("PredictiveAnalysis") \
    .master("local[*]") \
    .getOrCreate()

In [4]:
# define the path for exploitation zone
exploitation_zone = "exploitation_zone"

In [5]:
# load preprocessed data
train_data = spark.read.parquet(f"{exploitation_zone}/train_data")
test_data = spark.read.parquet(f"{exploitation_zone}/test_data")

In [6]:
train_data.show(5)

+-----------+--------------------+--------------------+
|price_clean|            features|        neighborhood|
+-----------+--------------------+--------------------+
|    34000.0|(91,[0,3,4,5,6,7,...|La Nova Esquerra ...|
|    39000.0|(91,[0,1,2,3,4,5,...|El Poble Sec - Pa...|
|    48000.0|(91,[0,1,2,3,4,5,...|La Nova Esquerra ...|
|    60000.0|(91,[0,1,2,3,4,5,...|          La Bordeta|
|    69000.0|(91,[0,1,2,3,4,5,...|          La Bordeta|
+-----------+--------------------+--------------------+
only showing top 5 rows


In [7]:
# we create 3 different ML models: Linear Regression, Random Forest, and Gradient Boosted Trees

# Linear Regression model
linear_regression = LinearRegression(
    featuresCol="features", 
    labelCol="price_clean",
    predictionCol="prediction")

# Random Forest model
random_forest = RandomForestRegressor(
    featuresCol="features",
    labelCol="price_clean", 
    predictionCol="prediction",
    numTrees=50,
    maxDepth=10,
    seed=42)

# Gradient Boosted Trees model
gradient_boosting = GBTRegressor(
    featuresCol="features",
    labelCol="price_clean",
    predictionCol="prediction", 
    maxIter=50,
    maxDepth=8,
    seed=42)

models = {
    "Linear Regression": linear_regression,
    "Random Forest": random_forest, 
    "Gradient Boosting": gradient_boosting}

In [8]:
# define parameter grids for each model to optimize hyperparameters

# Linear Regression parameter grid
lr_param_grid = ParamGridBuilder() \
    .addGrid(linear_regression.regParam, [0.01, 0.1, 1.0]) \
    .addGrid(linear_regression.elasticNetParam, [0.0, 0.5, 1.0]) \
    .build()

# Random Forest parameter grid
rf_param_grid = ParamGridBuilder() \
    .addGrid(random_forest.numTrees, [30, 50, 100]) \
    .addGrid(random_forest.maxDepth, [5, 10, 15]) \
    .build()

# Gradient Boosting parameter grid  
gb_param_grid = ParamGridBuilder() \
    .addGrid(gradient_boosting.maxIter, [30, 50, 100]) \
    .addGrid(gradient_boosting.maxDepth, [5, 8, 10]) \
    .build()

param_grids = {
    "Linear Regression": lr_param_grid,
    "Random Forest": rf_param_grid,
    "Gradient Boosting": gb_param_grid}

In [9]:
# train and evaluate models using cross-validation

# evaluator for RMSE
evaluator = RegressionEvaluator(
    labelCol="price_clean",
    predictionCol="prediction",
    metricName="rmse")

# store results
trained_models = {}
model_metrics = {}

for model_name, model in models.items():
    print(f"\nTraining {model_name}...")
    start_time = time.time()
    
    # create cross-validator
    cv = CrossValidator(
        estimator=model,
        estimatorParamMaps=param_grids[model_name],
        evaluator=evaluator,
        numFolds=3,  
        seed=42)
    
    # train with cross-validation
    cv_model = cv.fit(train_data)
    
    # get best model 
    best_model = cv_model.bestModel
    
    # calculate cross-validation metrics
    cv_metrics = cv_model.avgMetrics
    best_cv_rmse = min(cv_metrics)
    
    # store results
    trained_models[model_name] = {
        "model": best_model,
        "cv_model": cv_model}
    
    model_metrics[model_name] = {
        "cv_rmse": best_cv_rmse}
    
    print(f"{model_name} completed")
    print(f"  Best CV RMSE: €{best_cv_rmse:,.0f}")


Training Linear Regression...
Linear Regression completed
  Best CV RMSE: €193,145

Training Random Forest...
Random Forest completed
  Best CV RMSE: €165,725

Training Gradient Boosting...
Gradient Boosting completed
  Best CV RMSE: €184,602


In [10]:
# validate models on test set and calculate metrics 

# create evaluators for different metrics
rmse_evaluator = RegressionEvaluator(labelCol="price_clean", predictionCol="prediction", metricName="rmse")
mae_evaluator = RegressionEvaluator(labelCol="price_clean", predictionCol="prediction", metricName="mae") 
r2_evaluator = RegressionEvaluator(labelCol="price_clean", predictionCol="prediction", metricName="r2")

test_results = {}

for model_name, model_dict in trained_models.items():
    model = model_dict["model"]
    
    print(f"\nEvaluating {model_name} on test set")
    
    # make predictions
    predictions = model.transform(test_data)
    
    # calculate metrics
    rmse = rmse_evaluator.evaluate(predictions)
    mae = mae_evaluator.evaluate(predictions)
    r2 = r2_evaluator.evaluate(predictions)
    
    test_results[model_name] = {
        "rmse": rmse,
        "mae": mae, 
        "r2": r2,
        "predictions": predictions}
    
    print(f"  RMSE: €{rmse:,.0f}")
    print(f"  MAE: €{mae:,.0f}")
    print(f"  R²: {r2:.3f}")


Evaluating Linear Regression on test set
  RMSE: €191,385
  MAE: €116,727
  R²: 0.769

Evaluating Random Forest on test set
  RMSE: €158,171
  MAE: €85,683
  R²: 0.842

Evaluating Gradient Boosting on test set
  RMSE: €164,549
  MAE: €93,493
  R²: 0.829


In [11]:
# compare performance of models
print("\n" + "="*80)
print("                    MODEL PERFORMANCE COMPARISON")
print("="*80)

print(f"{'Model':<20} {'CV RMSE':<12} {'Test RMSE':<12} {'Test MAE':<12} {'Test R²':<10}")
print("-"*80)

best_model_name = None
best_rmse = float('inf')

for model_name in model_metrics.keys():
    cv_rmse = model_metrics[model_name]["cv_rmse"]
    test_rmse = test_results[model_name]["rmse"]
    test_mae = test_results[model_name]["mae"]
    test_r2 = test_results[model_name]["r2"]
    
    print(f"{model_name:<20} {cv_rmse:<12,.0f} {test_rmse:<12,.0f} {test_mae:<12,.0f} {test_r2:<10.3f}")
    
    # track best model by test RMSE
    if test_rmse < best_rmse:
        best_rmse = test_rmse
        best_model_name = model_name

print("-"*80)
print("="*80)


                    MODEL PERFORMANCE COMPARISON
Model                CV RMSE      Test RMSE    Test MAE     Test R²   
--------------------------------------------------------------------------------
Linear Regression    193,145      191,385      116,727      0.769     
Random Forest        165,725      158,171      85,683       0.842     
Gradient Boosting    184,602      164,549      93,493       0.829     
--------------------------------------------------------------------------------


In [12]:
# save best model
best_model = trained_models[best_model_name]["model"]
best_model.write().overwrite().save(f"{exploitation_zone}/best_model")

spark.stop()