In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, sqrt, abs as spark_abs
from pyspark.ml.regression import LinearRegression, RandomForestRegressor, GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
import time

In [None]:
# create the Spark session for this notebook (or reuse one that already exists),
# running against the local master
spark = (
    SparkSession.builder
    .appName("PredictiveAnalysis")
    .master("local[*]")
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/06/23 11:31:40 WARN Utils: Your hostname, MacBook-Air.local, resolves to a loopback address: 127.0.0.1; using 192.168.1.134 instead (on interface en0)
25/06/23 11:31:40 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/06/23 11:31:41 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Root directory of the exploitation zone (relative path).
# The train/test parquet datasets are read from here and the best model is saved here.
exploitation_zone = "exploitation_zone"

In [4]:
# locations of the preprocessed train/test splits inside the exploitation zone
train_path = f"{exploitation_zone}/train_data"
test_path = f"{exploitation_zone}/test_data"

# read both splits as Spark DataFrames
train_data = spark.read.parquet(train_path)
test_data = spark.read.parquet(test_path)

                                                                                

In [5]:
# sanity check: preview the first 5 rows of the training set
train_data.show(5)

+-----------+--------------------+-------------------------+
|price_clean|            features|neighborhood_n_reconciled|
+-----------+--------------------+-------------------------+
|    69500.0|(50,[0,1,2,3,4,5,...|                    Sants|
|    69500.0|(50,[0,1,2,3,4,5,...|                    Sants|
|    89000.0|(50,[0,1,2,3,4,5,...|              Hostafrancs|
|    90000.0|(50,[0,1,2,3,4,5,...|              Hostafrancs|
|    90000.0|(50,[0,1,2,3,4,5,...|              Hostafrancs|
+-----------+--------------------+-------------------------+
only showing top 5 rows


In [6]:
# Three candidate regressors are compared:
# Linear Regression, Random Forest, and Gradient Boosted Trees.

# input/label/output columns shared by every estimator
shared_columns = dict(
    featuresCol="features",
    labelCol="price_clean",
    predictionCol="prediction",
)

# Linear Regression model (library defaults for everything else)
linear_regression = LinearRegression(**shared_columns)

# Random Forest model (seeded for reproducibility)
random_forest = RandomForestRegressor(
    numTrees=50,
    maxDepth=10,
    seed=42,
    **shared_columns,
)

# Gradient Boosted Trees model (seeded for reproducibility)
gradient_boosting = GBTRegressor(
    maxIter=50,
    maxDepth=8,
    seed=42,
    **shared_columns,
)

# registry of estimators keyed by display name; insertion order drives training order
models = {}
models["Linear Regression"] = linear_regression
models["Random Forest"] = random_forest
models["Gradient Boosting"] = gradient_boosting

In [7]:
# hyperparameter search spaces: one 3 x 3 grid (9 combinations) per estimator

# Linear Regression: regularization strength x elastic-net mixing
lr_param_grid = (
    ParamGridBuilder()
    .addGrid(linear_regression.regParam, [0.01, 0.1, 1.0])
    .addGrid(linear_regression.elasticNetParam, [0.0, 0.5, 1.0])
    .build()
)

# Random Forest: number of trees x maximum tree depth
rf_param_grid = (
    ParamGridBuilder()
    .addGrid(random_forest.numTrees, [30, 50, 100])
    .addGrid(random_forest.maxDepth, [5, 10, 15])
    .build()
)

# Gradient Boosting: number of boosting iterations x maximum tree depth
gb_param_grid = (
    ParamGridBuilder()
    .addGrid(gradient_boosting.maxIter, [30, 50, 100])
    .addGrid(gradient_boosting.maxDepth, [5, 8, 10])
    .build()
)

# grids keyed by the same display names used in `models`
param_grids = dict(
    [
        ("Linear Regression", lr_param_grid),
        ("Random Forest", rf_param_grid),
        ("Gradient Boosting", gb_param_grid),
    ]
)

In [8]:
# train and tune every candidate model with 3-fold cross-validation

# evaluator for RMSE (lower is better); used to score each fold
evaluator = RegressionEvaluator(
    labelCol="price_clean",
    predictionCol="prediction",
    metricName="rmse")

# store results
trained_models = {}   # model name -> {"model": best fitted model, "cv_model": CrossValidatorModel}
model_metrics = {}    # model name -> {"cv_rmse": best fold-averaged RMSE, "train_time_s": wall-clock seconds}

for model_name, model in models.items():
    print(f"\nTraining {model_name}...")
    # monotonic clock, so the measured duration cannot be skewed by system clock changes
    start_time = time.perf_counter()

    # create cross-validator over this model's parameter grid
    cv = CrossValidator(
        estimator=model,
        estimatorParamMaps=param_grids[model_name],
        evaluator=evaluator,
        numFolds=3,
        seed=42)

    # train with cross-validation
    cv_model = cv.fit(train_data)
    train_time_s = time.perf_counter() - start_time

    # get best model
    best_model = cv_model.bestModel

    # avgMetrics has one fold-averaged score per grid entry;
    # the evaluator metric is RMSE, so the smallest value is the best one
    cv_metrics = cv_model.avgMetrics
    best_cv_rmse = min(cv_metrics)

    # store results
    trained_models[model_name] = {
        "model": best_model,
        "cv_model": cv_model}

    model_metrics[model_name] = {
        "cv_rmse": best_cv_rmse,
        "train_time_s": train_time_s}

    print(f"{model_name} completed")
    print(f"  Best CV RMSE: €{best_cv_rmse:,.0f}")
    print(f"  Training time: {train_time_s:.1f}s")


Training Linear Regression...


25/06/23 11:31:45 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
25/06/23 11:31:45 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK


Linear Regression completed
  Best CV RMSE: €213,982

Training Random Forest...


25/06/23 11:31:52 WARN DAGScheduler: Broadcasting large task binary with size 1163.4 KiB
25/06/23 11:31:52 WARN DAGScheduler: Broadcasting large task binary with size 1508.4 KiB
25/06/23 11:31:53 WARN DAGScheduler: Broadcasting large task binary with size 1163.4 KiB
25/06/23 11:31:53 WARN DAGScheduler: Broadcasting large task binary with size 1508.4 KiB
25/06/23 11:31:53 WARN DAGScheduler: Broadcasting large task binary with size 1854.3 KiB
25/06/23 11:31:53 WARN DAGScheduler: Broadcasting large task binary with size 2.1 MiB
25/06/23 11:31:53 WARN DAGScheduler: Broadcasting large task binary with size 2.4 MiB
25/06/23 11:31:53 WARN DAGScheduler: Broadcasting large task binary with size 2.7 MiB
25/06/23 11:31:53 WARN DAGScheduler: Broadcasting large task binary with size 2.8 MiB
25/06/23 11:31:55 WARN DAGScheduler: Broadcasting large task binary with size 1315.2 KiB
25/06/23 11:31:55 WARN DAGScheduler: Broadcasting large task binary with size 1817.8 KiB
25/06/23 11:31:55 WARN DAGSchedul

Random Forest completed
  Best CV RMSE: €84,138

Training Gradient Boosting...


25/06/23 11:32:42 WARN DAGScheduler: Broadcasting large task binary with size 1010.5 KiB
25/06/23 11:32:42 WARN DAGScheduler: Broadcasting large task binary with size 1024.7 KiB
25/06/23 11:32:42 WARN DAGScheduler: Broadcasting large task binary with size 1015.1 KiB
25/06/23 11:32:42 WARN DAGScheduler: Broadcasting large task binary with size 1015.6 KiB
25/06/23 11:32:42 WARN DAGScheduler: Broadcasting large task binary with size 1016.6 KiB
25/06/23 11:32:42 WARN DAGScheduler: Broadcasting large task binary with size 1017.1 KiB
25/06/23 11:32:42 WARN DAGScheduler: Broadcasting large task binary with size 1018.8 KiB
25/06/23 11:32:42 WARN DAGScheduler: Broadcasting large task binary with size 1021.7 KiB
25/06/23 11:32:42 WARN DAGScheduler: Broadcasting large task binary with size 1026.7 KiB
25/06/23 11:32:42 WARN DAGScheduler: Broadcasting large task binary with size 1035.6 KiB
25/06/23 11:32:42 WARN DAGScheduler: Broadcasting large task binary with size 1049.4 KiB
25/06/23 11:32:42 WAR

Gradient Boosting completed
  Best CV RMSE: €100,307


In [9]:
# score each tuned model on the test set with RMSE, MAE and R²

# one evaluator per reported metric, all comparing `price_clean` against `prediction`
rmse_evaluator = RegressionEvaluator(
    labelCol="price_clean", predictionCol="prediction", metricName="rmse")
mae_evaluator = RegressionEvaluator(
    labelCol="price_clean", predictionCol="prediction", metricName="mae")
r2_evaluator = RegressionEvaluator(
    labelCol="price_clean", predictionCol="prediction", metricName="r2")

# model name -> {"rmse", "mae", "r2", "predictions"}
test_results = {}

for model_name, fitted in trained_models.items():
    print(f"\nEvaluating {model_name} on test set")

    # apply the best model found by cross-validation to the test data
    predictions = fitted["model"].transform(test_data)

    # evaluate in the order RMSE, MAE, R²
    rmse, mae, r2 = (
        metric_evaluator.evaluate(predictions)
        for metric_evaluator in (rmse_evaluator, mae_evaluator, r2_evaluator)
    )

    test_results[model_name] = dict(
        rmse=rmse,
        mae=mae,
        r2=r2,
        predictions=predictions,
    )

    print(f"  RMSE: €{rmse:,.0f}")
    print(f"  MAE: €{mae:,.0f}")
    print(f"  R²: {r2:.3f}")


Evaluating Linear Regression on test set
  RMSE: €226,247
  MAE: €145,587
  R²: 0.894

Evaluating Random Forest on test set
  RMSE: €61,040
  MAE: €20,625
  R²: 0.992

Evaluating Gradient Boosting on test set
  RMSE: €43,828
  MAE: €8,561
  R²: 0.996


In [12]:
# print a side-by-side table of cross-validation and test metrics,
# and remember which model has the lowest test RMSE
heavy_rule = "=" * 80
light_rule = "-" * 80

print("\n" + heavy_rule)
print("                    MODEL PERFORMANCE COMPARISON")
print(heavy_rule)

print(f"{'Model':<20} {'CV RMSE':<12} {'Test RMSE':<12} {'Test MAE':<12} {'Test R²':<10}")
print(light_rule)

best_model_name = None
best_rmse = float('inf')

for model_name, cv_summary in model_metrics.items():
    held_out = test_results[model_name]
    cv_rmse = cv_summary["cv_rmse"]
    test_rmse, test_mae, test_r2 = held_out["rmse"], held_out["mae"], held_out["r2"]

    print(f"{model_name:<20} {cv_rmse:<12,.0f} {test_rmse:<12,.0f} {test_mae:<12,.0f} {test_r2:<10.3f}")

    # keep the model with the strictly lowest test RMSE so far (ties keep the earlier model)
    # NOTE(review): the winner is chosen on the test set, so the test metrics reported for it
    # are optimistically biased; selecting on CV RMSE would keep the test set untouched.
    if best_rmse > test_rmse:
        best_model_name, best_rmse = model_name, test_rmse

print(light_rule)
print(heavy_rule)


                    MODEL PERFORMANCE COMPARISON
Model                CV RMSE      Test RMSE    Test MAE     Test R²   
--------------------------------------------------------------------------------
Linear Regression    213,982      226,247      145,587      0.894     
Random Forest        84,138       61,040       20,625       0.992     
Gradient Boosting    100,307      43,828       8,561        0.996     
--------------------------------------------------------------------------------


In [11]:
# persist the selected model inside the exploitation zone,
# overwriting any copy saved by a previous run
best_model = trained_models[best_model_name]["model"]
best_model_path = f"{exploitation_zone}/best_model"
best_model.write().overwrite().save(best_model_path)

# shut the Spark session down; Spark objects created above are unusable afterwards
spark.stop()