In [1]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import StringIndexer, IndexToString
from pyspark.ml.recommendation import ALS
from pyspark.sql import SparkSession

from pyspark.sql.functions import col, rank, expr
from pyspark.sql.window import Window
import itertools

In [2]:
spark = (
    SparkSession.builder.appName("Collaborative Filtering")  # type: ignore
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.3.4")
    .config("fs.s3a.endpoint", "s3.us-east-2.amazonaws.com")
    .config(
        "fs.s3a.aws.credentials.provider",
        "com.amazonaws.auth.DefaultAWSCredentialsProviderChain",
    )
    .getOrCreate()
)

24/05/21 15:17:42 WARN Utils: Your hostname, Davids-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.1.7 instead (on interface en0)
24/05/21 15:17:42 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Ivy Default Cache set to: /Users/david/.ivy2/cache
The jars for the packages stored in: /Users/david/.ivy2/jars
org.apache.hadoop#hadoop-aws added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-8f5649c7-9e88-4e94-9576-58d7ff60e01d;1.0
	confs: [default]


:: loading settings :: url = jar:file:/Users/david/eafit/proyecto-integrador-semestre-2/.venv/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


	found org.apache.hadoop#hadoop-aws;3.3.4 in central
	found com.amazonaws#aws-java-sdk-bundle;1.12.262 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
:: resolution report :: resolve 130ms :: artifacts dl 4ms
	:: modules in use:
	com.amazonaws#aws-java-sdk-bundle;1.12.262 from central in [default]
	org.apache.hadoop#hadoop-aws;3.3.4 from central in [default]
	org.wildfly.openssl#wildfly-openssl;1.0.7.Final from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   3   |   0   |   0   |   0   ||   3   |   0   |
	---------------------------------------------------------------------
:: retrieving :: org.apache.spark#spark-submit-parent-8f5649c7-9e88-4e94-9576-58d7ff60e01d
	confs: [default]


In [3]:
# Leer el archivo Parquet
data = spark.read.parquet("s3a://amazon-reviews-eafit/sample-for-model/")

24/05/21 15:17:45 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
                                                                                

In [4]:
data.printSchema()

root
 |-- customer_id: integer (nullable = true)
 |-- product_id: string (nullable = true)
 |-- star_rating: float (nullable = true)
 |-- category: string (nullable = true)
 |-- review_date: string (nullable = true)
 |-- verified_purchase: string (nullable = true)
 |-- review_id: string (nullable = true)
 |-- product_title: string (nullable = true)
 |-- product_category: string (nullable = true)



In [5]:
data.count()

24/05/21 15:17:54 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
                                                                                

6786913

In [6]:
indexer = StringIndexer(inputCol="product_id", outputCol="item_id")

indexer_model = indexer.fit(data)

inverter = IndexToString(
    inputCol="item_id", outputCol="original_item_id", labels=indexer_model.labels
)

inverter.write().overwrite().save("./inverter")

24/05/21 15:19:52 WARN TaskSetManager: Stage 7 contains a task of very large size (32494 KiB). The maximum recommended task size is 1000 KiB.


In [7]:
data = indexer_model.transform(data)

In [8]:
loaded_inverter = IndexToString.load("./inverter")

## Split de la data

Se hace un sampleo por customer id. Se toma el 80% de los customer id para el train y el 20% restante para el test.
Se repite el proceso para obtener el conjunto de validación partiendo de la data de training.


In [9]:
def split_data(data, percent_items_to_mask=0.2):
    user_window = Window.partitionBy("customer_id").orderBy(col("product_id").desc())
    data_processed = data.withColumn(
        "number_of_products", expr("count(*) over (partition by customer_id)")
    )
    data_processed = data_processed.withColumn(
        "number_of_products_to_mask",
        (col("number_of_products") * percent_items_to_mask).cast("int"),
    )
    data_processed = data_processed.withColumn("product_rank", rank().over(user_window))

    training = data_processed.filter(
        col("product_rank") > col("number_of_products_to_mask")
    )
    test = data_processed.filter(
        col("product_rank") <= col("number_of_products_to_mask")
    )

    return training, test

In [10]:
training, test = split_data(data, percent_items_to_mask=0.2)
training, validation = split_data(training, percent_items_to_mask=0.2)

training_count = training.count()
validation_count = validation.count()
test_count = test.count()

print(f"Training count: {training_count}")
print(f"Validation count: {validation_count}")
print(f"Test count: {test_count}")

[Stage 23:>                                                         (0 + 8) / 9]

Training count: 4991283
Validation count: 767932
Test count: 1027698


                                                                                

In [11]:
training.select("customer_id").distinct().show()

[Stage 29:>                                                         (0 + 1) / 1]

+-----------+
|customer_id|
+-----------+
|      10128|
|      10257|
|      10293|
|      10558|
|      11125|
|      11346|
|      11675|
|      11710|
|      12027|
|      12494|
|      13916|
|      14318|
|      14377|
|      14752|
|      15191|
|      15604|
|      15655|
|      15960|
|      16150|
|      16684|
+-----------+
only showing top 20 rows



                                                                                

## Validación con un usuario

Validamos cuántos datos tiene un usuario en total, cuántos quedaron en training, test y validation.


In [12]:
data[data["customer_id"] == 76286].count()

                                                                                

0

In [13]:
training[training["customer_id"] == 76286].count()

                                                                                

0

In [14]:
validation[validation["customer_id"] == 76286].count()

                                                                                

0

In [15]:
test[test["customer_id"] == 76286].count()

                                                                                

0

## Función para entrenar el modelo

Esta función recibe dos parámetros: `maxIter` y `regParam`. Esto debido a que más adelante se hará un grid search para ver la mejor combinación de parámetros.


In [16]:
def train_model(maxIter=5, regParam=0.1, rank=10):
    als = ALS(
        maxIter=maxIter,
        regParam=regParam,
        userCol="customer_id",
        itemCol="item_id",
        ratingCol="star_rating",
        seed=42,
        nonnegative=True,
        rank=rank,
        coldStartStrategy="drop",
    )
    model = als.fit(training)
    return model

In [17]:
model = train_model()

24/05/21 15:23:54 WARN DAGScheduler: Broadcasting large task binary with size 78.0 MiB
24/05/21 15:24:31 WARN DAGScheduler: Broadcasting large task binary with size 78.0 MiB
24/05/21 15:25:17 WARN DAGScheduler: Broadcasting large task binary with size 78.1 MiB
24/05/21 15:25:23 WARN DAGScheduler: Broadcasting large task binary with size 78.1 MiB
24/05/21 15:25:33 WARN DAGScheduler: Broadcasting large task binary with size 78.1 MiB
24/05/21 15:25:42 WARN DAGScheduler: Broadcasting large task binary with size 78.1 MiB
24/05/21 15:25:53 WARN DAGScheduler: Broadcasting large task binary with size 78.1 MiB
24/05/21 15:26:01 WARN DAGScheduler: Broadcasting large task binary with size 78.1 MiB
24/05/21 15:26:09 WARN DAGScheduler: Broadcasting large task binary with size 78.1 MiB
24/05/21 15:26:12 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/05/21 15:26:12 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS


In [18]:
def get_metrics(dataset, model):
    predictions = model.transform(dataset)

    print(f"Predictions count: {predictions.count()}")
    evaluator_rmse = RegressionEvaluator(
        metricName="rmse", labelCol="star_rating", predictionCol="prediction"
    )
    rmse = evaluator_rmse.evaluate(predictions)
    print(f"Root-mean-square error = {rmse}")

    evaluator_mae = RegressionEvaluator(
        metricName="mae", labelCol="star_rating", predictionCol="prediction"
    )
    mae = evaluator_mae.evaluate(predictions)
    print(f"Mean absolute error = {mae}")

    return rmse, mae

In [19]:
get_metrics(test, model)

24/05/21 15:27:58 WARN DAGScheduler: Broadcasting large task binary with size 78.0 MiB
24/05/21 15:28:02 WARN DAGScheduler: Broadcasting large task binary with size 78.1 MiB
24/05/21 15:28:04 WARN DAGScheduler: Broadcasting large task binary with size 78.1 MiB
24/05/21 15:28:35 WARN DAGScheduler: Broadcasting large task binary with size 78.0 MiB
24/05/21 15:28:52 WARN DAGScheduler: Broadcasting large task binary with size 78.1 MiB
24/05/21 15:29:05 WARN DAGScheduler: Broadcasting large task binary with size 78.1 MiB
                                                                                

Predictions count: 581092


24/05/21 15:29:14 WARN DAGScheduler: Broadcasting large task binary with size 78.0 MiB
24/05/21 15:29:21 WARN DAGScheduler: Broadcasting large task binary with size 78.1 MiB
24/05/21 15:29:24 WARN DAGScheduler: Broadcasting large task binary with size 78.1 MiB
24/05/21 15:29:57 WARN DAGScheduler: Broadcasting large task binary with size 78.0 MiB
24/05/21 15:30:09 WARN DAGScheduler: Broadcasting large task binary with size 78.1 MiB
24/05/21 15:30:20 WARN DAGScheduler: Broadcasting large task binary with size 78.2 MiB
24/05/21 15:30:27 WARN DAGScheduler: Broadcasting large task binary with size 78.2 MiB
                                                                                

Root-mean-square error = 1.7522023122708883


24/05/21 15:30:32 WARN DAGScheduler: Broadcasting large task binary with size 78.0 MiB
24/05/21 15:30:36 WARN DAGScheduler: Broadcasting large task binary with size 78.1 MiB
24/05/21 15:30:38 WARN DAGScheduler: Broadcasting large task binary with size 78.1 MiB
24/05/21 15:31:09 WARN DAGScheduler: Broadcasting large task binary with size 78.0 MiB
24/05/21 15:31:26 WARN DAGScheduler: Broadcasting large task binary with size 78.1 MiB
24/05/21 15:31:39 WARN DAGScheduler: Broadcasting large task binary with size 78.2 MiB
24/05/21 15:31:50 WARN DAGScheduler: Broadcasting large task binary with size 78.2 MiB
[Stage 276:>                                                        (0 + 3) / 3]

Mean absolute error = 1.4196673197396363


                                                                                

(1.7522023122708883, 1.4196673197396363)

In [20]:
get_metrics(validation, model)

24/05/21 15:31:55 WARN DAGScheduler: Broadcasting large task binary with size 78.0 MiB
24/05/21 15:32:00 WARN DAGScheduler: Broadcasting large task binary with size 78.1 MiB
24/05/21 15:32:03 WARN DAGScheduler: Broadcasting large task binary with size 78.1 MiB
ERROR:root:KeyboardInterrupt while sending command.][Stage 293:> (0 + 1) / 10]
Traceback (most recent call last):
  File "/Users/david/eafit/proyecto-integrador-semestre-2/.venv/lib/python3.11/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/david/eafit/proyecto-integrador-semestre-2/.venv/lib/python3.11/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/david/Library/Application Support/pdm/python/cpython@3.11.8/lib/python3.11/socket.py", line 706, in readinto
    return self.

KeyboardInterrupt: 

24/05/21 15:32:44 WARN DAGScheduler: Broadcasting large task binary with size 78.1 MiB
[Stage 295:>                                                        (0 + 8) / 9]

In [None]:
parameters = {
    "maxIter": [5, 10, 15],
    "regParam": [0.001, 0.01, 0.1],
    "rank": [1, 5, 10, 15, 20],
}
param_combinations = list(itertools.product(*parameters.values()))
tuning_parameters = [
    {"maxIter": maxIter, "regParam": regParam, "rank": rank}
    for maxIter, regParam, rank in param_combinations
]

In [None]:
corresponding_rmse, best_mae, best_parameters = float("inf"), float("inf"), None

for parameters_combination in tuning_parameters:
    print(f"Parameters: {parameters_combination}")
    model = train_model(**parameters_combination)
    rmse, mae = get_metrics(validation, model)
    print("-----------------------------------------")
    if mae < best_mae:
        best_mae = mae
        corresponding_rmse = rmse
        best_parameters = parameters_combination

Parameters: {'maxIter': 5, 'regParam': 0.001, 'rank': 1}


                                                                                

Predictions count: 4


                                                                                

Root-mean-square error = 4.3301270190120436


                                                                                

Mean absolute error = 3.7500139474868774
-----------------------------------------
Parameters: {'maxIter': 5, 'regParam': 0.001, 'rank': 5}


                                                                                

Predictions count: 4


                                                                                

Root-mean-square error = 4.616422169189436


                                                                                

Mean absolute error = 4.569283176213503
-----------------------------------------
Parameters: {'maxIter': 5, 'regParam': 0.001, 'rank': 10}


                                                                                

Predictions count: 4


                                                                                

Root-mean-square error = 2.919525251224768


                                                                                

Mean absolute error = 2.559095799922943
-----------------------------------------
Parameters: {'maxIter': 5, 'regParam': 0.001, 'rank': 15}


                                                                                

Predictions count: 4


                                                                                

Root-mean-square error = 4.0146970861483835


                                                                                

Mean absolute error = 3.9842836409807205
-----------------------------------------
Parameters: {'maxIter': 5, 'regParam': 0.001, 'rank': 20}


                                                                                

Predictions count: 4


                                                                                

Root-mean-square error = 3.672992896437066


                                                                                

Mean absolute error = 3.6547483801841736
-----------------------------------------
Parameters: {'maxIter': 5, 'regParam': 0.01, 'rank': 1}


                                                                                

Predictions count: 4


                                                                                

Root-mean-square error = 4.330127022468075


                                                                                

Mean absolute error = 3.75008761882782
-----------------------------------------
Parameters: {'maxIter': 5, 'regParam': 0.01, 'rank': 5}


                                                                                

Predictions count: 4


                                                                                

Root-mean-square error = 4.4846874171365485


                                                                                

Mean absolute error = 4.376429155468941
-----------------------------------------
Parameters: {'maxIter': 5, 'regParam': 0.01, 'rank': 10}


                                                                                

Predictions count: 4


                                                                                

Root-mean-square error = 2.9121950187173624


                                                                                

Mean absolute error = 2.6324115097522736
-----------------------------------------
Parameters: {'maxIter': 5, 'regParam': 0.01, 'rank': 15}


                                                                                

Predictions count: 4


                                                                                

Root-mean-square error = 3.9362778434071273


                                                                                

Mean absolute error = 3.9068188816308975
-----------------------------------------
Parameters: {'maxIter': 5, 'regParam': 0.01, 'rank': 20}


                                                                                

Predictions count: 4


                                                                                

Root-mean-square error = 3.6034940870812426


                                                                                

Mean absolute error = 3.5925195515155792
-----------------------------------------
Parameters: {'maxIter': 5, 'regParam': 0.1, 'rank': 1}


                                                                                

Predictions count: 4


                                                                                

Root-mean-square error = 4.330131940579437


                                                                                

Mean absolute error = 3.753264307975769
-----------------------------------------
Parameters: {'maxIter': 5, 'regParam': 0.1, 'rank': 5}


                                                                                

Predictions count: 4


                                                                                

Root-mean-square error = 4.481431190142691


                                                                                

Mean absolute error = 4.400399953126907
-----------------------------------------
Parameters: {'maxIter': 5, 'regParam': 0.1, 'rank': 10}


                                                                                

Predictions count: 4


                                                                                

Root-mean-square error = 2.913106860372624


                                                                                

Mean absolute error = 2.901776134967804
-----------------------------------------
Parameters: {'maxIter': 5, 'regParam': 0.1, 'rank': 15}


                                                                                

Predictions count: 4


                                                                                

Root-mean-square error = 3.787881158185574


                                                                                

Mean absolute error = 3.743252396583557
-----------------------------------------
Parameters: {'maxIter': 5, 'regParam': 0.1, 'rank': 20}


                                                                                

Predictions count: 4


                                                                                

Root-mean-square error = 3.4276461778821967


                                                                                

Mean absolute error = 3.423447072505951
-----------------------------------------
Parameters: {'maxIter': 10, 'regParam': 0.001, 'rank': 1}


                                                                                

Predictions count: 4


                                                                                

Root-mean-square error = 4.330127023664476


                                                                                

Mean absolute error = 3.750101327896118
-----------------------------------------
Parameters: {'maxIter': 10, 'regParam': 0.001, 'rank': 5}


                                                                                

Predictions count: 4


                                                                                

Root-mean-square error = 4.578418372064778


                                                                                

Mean absolute error = 4.517133224755526
-----------------------------------------
Parameters: {'maxIter': 10, 'regParam': 0.001, 'rank': 10}


                                                                                

Predictions count: 4


                                                                                

Root-mean-square error = 2.911878563214334


                                                                                

Mean absolute error = 2.5620445907115936
-----------------------------------------
Parameters: {'maxIter': 10, 'regParam': 0.001, 'rank': 15}


                                                                                

Predictions count: 4


                                                                                

Root-mean-square error = 4.014912344807674


                                                                                

Mean absolute error = 3.9846118688583374
-----------------------------------------
Parameters: {'maxIter': 10, 'regParam': 0.001, 'rank': 20}


                                                                                

Predictions count: 4


                                                                                

Root-mean-square error = 3.6589380853726627


                                                                                

Mean absolute error = 3.6419811695814133
-----------------------------------------
Parameters: {'maxIter': 10, 'regParam': 0.01, 'rank': 1}


                                                                                

Predictions count: 4


                                                                                

Root-mean-square error = 4.330127405259571


                                                                                

Mean absolute error = 3.7509145736694336
-----------------------------------------
Parameters: {'maxIter': 10, 'regParam': 0.01, 'rank': 5}


                                                                                

Predictions count: 4


                                                                                

Root-mean-square error = 4.48577939119411


                                                                                

Mean absolute error = 4.3827923610806465
-----------------------------------------
Parameters: {'maxIter': 10, 'regParam': 0.01, 'rank': 10}


                                                                                

Predictions count: 4


                                                                                

Root-mean-square error = 2.879065321215872


                                                                                

Mean absolute error = 2.6633324921131134
-----------------------------------------
Parameters: {'maxIter': 10, 'regParam': 0.01, 'rank': 15}


                                                                                

Predictions count: 4


                                                                                

Root-mean-square error = 3.908072406941558


                                                                                

Mean absolute error = 3.8844280689954758
-----------------------------------------
Parameters: {'maxIter': 10, 'regParam': 0.01, 'rank': 20}


                                                                                

Predictions count: 4


                                                                                

Root-mean-square error = 3.564824781816539


                                                                                

Mean absolute error = 3.557658702135086
-----------------------------------------
Parameters: {'maxIter': 10, 'regParam': 0.1, 'rank': 1}


                                                                                

Predictions count: 4


                                                                                

Root-mean-square error = 4.330133835854405


                                                                                

Mean absolute error = 3.753841757774353
-----------------------------------------
Parameters: {'maxIter': 10, 'regParam': 0.1, 'rank': 5}


                                                                                

Predictions count: 4


                                                                                

Root-mean-square error = 4.439127278749827


                                                                                

Mean absolute error = 4.366970852017403
-----------------------------------------
Parameters: {'maxIter': 10, 'regParam': 0.1, 'rank': 10}


                                                                                

Predictions count: 4


                                                                                

Root-mean-square error = 2.8019069809063755


                                                                                

Mean absolute error = 2.795178174972534
-----------------------------------------
Parameters: {'maxIter': 10, 'regParam': 0.1, 'rank': 15}


                                                                                

Predictions count: 4


                                                                                

Root-mean-square error = 3.7091877280803582


                                                                                

Mean absolute error = 3.664081960916519
-----------------------------------------
Parameters: {'maxIter': 10, 'regParam': 0.1, 'rank': 20}


                                                                                

Predictions count: 4


                                                                                

Root-mean-square error = 3.293768541273291


                                                                                

Mean absolute error = 3.283213973045349
-----------------------------------------
Parameters: {'maxIter': 15, 'regParam': 0.001, 'rank': 1}


                                                                                

Predictions count: 4


                                                                                

Root-mean-square error = 4.330127035370128


                                                                                

Mean absolute error = 3.750188708305359
-----------------------------------------
Parameters: {'maxIter': 15, 'regParam': 0.001, 'rank': 5}


                                                                                

Predictions count: 4


                                                                                

Root-mean-square error = 4.537779454769281


                                                                                

Mean absolute error = 4.457998722791672
-----------------------------------------
Parameters: {'maxIter': 15, 'regParam': 0.001, 'rank': 10}


                                                                                

Predictions count: 4


                                                                                

Root-mean-square error = 2.9047901891394243


                                                                                

Mean absolute error = 2.5650973469018936
-----------------------------------------
Parameters: {'maxIter': 15, 'regParam': 0.001, 'rank': 15}


                                                                                

Predictions count: 4


                                                                                

Root-mean-square error = 4.028026808481792


                                                                                

Mean absolute error = 3.997424766421318
-----------------------------------------
Parameters: {'maxIter': 15, 'regParam': 0.001, 'rank': 20}


                                                                                

Predictions count: 4


                                                                                

Root-mean-square error = 3.6437341128662077


                                                                                

Mean absolute error = 3.6280564665794373
-----------------------------------------
Parameters: {'maxIter': 15, 'regParam': 0.01, 'rank': 1}


                                                                                

Predictions count: 4


                                                                                

Root-mean-square error = 4.3301284393981225


                                                                                

Mean absolute error = 3.7517536878585815
-----------------------------------------
Parameters: {'maxIter': 15, 'regParam': 0.01, 'rank': 5}


                                                                                

Predictions count: 4


                                                                                

Root-mean-square error = 4.522487825400155


                                                                                

Mean absolute error = 4.444125659763813
-----------------------------------------
Parameters: {'maxIter': 15, 'regParam': 0.01, 'rank': 10}


                                                                                

Predictions count: 4


                                                                                

Root-mean-square error = 2.847764859347703


                                                                                

Mean absolute error = 2.6793360710144043
-----------------------------------------
Parameters: {'maxIter': 15, 'regParam': 0.01, 'rank': 15}


                                                                                

Predictions count: 4


                                                                                

Root-mean-square error = 3.8856750235006654


                                                                                

Mean absolute error = 3.8689254373311996
-----------------------------------------
Parameters: {'maxIter': 15, 'regParam': 0.01, 'rank': 20}


                                                                                

Predictions count: 4


                                                                                

Root-mean-square error = 3.5263073218585816


                                                                                

Mean absolute error = 3.5216084420681
-----------------------------------------
Parameters: {'maxIter': 15, 'regParam': 0.1, 'rank': 1}


                                                                                

Predictions count: 4


                                                                                

Root-mean-square error = 4.330202602275972


                                                                                

Mean absolute error = 3.7627923488616943
-----------------------------------------
Parameters: {'maxIter': 15, 'regParam': 0.1, 'rank': 5}


                                                                                

Predictions count: 4


                                                                                

Root-mean-square error = 4.432775306737427


                                                                                

Mean absolute error = 4.368162877857685
-----------------------------------------
Parameters: {'maxIter': 15, 'regParam': 0.1, 'rank': 10}


                                                                                

Predictions count: 4


                                                                                

Root-mean-square error = 2.823930363401826


                                                                                

Mean absolute error = 2.7997756600379944
-----------------------------------------
Parameters: {'maxIter': 15, 'regParam': 0.1, 'rank': 15}


                                                                                

Predictions count: 4


                                                                                

Root-mean-square error = 3.6516932437752825


                                                                                

Mean absolute error = 3.6017852127552032
-----------------------------------------
Parameters: {'maxIter': 15, 'regParam': 0.1, 'rank': 20}


                                                                                

Predictions count: 4


                                                                                

Root-mean-square error = 3.2094589950228927




Mean absolute error = 3.1920211017131805
-----------------------------------------


                                                                                

In [None]:
print(f"Best parameters: {best_parameters}")
print(f"Best MAE: {mae}")
print(f"RMSE corresponding to the best MAE: {corresponding_rmse}")

Best parameters: {'maxIter': 5, 'regParam': 0.001, 'rank': 10}
Best MAE: 3.1920211017131805
RMSE corresponding to the best MAE: 2.919525251224768


24/05/21 03:45:56 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 876097 ms exceeds timeout 120000 ms
24/05/21 03:45:56 WARN SparkContext: Killing executors is not supported by current scheduler.
24/05/21 03:45:59 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$