In [31]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import StringIndexer
from pyspark.ml.recommendation import ALS
from pyspark.sql import SparkSession

from pyspark.sql.functions import col, row_number
from pyspark.sql.window import Window
import itertools

In [32]:
spark = (
    SparkSession.builder.appName("Collaborative Filtering")  # type: ignore
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.3.4")
    .config("fs.s3a.endpoint", "s3.us-east-2.amazonaws.com")
    .config(
        "fs.s3a.aws.credentials.provider",
        "com.amazonaws.auth.DefaultAWSCredentialsProviderChain",
    )
    .getOrCreate()
)

In [33]:
spark

In [34]:
# Leer el archivo Parquet
data = spark.read.parquet("s3a://amazon-reviews-eafit/sample-for-model/")

In [35]:
data.count()

                                                                                

4530

In [36]:
indexer = StringIndexer(inputCol="product_id", outputCol="item_id")

data = indexer.fit(data).transform(data)

                                                                                

In [37]:
windowSpec = Window.partitionBy("customer_id").orderBy(col("review_date").desc())
data = data.withColumn("index", row_number().over(windowSpec))

training = data.where(col("index") >= 3)
validation = data.where(col("index") == 2)
test = data.where(col("index") <= 1)

In [38]:
training_count = training.count()
validation_count = validation.count()
test_count = test.count()

print(f"Training count: {training_count}")
print(f"Validation count: {validation_count}")
print(f"Test count: {test_count}")



Training count: 3530
Validation count: 500
Test count: 500


                                                                                

In [39]:
def train_model(maxIter=5, regParam=0.01):
    als = ALS(
        maxIter=maxIter,
        regParam=regParam,
        userCol="customer_id",
        itemCol="item_id",
        ratingCol="star_rating",
        seed=42,
        nonnegative=True,
        rank=1,
        coldStartStrategy="drop",
    )
    model = als.fit(training)
    return model

In [40]:
model = train_model()

                                                                                

In [41]:
def get_metrics(dataset, model):
    predictions = model.transform(dataset)

    predictions.show()

    print(f"Predictions count: {predictions.count()}")
    evaluator_rmse = RegressionEvaluator(
        metricName="rmse", labelCol="star_rating", predictionCol="prediction"
    )
    rmse = evaluator_rmse.evaluate(predictions)
    print(f"Root-mean-square error = {rmse}")

    evaluator_mae = RegressionEvaluator(
        metricName="mae", labelCol="star_rating", predictionCol="prediction"
    )
    mae = evaluator_mae.evaluate(predictions)
    print(f"Mean absolute error = {mae}")

    return rmse, mae

In [42]:
get_metrics(test, model)

+-----------+----------+-----------+--------------------+-----------+-----------------+-------+-----+----------+
|customer_id|product_id|star_rating|            category|review_date|verified_purchase|item_id|index|prediction|
+-----------+----------+-----------+--------------------+-----------+-----------------+-------+-----+----------+
|    2205788|B00B2V66VS|        5.0|         mobile_apps| 2013-10-06|                Y|   12.0|    1|  4.992092|
|   10135689|B00HZ3C4N6|        5.0|digital_video_dow...| 2015-07-05|                Y|  224.0|    1| 5.0102386|
|   10350879|B00FAPF5U0|        5.0|         mobile_apps| 2015-08-19|                Y|    0.0|    1| 5.0374856|
|   10968055|B00449U3K0|        4.0|            wireless| 2015-06-15|                Y|   87.0|    1|       0.0|
|   11715394|B00LMQVTU6|        5.0|                toys| 2015-05-30|                Y|  247.0|    1|       0.0|
|   13251775|B000068PBT|        5.0|              beauty| 2015-07-17|                Y|   30.0| 

                                                                                

Root-mean-square error = 3.183111906170139
Mean absolute error = 2.3672051555232


                                                                                

(3.183111906170139, 2.3672051555232)

In [43]:
get_metrics(validation, model)

                                                                                

+-----------+----------+-----------+--------------------+-----------+-----------------+-------+-----+----------+
|customer_id|product_id|star_rating|            category|review_date|verified_purchase|item_id|index|prediction|
+-----------+----------+-----------+--------------------+-----------+-----------------+-------+-----+----------+
|     348087|B0097EWK3A|        5.0|         mobile_apps| 2015-04-12|                Y|  162.0|    2|       0.0|
|    2205788|B00B2V66VS|        5.0|        multilingual| 2013-10-06|                Y|   12.0|    2|  4.992092|
|    5062598|B00BVYNSLC|        5.0|digital_video_dow...| 2015-08-02|                Y|  195.0|    2|  4.997807|
|    5972045|B007QUUJMG|        1.0|digital_ebook_pur...| 2013-07-03|                Y|  135.0|    2|       0.0|
|    7483445|B009IRVSVQ|        5.0|               music| 2015-07-20|                Y|  173.0|    2|  4.997668|
|    9864714|B00BR4QNJ0|        5.0|         mobile_apps| 2013-03-29|                Y|  194.0| 

                                                                                

Root-mean-square error = 3.416342813933422
Mean absolute error = 2.5102251672744753


                                                                                

(3.416342813933422, 2.5102251672744753)

In [44]:
parameters = {"maxIter": [5, 10, 15], "regParam": [0.001, 0.01, 0.1]}
param_combinations = list(itertools.product(*parameters.values()))
tuning_parameters = [
    {"maxIter": maxIter, "regParam": regParam}
    for maxIter, regParam in param_combinations
]

In [45]:
corresponding_rmse, best_mae, best_parameters = float("inf"), float("inf"), None

for parameters_combination in tuning_parameters:
    print(f"Parameters: {parameters_combination}")
    model = train_model(**parameters_combination)
    rmse, mae = get_metrics(validation, model)
    print(f"MAE: {mae}")
    print(f"RMSE: {rmse}")
    print("-----------------------------------------")
    if mae < best_mae:
        best_mae = mae
        corresponding_rmse = rmse
        best_parameters = parameters_combination

Parameters: {'maxIter': 5, 'regParam': 0.001}


                                                                                

+-----------+----------+-----------+--------------------+-----------+-----------------+-------+-----+----------+
|customer_id|product_id|star_rating|            category|review_date|verified_purchase|item_id|index|prediction|
+-----------+----------+-----------+--------------------+-----------+-----------------+-------+-----+----------+
|     348087|B0097EWK3A|        5.0|         mobile_apps| 2015-04-12|                Y|  162.0|    2|       0.0|
|    2205788|B00B2V66VS|        5.0|        multilingual| 2013-10-06|                Y|   12.0|    2| 4.9992356|
|    5062598|B00BVYNSLC|        5.0|digital_video_dow...| 2015-08-02|                Y|  195.0|    2|  4.999798|
|    5972045|B007QUUJMG|        1.0|digital_ebook_pur...| 2013-07-03|                Y|  135.0|    2|       0.0|
|    7483445|B009IRVSVQ|        5.0|               music| 2015-07-20|                Y|  173.0|    2|  4.999785|
|    9864714|B00BR4QNJ0|        5.0|         mobile_apps| 2013-03-29|                Y|  194.0| 

                                                                                

+-----------+----------+-----------+--------------------+-----------+-----------------+-------+-----+----------+
|customer_id|product_id|star_rating|            category|review_date|verified_purchase|item_id|index|prediction|
+-----------+----------+-----------+--------------------+-----------+-----------------+-------+-----+----------+
|     348087|B0097EWK3A|        5.0|         mobile_apps| 2015-04-12|                Y|  162.0|    2|       0.0|
|    2205788|B00B2V66VS|        5.0|        multilingual| 2013-10-06|                Y|   12.0|    2|  4.992092|
|    5062598|B00BVYNSLC|        5.0|digital_video_dow...| 2015-08-02|                Y|  195.0|    2|  4.997807|
|    5972045|B007QUUJMG|        1.0|digital_ebook_pur...| 2013-07-03|                Y|  135.0|    2|       0.0|
|    7483445|B009IRVSVQ|        5.0|               music| 2015-07-20|                Y|  173.0|    2|  4.997668|
|    9864714|B00BR4QNJ0|        5.0|         mobile_apps| 2013-03-29|                Y|  194.0| 

                                                                                

Predictions count: 50
Root-mean-square error = 3.416342813933422


                                                                                

Mean absolute error = 2.5102251672744753
MAE: 2.5102251672744753
RMSE: 3.416342813933422
-----------------------------------------
Parameters: {'maxIter': 5, 'regParam': 0.1}


                                                                                

+-----------+----------+-----------+--------------------+-----------+-----------------+-------+-----+----------+
|customer_id|product_id|star_rating|            category|review_date|verified_purchase|item_id|index|prediction|
+-----------+----------+-----------+--------------------+-----------+-----------------+-------+-----+----------+
|     348087|B0097EWK3A|        5.0|         mobile_apps| 2015-04-12|                Y|  162.0|    2|       0.0|
|    2205788|B00B2V66VS|        5.0|        multilingual| 2013-10-06|                Y|   12.0|    2| 4.8923807|
|    5062598|B00BVYNSLC|        5.0|digital_video_dow...| 2015-08-02|                Y|  195.0|    2|  4.961011|
|    5972045|B007QUUJMG|        1.0|digital_ebook_pur...| 2013-07-03|                Y|  135.0|    2|       0.0|
|    7483445|B009IRVSVQ|        5.0|               music| 2015-07-20|                Y|  173.0|    2|  4.958635|
|    9864714|B00BR4QNJ0|        5.0|         mobile_apps| 2013-03-29|                Y|  194.0| 

                                                                                

Mean absolute error = 2.4502064180374146
MAE: 2.4502064180374146
RMSE: 3.3560882144612814
-----------------------------------------
Parameters: {'maxIter': 10, 'regParam': 0.001}


                                                                                

+-----------+----------+-----------+--------------------+-----------+-----------------+-------+-----+----------+
|customer_id|product_id|star_rating|            category|review_date|verified_purchase|item_id|index|prediction|
+-----------+----------+-----------+--------------------+-----------+-----------------+-------+-----+----------+
|     348087|B0097EWK3A|        5.0|         mobile_apps| 2015-04-12|                Y|  162.0|    2|       0.0|
|    2205788|B00B2V66VS|        5.0|        multilingual| 2013-10-06|                Y|   12.0|    2|   4.99867|
|    5062598|B00BVYNSLC|        5.0|digital_video_dow...| 2015-08-02|                Y|  195.0|    2|  4.999796|
|    5972045|B007QUUJMG|        1.0|digital_ebook_pur...| 2013-07-03|                Y|  135.0|    2|       0.0|
|    7483445|B009IRVSVQ|        5.0|               music| 2015-07-20|                Y|  173.0|    2| 4.9997835|
|    9864714|B00BR4QNJ0|        5.0|         mobile_apps| 2013-03-29|                Y|  194.0| 

                                                                                

Predictions count: 50


                                                                                

Root-mean-square error = 3.5483022544871874
Mean absolute error = 2.5963638687133788
MAE: 2.5963638687133788
RMSE: 3.5483022544871874
-----------------------------------------
Parameters: {'maxIter': 10, 'regParam': 0.01}


                                                                                

+-----------+----------+-----------+--------------------+-----------+-----------------+-------+-----+----------+
|customer_id|product_id|star_rating|            category|review_date|verified_purchase|item_id|index|prediction|
+-----------+----------+-----------+--------------------+-----------+-----------------+-------+-----+----------+
|     348087|B0097EWK3A|        5.0|         mobile_apps| 2015-04-12|                Y|  162.0|    2|       0.0|
|    2205788|B00B2V66VS|        5.0|        multilingual| 2013-10-06|                Y|   12.0|    2|  4.986181|
|    5062598|B00BVYNSLC|        5.0|digital_video_dow...| 2015-08-02|                Y|  195.0|    2|  4.997617|
|    5972045|B007QUUJMG|        1.0|digital_ebook_pur...| 2013-07-03|                Y|  135.0|    2|       0.0|
|    7483445|B009IRVSVQ|        5.0|               music| 2015-07-20|                Y|  173.0|    2|  4.997466|
|    9864714|B00BR4QNJ0|        5.0|         mobile_apps| 2013-03-29|                Y|  194.0| 

                                                                                

Mean absolute error = 2.5652522611618043
MAE: 2.5652522611618043
RMSE: 3.4914851117490766
-----------------------------------------
Parameters: {'maxIter': 10, 'regParam': 0.1}


                                                                                

+-----------+----------+-----------+--------------------+-----------+-----------------+-------+-----+----------+
|customer_id|product_id|star_rating|            category|review_date|verified_purchase|item_id|index|prediction|
+-----------+----------+-----------+--------------------+-----------+-----------------+-------+-----+----------+
|     348087|B0097EWK3A|        5.0|         mobile_apps| 2015-04-12|                Y|  162.0|    2|       0.0|
|    2205788|B00B2V66VS|        5.0|        multilingual| 2013-10-06|                Y|   12.0|    2| 4.7995124|
|    5062598|B00BVYNSLC|        5.0|digital_video_dow...| 2015-08-02|                Y|  195.0|    2|   4.94509|
|    5972045|B007QUUJMG|        1.0|digital_ebook_pur...| 2013-07-03|                Y|  135.0|    2|       0.0|
|    7483445|B009IRVSVQ|        5.0|               music| 2015-07-20|                Y|  173.0|    2| 4.9419565|
|    9864714|B00BR4QNJ0|        5.0|         mobile_apps| 2013-03-29|                Y|  194.0| 

                                                                                

Root-mean-square error = 3.356361003874185


                                                                                

Mean absolute error = 2.4568858695030213
MAE: 2.4568858695030213
RMSE: 3.356361003874185
-----------------------------------------
Parameters: {'maxIter': 15, 'regParam': 0.001}


                                                                                

+-----------+----------+-----------+--------------------+-----------+-----------------+-------+-----+----------+
|customer_id|product_id|star_rating|            category|review_date|verified_purchase|item_id|index|prediction|
+-----------+----------+-----------+--------------------+-----------+-----------------+-------+-----+----------+
|     348087|B0097EWK3A|        5.0|         mobile_apps| 2015-04-12|                Y|  162.0|    2|       0.0|
|    2205788|B00B2V66VS|        5.0|        multilingual| 2013-10-06|                Y|   12.0|    2|  4.998106|
|    5062598|B00BVYNSLC|        5.0|digital_video_dow...| 2015-08-02|                Y|  195.0|    2| 4.9997945|
|    5972045|B007QUUJMG|        1.0|digital_ebook_pur...| 2013-07-03|                Y|  135.0|    2|       0.0|
|    7483445|B009IRVSVQ|        5.0|               music| 2015-07-20|                Y|  173.0|    2|  4.999781|
|    9864714|B00BR4QNJ0|        5.0|         mobile_apps| 2013-03-29|                Y|  194.0| 

                                                                                

Root-mean-square error = 3.567456530619152


                                                                                

Mean absolute error = 2.601723771095276
MAE: 2.601723771095276
RMSE: 3.567456530619152
-----------------------------------------
Parameters: {'maxIter': 15, 'regParam': 0.01}


                                                                                

+-----------+----------+-----------+--------------------+-----------+-----------------+-------+-----+----------+
|customer_id|product_id|star_rating|            category|review_date|verified_purchase|item_id|index|prediction|
+-----------+----------+-----------+--------------------+-----------+-----------------+-------+-----+----------+
|     348087|B0097EWK3A|        5.0|         mobile_apps| 2015-04-12|                Y|  162.0|    2|       0.0|
|    2205788|B00B2V66VS|        5.0|        multilingual| 2013-10-06|                Y|   12.0|    2| 4.9802094|
|    5062598|B00BVYNSLC|        5.0|digital_video_dow...| 2015-08-02|                Y|  195.0|    2| 4.9974284|
|    5972045|B007QUUJMG|        1.0|digital_ebook_pur...| 2013-07-03|                Y|  135.0|    2|       0.0|
|    7483445|B009IRVSVQ|        5.0|               music| 2015-07-20|                Y|  173.0|    2| 4.9972663|
|    9864714|B00BR4QNJ0|        5.0|         mobile_apps| 2013-03-29|                Y|  194.0| 

                                                                                

+-----------+----------+-----------+--------------------+-----------+-----------------+-------+-----+----------+
|customer_id|product_id|star_rating|            category|review_date|verified_purchase|item_id|index|prediction|
+-----------+----------+-----------+--------------------+-----------+-----------------+-------+-----+----------+
|     348087|B0097EWK3A|        5.0|         mobile_apps| 2015-04-12|                Y|  162.0|    2|       0.0|
|    2205788|B00B2V66VS|        5.0|        multilingual| 2013-10-06|                Y|   12.0|    2|  4.708966|
|    5062598|B00BVYNSLC|        5.0|digital_video_dow...| 2015-08-02|                Y|  195.0|    2| 4.9323997|
|    5972045|B007QUUJMG|        1.0|digital_ebook_pur...| 2013-07-03|                Y|  135.0|    2|       0.0|
|    7483445|B009IRVSVQ|        5.0|               music| 2015-07-20|                Y|  173.0|    2|  4.928829|
|    9864714|B00BR4QNJ0|        5.0|         mobile_apps| 2013-03-29|                Y|  194.0| 

                                                                                

Predictions count: 50


                                                                                

Root-mean-square error = 3.353017782003737
Mean absolute error = 2.4632883882522583
MAE: 2.4632883882522583
RMSE: 3.353017782003737
-----------------------------------------


In [46]:
print(f"Best parameters: {best_parameters}")
print(f"Best MAE: {mae}")
print(f"RMSE corresponding to the best MAE: {corresponding_rmse}")

Best parameters: {'maxIter': 5, 'regParam': 0.1}
Best MAE: 2.4632883882522583
RMSE corresponding to the best MAE: 3.3560882144612814


In [52]:
users = validation.filter(validation.customer_id == 10135689)
predictions = model.transform(users)
predictions.show()



+-----------+----------+-----------+--------------------+-----------+-----------------+-------+-----+----------+
|customer_id|product_id|star_rating|            category|review_date|verified_purchase|item_id|index|prediction|
+-----------+----------+-----------+--------------------+-----------+-----------------+-------+-----+----------+
|   10135689|B00DFFHUUA|        2.0|digital_video_dow...| 2015-07-05|                Y|  202.0|    2| 1.9656565|
+-----------+----------+-----------+--------------------+-----------+-----------------+-------+-----+----------+



                                                                                

In [69]:
result = model.recommendForUserSubset(users, 10)
result.select("recommendations").show(truncate=False)

                                                                                

+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|recommendations                                                                                                                                                                                                                                                                                                                                                                                                                                               