In [19]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import StringIndexer
from pyspark.ml.recommendation import ALS
from pyspark.sql import SparkSession

from pyspark.sql.functions import col, row_number
from pyspark.sql.window import Window
import itertools

In [20]:
spark = (
    SparkSession.builder.appName("Collaborative Filtering")  # type: ignore
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.3.4")
    .config("fs.s3a.endpoint", "s3.us-east-2.amazonaws.com")
    .config(
        "fs.s3a.aws.credentials.provider",
        "com.amazonaws.auth.DefaultAWSCredentialsProviderChain",
    )
    .getOrCreate()
)

In [21]:
spark

In [22]:
# Leer el archivo Parquet
data = spark.read.parquet("s3a://amazon-reviews-eafit/sample-for-model/")

In [23]:
data.count()

                                                                                

4530

In [24]:
indexer = StringIndexer(inputCol="product_id", outputCol="item_id")

data = indexer.fit(data).transform(data)

                                                                                

## Split de la data

Se asegura que haya un dato en test, uno en validation y uno en training


In [25]:
def split_data(data):
    windowSpec = Window.partitionBy("customer_id").orderBy(col("review_date").desc())
    data = data.withColumn("index", row_number().over(windowSpec))

    training = data.where(col("index") >= 3)
    validation = data.where(col("index") == 2)
    test = data.where(col("index") <= 1)

    return training, validation, test

In [26]:
training, validation, test = split_data(data)

training_count = training.count()
validation_count = validation.count()
test_count = test.count()

print(f"Training count: {training_count}")
print(f"Validation count: {validation_count}")
print(f"Test count: {test_count}")



Training count: 3530
Validation count: 500
Test count: 500


                                                                                

In [27]:
def train_model(maxIter=5, regParam=0.01):
    als = ALS(
        maxIter=maxIter,
        regParam=regParam,
        userCol="customer_id",
        itemCol="item_id",
        ratingCol="star_rating",
        seed=42,
        nonnegative=True,
        rank=1,
        coldStartStrategy="drop",
    )
    model = als.fit(training)
    return model

In [28]:
model = train_model()

                                                                                

In [29]:
def get_metrics(dataset, model):
    predictions = model.transform(dataset)

    print(f"Predictions count: {predictions.count()}")
    evaluator_rmse = RegressionEvaluator(
        metricName="rmse", labelCol="star_rating", predictionCol="prediction"
    )
    rmse = evaluator_rmse.evaluate(predictions)
    print(f"Root-mean-square error = {rmse}")

    evaluator_mae = RegressionEvaluator(
        metricName="mae", labelCol="star_rating", predictionCol="prediction"
    )
    mae = evaluator_mae.evaluate(predictions)
    print(f"Mean absolute error = {mae}")

    return rmse, mae

In [30]:
get_metrics(test, model)

Predictions count: 19


                                                                                

Root-mean-square error = 3.183111906170139
Mean absolute error = 2.3672051555232


(3.183111906170139, 2.3672051555232)

In [31]:
get_metrics(validation, model)

                                                                                

Predictions count: 50


                                                                                

Root-mean-square error = 3.416342813933422
Mean absolute error = 2.5102251672744753


                                                                                

(3.416342813933422, 2.5102251672744753)

In [32]:
parameters = {"maxIter": [5, 10, 15], "regParam": [0.001, 0.01, 0.1]}
param_combinations = list(itertools.product(*parameters.values()))
tuning_parameters = [
    {"maxIter": maxIter, "regParam": regParam}
    for maxIter, regParam in param_combinations
]

In [33]:
corresponding_rmse, best_mae, best_parameters = float("inf"), float("inf"), None

for parameters_combination in tuning_parameters:
    print(f"Parameters: {parameters_combination}")
    model = train_model(**parameters_combination)
    rmse, mae = get_metrics(validation, model)
    print(f"MAE: {mae}")
    print(f"RMSE: {rmse}")
    print("-----------------------------------------")
    if mae < best_mae:
        best_mae = mae
        corresponding_rmse = rmse
        best_parameters = parameters_combination

Parameters: {'maxIter': 5, 'regParam': 0.001}


                                                                                

Predictions count: 50


                                                                                

Root-mean-square error = 3.427657869332808


                                                                                

Mean absolute error = 2.5188874912261965
MAE: 2.5188874912261965
RMSE: 3.427657869332808
-----------------------------------------
Parameters: {'maxIter': 5, 'regParam': 0.01}


                                                                                

Predictions count: 50


                                                                                

Root-mean-square error = 3.416342813933422


                                                                                

Mean absolute error = 2.5102251672744753
MAE: 2.5102251672744753
RMSE: 3.416342813933422
-----------------------------------------
Parameters: {'maxIter': 5, 'regParam': 0.1}


                                                                                

Predictions count: 50
Root-mean-square error = 3.3560882144612814


                                                                                

Mean absolute error = 2.4502064180374146
MAE: 2.4502064180374146
RMSE: 3.3560882144612814
-----------------------------------------
Parameters: {'maxIter': 10, 'regParam': 0.001}


                                                                                

Predictions count: 50


                                                                                

Root-mean-square error = 3.5483022544871874


                                                                                

Mean absolute error = 2.5963638687133788
MAE: 2.5963638687133788
RMSE: 3.5483022544871874
-----------------------------------------
Parameters: {'maxIter': 10, 'regParam': 0.01}


                                                                                

Predictions count: 50


                                                                                

Root-mean-square error = 3.4914851117490766


                                                                                

Mean absolute error = 2.5652522611618043
MAE: 2.5652522611618043
RMSE: 3.4914851117490766
-----------------------------------------
Parameters: {'maxIter': 10, 'regParam': 0.1}


                                                                                

Predictions count: 50


                                                                                

Root-mean-square error = 3.356361003874185


                                                                                

Mean absolute error = 2.4568858695030213
MAE: 2.4568858695030213
RMSE: 3.356361003874185
-----------------------------------------
Parameters: {'maxIter': 15, 'regParam': 0.001}


                                                                                

Predictions count: 50
Root-mean-square error = 3.567456530619152


                                                                                

Mean absolute error = 2.601723771095276
MAE: 2.601723771095276
RMSE: 3.567456530619152
-----------------------------------------
Parameters: {'maxIter': 15, 'regParam': 0.01}


                                                                                

Predictions count: 50


                                                                                

Root-mean-square error = 3.4987247743688443


                                                                                

Mean absolute error = 2.5685915517807008
MAE: 2.5685915517807008
RMSE: 3.4987247743688443
-----------------------------------------
Parameters: {'maxIter': 15, 'regParam': 0.1}


                                                                                

Predictions count: 50


                                                                                

Root-mean-square error = 3.353017782003737
Mean absolute error = 2.4632883882522583
MAE: 2.4632883882522583
RMSE: 3.353017782003737
-----------------------------------------


In [34]:
print(f"Best parameters: {best_parameters}")
print(f"Best MAE: {mae}")
print(f"RMSE corresponding to the best MAE: {corresponding_rmse}")

Best parameters: {'maxIter': 5, 'regParam': 0.1}
Best MAE: 2.4632883882522583
RMSE corresponding to the best MAE: 3.3560882144612814


In [35]:
users = validation.filter(validation.customer_id == 10135689)
predictions = model.transform(users)
predictions.show()

+-----------+----------+-----------+--------------------+-----------+-----------------+-------+-----+----------+
|customer_id|product_id|star_rating|            category|review_date|verified_purchase|item_id|index|prediction|
+-----------+----------+-----------+--------------------+-----------+-----------------+-------+-----+----------+
|   10135689|B00DFFHUUA|        2.0|digital_video_dow...| 2015-07-05|                Y|  202.0|    2| 1.9656565|
+-----------+----------+-----------+--------------------+-----------+-----------------+-------+-----+----------+



                                                                                

In [36]:
result = model.recommendForUserSubset(users, 10)
result.select("recommendations").show(truncate=False)

                                                                                

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|recommendations                                                                                                                                                                        |
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[{3166, 6.6092134}, {2527, 6.6092134}, {2387, 6.6092134}, {1500, 6.6092134}, {75, 6.6092134}, {3002, 5.801481}, {2702, 5.801481}, {2700, 5.801481}, {2569, 5.801481}, {2483, 5.801481}]|
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+

