In [71]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import StringIndexer
from pyspark.ml.recommendation import ALS
from pyspark.sql import SparkSession

from pyspark.sql.functions import col, rank, expr
from pyspark.sql.window import Window
import itertools

In [72]:
# Build (or reuse) the Spark session shared by every cell below.
_session_builder = SparkSession.builder.appName("Collaborative Filtering")  # type: ignore

# hadoop-aws is requested as an extra package; the dataset is read from an s3a:// path.
_session_builder = _session_builder.config(
    "spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.3.4"
)

# Endpoint used for S3A requests.
_session_builder = _session_builder.config(
    "fs.s3a.endpoint", "s3.us-east-2.amazonaws.com"
)

# No access keys are hard-coded in this notebook; credential lookup is delegated
# to the provider class named here.
_session_builder = _session_builder.config(
    "fs.s3a.aws.credentials.provider",
    "com.amazonaws.auth.DefaultAWSCredentialsProviderChain",
)

spark = _session_builder.getOrCreate()

In [73]:
# Read the Parquet dataset from S3 into a Spark DataFrame.
data = spark.read.parquet("s3a://amazon-reviews-eafit/sample-for-model/")

In [74]:
# Print the column names, types and nullability of the loaded DataFrame.
data.printSchema()

root
 |-- customer_id: integer (nullable = true)
 |-- product_id: string (nullable = true)
 |-- star_rating: float (nullable = true)
 |-- category: string (nullable = true)
 |-- review_date: string (nullable = true)
 |-- verified_purchase: string (nullable = true)
 |-- review_id: string (nullable = true)



In [75]:
# Number of rows in the loaded DataFrame (this is an action, so it runs a Spark job).
data.count()

                                                                                

5366

In [76]:
# Map each string product_id to a numeric item_id column; item_id is the column
# later passed to ALS as itemCol.
indexer = StringIndexer(inputCol="product_id", outputCol="item_id")

# `data` is rebound to its own transformed version, so running this cell a second
# time would hand StringIndexer a DataFrame that already has an item_id column,
# which it rejects. The guard turns a re-run into a no-op instead of an error.
if "item_id" not in data.columns:
    data = indexer.fit(data).transform(data)

                                                                                

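## Partición de los datos

La partición se hace dentro de cada cliente (`customer_id`): de cada cliente se enmascara aproximadamente el 20 % de sus productos (redondeando hacia abajo), que pasa al conjunto de test; el resto de sus productos queda en el conjunto de entrenamiento.
Se repite el mismo proceso sobre los datos de entrenamiento para obtener el conjunto de validación.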


In [77]:
def split_data(data, percent_items_to_mask=0.2):
    """Split interactions per customer into a kept part and a held-out part.

    For every ``customer_id`` the rows are ranked by ``product_id`` in
    descending order, and the top ``int(n * percent_items_to_mask)`` ranks
    (``n`` being that customer's row count) are held out. Because the product
    is truncated to an integer, customers with too few rows contribute nothing
    to the held-out part. ``rank()`` is used, so rows of one customer that
    share a ``product_id`` receive the same rank and land on the same side.

    Args:
        data: Spark DataFrame with at least ``customer_id`` and ``product_id``.
        percent_items_to_mask: Fraction of each customer's rows to hold out.

    Returns:
        ``(training, test)`` DataFrames. Both keep the helper columns
        ``number_of_products``, ``number_of_products_to_mask`` and
        ``product_rank`` added here.
    """
    per_customer_ranking = Window.partitionBy("customer_id").orderBy(
        col("product_id").desc()
    )

    annotated = (
        data.withColumn(
            "number_of_products", expr("count(*) over (partition by customer_id)")
        )
        .withColumn(
            "number_of_products_to_mask",
            (col("number_of_products") * percent_items_to_mask).cast("int"),
        )
        .withColumn("product_rank", rank().over(per_customer_ranking))
    )

    # Rows whose rank falls within the per-customer mask budget are held out.
    is_held_out = col("product_rank") <= col("number_of_products_to_mask")
    is_kept = col("product_rank") > col("number_of_products_to_mask")

    return annotated.filter(is_kept), annotated.filter(is_held_out)

In [78]:
# First split: hold out the test rows. Second split: carve the validation rows
# out of what remained after the first split.
training_and_validation, test = split_data(data, percent_items_to_mask=0.2)
training, validation = split_data(training_and_validation, percent_items_to_mask=0.2)

# Count the three splits in the same order they are reported below.
training_count, validation_count, test_count = (
    subset.count() for subset in (training, validation, test)
)

print(f"Training count: {training_count}")
print(f"Validation count: {validation_count}")
print(f"Test count: {test_count}")



Training count: 3844
Validation count: 669
Test count: 853


                                                                                

In [79]:
def train_model(maxIter=5, regParam=0.1, rank=1, training_data=None):
    """Fit an ALS collaborative-filtering model.

    Args:
        maxIter: Value passed to ALS as ``maxIter``.
        regParam: Value passed to ALS as ``regParam``.
        rank: Value passed to ALS as ``rank``. Defaults to 1, the value that
            used to be hard-coded, so existing calls behave as before.
        training_data: DataFrame to fit on; it must provide the
            ``customer_id``, ``item_id`` and ``star_rating`` columns. When
            omitted, the notebook-level ``training`` DataFrame is used, which
            is what this function always did before the parameter existed.

    Returns:
        The fitted ALS model. It is configured with ``coldStartStrategy="drop"``,
        ``nonnegative=True`` and a fixed ``seed=42``.
    """
    if training_data is None:
        # Backward-compatible fallback to the notebook global, resolved at call time.
        training_data = training

    als = ALS(
        maxIter=maxIter,
        regParam=regParam,
        userCol="customer_id",
        itemCol="item_id",
        ratingCol="star_rating",
        seed=42,
        nonnegative=True,
        rank=rank,
        coldStartStrategy="drop",
    )
    return als.fit(training_data)

In [80]:
# Baseline model: train_model's defaults (maxIter=5, regParam=0.1).
model = train_model()

                                                                                

In [81]:
def get_metrics(dataset, model):
    """Score ``dataset`` with ``model`` and report RMSE and MAE.

    The predictions are compared against the ``star_rating`` column. The
    number of scored rows and both metrics are printed as they are computed.

    Args:
        dataset: DataFrame to score; must contain ``star_rating`` plus the
            columns the model needs.
        model: Fitted model exposing ``transform`` and producing a
            ``prediction`` column.

    Returns:
        ``(rmse, mae)`` as a tuple.
    """
    scored = model.transform(dataset)
    print(f"Predictions count: {scored.count()}")

    def _evaluate(metric_name):
        # One evaluator per metric, always against the same scored rows.
        evaluator = RegressionEvaluator(
            metricName=metric_name, labelCol="star_rating", predictionCol="prediction"
        )
        return evaluator.evaluate(scored)

    rmse = _evaluate("rmse")
    print(f"Root-mean-square error = {rmse}")

    mae = _evaluate("mae")
    print(f"Mean absolute error = {mae}")

    return rmse, mae

In [82]:
# Evaluate the baseline model on the test split.
# NOTE(review): the printed predictions count is far below the test split's row
# count — presumably most rows are removed by coldStartStrategy="drop" because
# their item or customer is absent from training; confirm, since the metrics
# then rest on very few rows.
get_metrics(test, model)

                                                                                

Predictions count: 9


                                                                                

Root-mean-square error = 3.801666973268788


[Stage 5665:>                                                       (0 + 2) / 2]

Mean absolute error = 3.497726069556342


                                                                                

(3.801666973268788, 3.497726069556342)

In [83]:
# Evaluate the same baseline model on the validation split.
get_metrics(validation, model)

                                                                                

Predictions count: 15


                                                                                

Root-mean-square error = 2.9893307839895487




Mean absolute error = 2.451657517751058


                                                                                

(2.9893307839895487, 2.451657517751058)

In [84]:
# Hyperparameter grid: every maxIter value is combined with every regParam value.
parameters = {"maxIter": [5, 10, 15], "regParam": [0.001, 0.01, 0.1]}
param_combinations = list(itertools.product(*parameters.values()))

# One keyword-argument dict per combination, keyed by the grid's own names.
tuning_parameters = [
    dict(zip(parameters.keys(), combination)) for combination in param_combinations
]

In [85]:
# Grid search on the validation split: train one model per combination and keep
# the combination with the lowest MAE (on a tie the earliest combination wins,
# because the comparison is strict).
corresponding_rmse, best_mae, best_parameters = float("inf"), float("inf"), None
# Keep the winning model itself, not only its parameters: `model` is rebound on
# every iteration, so after the loop it is simply the last model trained.
best_model = None

for parameters_combination in tuning_parameters:
    print(f"Parameters: {parameters_combination}")
    model = train_model(**parameters_combination)
    rmse, mae = get_metrics(validation, model)
    print(f"MAE: {mae}")
    print(f"RMSE: {rmse}")
    print("-----------------------------------------")
    if mae < best_mae:
        best_mae = mae
        corresponding_rmse = rmse
        best_parameters = parameters_combination
        best_model = model

Parameters: {'maxIter': 5, 'regParam': 0.001}


                                                                                

Predictions count: 15


                                                                                

Root-mean-square error = 3.125441958371241


                                                                                

Mean absolute error = 2.5933586517969767
MAE: 2.5933586517969767
RMSE: 3.125441958371241
-----------------------------------------
Parameters: {'maxIter': 5, 'regParam': 0.01}


                                                                                

Predictions count: 15


                                                                                

Root-mean-square error = 3.1052873205422076


                                                                                

Mean absolute error = 2.576060716311137
MAE: 2.576060716311137
RMSE: 3.1052873205422076
-----------------------------------------
Parameters: {'maxIter': 5, 'regParam': 0.1}


                                                                                

Predictions count: 15


                                                                                

Root-mean-square error = 2.9893307839895487


                                                                                

Mean absolute error = 2.451657517751058
MAE: 2.451657517751058
RMSE: 2.9893307839895487
-----------------------------------------
Parameters: {'maxIter': 10, 'regParam': 0.001}


                                                                                

Predictions count: 15


                                                                                

Root-mean-square error = 3.129665271812016


                                                                                

Mean absolute error = 2.609744413693746
MAE: 2.609744413693746
RMSE: 3.129665271812016
-----------------------------------------
Parameters: {'maxIter': 10, 'regParam': 0.01}


                                                                                

Predictions count: 15


                                                                                

Root-mean-square error = 3.0813507547993453


                                                                                

Mean absolute error = 2.564514168103536
MAE: 2.564514168103536
RMSE: 3.0813507547993453
-----------------------------------------
Parameters: {'maxIter': 10, 'regParam': 0.1}


                                                                                

Predictions count: 15


                                                                                

Root-mean-square error = 2.9506697603609444


                                                                                

Mean absolute error = 2.3761456489562987
MAE: 2.3761456489562987
RMSE: 2.9506697603609444
-----------------------------------------
Parameters: {'maxIter': 15, 'regParam': 0.001}


                                                                                

Predictions count: 15


                                                                                

Root-mean-square error = 3.135276860520958


                                                                                

Mean absolute error = 2.6240694920221963
MAE: 2.6240694920221963
RMSE: 3.135276860520958
-----------------------------------------
Parameters: {'maxIter': 15, 'regParam': 0.01}


                                                                                

Predictions count: 15


                                                                                

Root-mean-square error = 3.071190668477704


                                                                                

Mean absolute error = 2.560479100545247
MAE: 2.560479100545247
RMSE: 3.071190668477704
-----------------------------------------
Parameters: {'maxIter': 15, 'regParam': 0.1}


                                                                                

Predictions count: 15


                                                                                

Root-mean-square error = 2.935168262547622


[Stage 7845:>                                                       (0 + 2) / 2]

Mean absolute error = 2.359368101755778
MAE: 2.359368101755778
RMSE: 2.935168262547622
-----------------------------------------


                                                                                

In [86]:
# Report the winner of the grid search. The MAE printed must be best_mae: `mae`
# only holds the value from the last loop iteration, which equals the best one
# only when the last combination happens to win.
print(f"Best parameters: {best_parameters}")
print(f"Best MAE: {best_mae}")
print(f"RMSE corresponding to the best MAE: {corresponding_rmse}")

Best parameters: {'maxIter': 15, 'regParam': 0.1}
Best MAE: 2.359368101755778
RMSE corresponding to the best MAE: 2.935168262547622
