In [73]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import StringIndexer
from pyspark.ml.recommendation import ALS
from pyspark.sql import SparkSession

from pyspark.sql.functions import col, row_number
from pyspark.sql.window import Window

In [74]:
# S3A settings applied to the session builder, in this order.
s3_settings = {
    "spark.jars.packages": "org.apache.hadoop:hadoop-aws:3.3.4",
    "fs.s3a.endpoint": "s3.us-east-2.amazonaws.com",
    "fs.s3a.aws.credentials.provider": (
        "com.amazonaws.auth.DefaultAWSCredentialsProviderChain"
    ),
}

builder = SparkSession.builder.appName("Collaborative Filtering")  # type: ignore
for setting_name, setting_value in s3_settings.items():
    builder = builder.config(setting_name, setting_value)

# Reuse an already-running session if there is one, otherwise start a new one.
spark = builder.getOrCreate()

In [75]:
# Show the session object as this cell's output.
spark

In [76]:
# Read the Parquet dataset from the S3 bucket into a DataFrame.
data = spark.read.format("parquet").load(
    "s3a://amazon-reviews-eafit/sample-for-model/"
)

In [77]:
# Number of rows in the loaded dataset.
data.count()

                                                                                

4530

In [78]:
# Map each product_id string to a numeric item_id, the form passed to ALS as
# itemCol.
indexer = StringIndexer(inputCol="product_id", outputCol="item_id")

# This cell overwrites `data`, so on a re-run the frame already carries an
# item_id column and the indexer would refuse to write its output column.
# Dropping it first keeps the cell re-runnable; drop() is a no-op when the
# column is absent (i.e. on the first run).
data = data.drop("item_id")
data = indexer.fit(data).transform(data)

                                                                                

In [79]:
# Rank each customer's reviews from newest (index 1) to oldest.
# product_id is a secondary sort key so that reviews sharing a review_date
# always receive the same rank. Ordering by review_date alone leaves ties in
# arbitrary order, and because this DataFrame is lazy and recomputed by every
# action, a tied row could fall into different splits between the count, fit
# and evaluation steps.
windowSpec = Window.partitionBy("customer_id").orderBy(
    col("review_date").desc(), col("product_id")
)
data = data.withColumn("index", row_number().over(windowSpec))

# Per-customer temporal split: newest review -> test, second newest ->
# validation, everything older -> training. row_number() starts at 1, so
# "== 1" selects the same rows as the previous "<= 1".
training = data.where(col("index") >= 3)
validation = data.where(col("index") == 2)
test = data.where(col("index") == 1)

In [80]:
# Size of each split, computed in training -> validation -> test order.
training_count, validation_count, test_count = (
    split.count() for split in (training, validation, test)
)

# One line per split, same text as three separate print calls.
print(
    f"Training count: {training_count}",
    f"Validation count: {validation_count}",
    f"Test count: {test_count}",
    sep="\n",
)

[Stage 1193:>                                                       (0 + 2) / 2]

Training count: 3530
Validation count: 500
Test count: 500


                                                                                

In [81]:
# Configure the ALS recommender and fit it on the training split.
als = ALS(
    # Columns holding the user, the item and the observed rating.
    userCol="customer_id",
    itemCol="item_id",
    ratingCol="star_rating",
    # Factorisation hyperparameters.
    rank=1,
    maxIter=5,
    regParam=0.01,
    nonnegative=True,
    # Rows that cannot be scored are dropped from the predictions.
    coldStartStrategy="drop",
    # Fixed seed for a repeatable fit.
    seed=42,
)

model = als.fit(training)

                                                                                

In [82]:
def get_metrics(dataset, fitted_model=None):
    """Score ``dataset`` with an ALS model and report RMSE and MAE.

    Prints the number of predictions produced, the root-mean-square error and
    the mean absolute error, comparing the ``prediction`` column against
    ``star_rating``.

    Args:
        dataset: DataFrame to score; it must contain the columns the model
            was fitted on plus ``star_rating``.
        fitted_model: Optional fitted model exposing ``transform``. When
            omitted, the notebook-level ``model`` is used, which matches the
            previous behaviour of this function.

    Returns:
        A ``(rmse, mae)`` tuple of floats.
    """
    scorer = model if fitted_model is None else fitted_model

    # The predictions feed three separate actions (count, rmse, mae). Caching
    # avoids re-running the model transform for each of them.
    predictions = scorer.transform(dataset).cache()
    try:
        print(f"Predictions count: {predictions.count()}")

        evaluator_rmse = RegressionEvaluator(
            metricName="rmse", labelCol="star_rating", predictionCol="prediction"
        )
        rmse = evaluator_rmse.evaluate(predictions)
        print(f"Root-mean-square error = {rmse}")

        evaluator_mae = RegressionEvaluator(
            metricName="mae", labelCol="star_rating", predictionCol="prediction"
        )
        mae = evaluator_mae.evaluate(predictions)
        print(f"Mean absolute error = {mae}")
    finally:
        # Release the cached predictions even if an evaluation step fails.
        predictions.unpersist()

    return rmse, mae

In [83]:
# Evaluate on the test split (each customer's most recent review).
# The model was built with coldStartStrategy="drop", so the predictions count
# printed below can be much smaller than the split size.
get_metrics(test)

                                                                                

Predictions count: 19
Root-mean-square error = 3.183111906170139
Mean absolute error = 2.3672051555232


(3.183111906170139, 2.3672051555232)

In [84]:
# Evaluate on the validation split (each customer's second most recent review).
get_metrics(validation)

                                                                                

Predictions count: 50


                                                                                

Root-mean-square error = 3.416342813933422
Mean absolute error = 2.5102251672744753


                                                                                

(3.416342813933422, 2.5102251672744753)