In [1]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import StringIndexer
from pyspark.ml.recommendation import ALS
from pyspark.sql import SparkSession

from pyspark.sql.functions import col, row_number
from pyspark.sql.window import Window
import itertools

In [2]:
spark = (
    SparkSession.builder.appName("Collaborative Filtering")  # type: ignore
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.3.4")
    .config("fs.s3a.endpoint", "s3.us-east-2.amazonaws.com")
    .config(
        "fs.s3a.aws.credentials.provider",
        "com.amazonaws.auth.DefaultAWSCredentialsProviderChain",
    )
    .getOrCreate()
)

24/05/15 01:22:27 WARN Utils: Your hostname, Davids-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.1.4 instead (on interface en0)
24/05/15 01:22:27 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Ivy Default Cache set to: /Users/david/.ivy2/cache
The jars for the packages stored in: /Users/david/.ivy2/jars
org.apache.hadoop#hadoop-aws added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-d69abec9-c036-4f84-8968-c30451dbcd91;1.0
	confs: [default]


:: loading settings :: url = jar:file:/Users/david/eafit/proyecto-integrador-semestre-2/.venv/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


	found org.apache.hadoop#hadoop-aws;3.3.4 in central
	found com.amazonaws#aws-java-sdk-bundle;1.12.262 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
:: resolution report :: resolve 122ms :: artifacts dl 4ms
	:: modules in use:
	com.amazonaws#aws-java-sdk-bundle;1.12.262 from central in [default]
	org.apache.hadoop#hadoop-aws;3.3.4 from central in [default]
	org.wildfly.openssl#wildfly-openssl;1.0.7.Final from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   3   |   0   |   0   |   0   ||   3   |   0   |
	---------------------------------------------------------------------
:: retrieving :: org.apache.spark#spark-submit-parent-d69abec9-c036-4f84-8968-c30451dbcd91
	confs: [default]


In [3]:
spark

In [4]:
# Leer el archivo Parquet
data = spark.read.parquet("s3a://amazon-reviews-eafit/sample-for-model/")

24/05/15 01:22:30 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
                                                                                

In [5]:
data.count()

                                                                                

4184

In [6]:
indexer = StringIndexer(inputCol="product_id", outputCol="item_id")

data = indexer.fit(data).transform(data)

                                                                                

## Split de la data

Se asegura que haya un dato en test, uno en validation y uno en training


In [7]:
def split_data(data):
    windowSpec = Window.partitionBy("customer_id").orderBy(col("review_date").desc())
    data = data.withColumn("index", row_number().over(windowSpec))

    training = data.where(col("index") >= 3)
    validation = data.where(col("index") == 2)
    test = data.where(col("index") <= 1)

    return training, validation, test

In [8]:
training, validation, test = split_data(data)

training_count = training.count()
validation_count = validation.count()
test_count = test.count()

print(f"Training count: {training_count}")
print(f"Validation count: {validation_count}")
print(f"Test count: {test_count}")

24/05/15 01:22:40 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors

Training count: 3184
Validation count: 500
Test count: 500


                                                                                

In [9]:
def train_model(maxIter=5, regParam=0.01):
    als = ALS(
        maxIter=maxIter,
        regParam=regParam,
        userCol="customer_id",
        itemCol="item_id",
        ratingCol="star_rating",
        seed=42,
        nonnegative=True,
        rank=1,
        coldStartStrategy="drop",
    )
    model = als.fit(training)
    return model

In [10]:
model = train_model()

24/05/15 01:22:45 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/05/15 01:22:45 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS


In [11]:
def get_metrics(dataset, model):
    predictions = model.transform(dataset)

    print(f"Predictions count: {predictions.count()}")
    evaluator_rmse = RegressionEvaluator(
        metricName="rmse", labelCol="star_rating", predictionCol="prediction"
    )
    rmse = evaluator_rmse.evaluate(predictions)
    print(f"Root-mean-square error = {rmse}")

    evaluator_mae = RegressionEvaluator(
        metricName="mae", labelCol="star_rating", predictionCol="prediction"
    )
    mae = evaluator_mae.evaluate(predictions)
    print(f"Mean absolute error = {mae}")

    return rmse, mae

In [12]:
get_metrics(test, model)

Predictions count: 20


                                                                                

Root-mean-square error = 3.434845378540311
Mean absolute error = 2.800789248943329


(3.434845378540311, 2.800789248943329)

In [13]:
get_metrics(validation, model)

                                                                                

Predictions count: 35


                                                                                

Root-mean-square error = 3.472402990144845




Mean absolute error = 2.692521149771554


                                                                                

(3.472402990144845, 2.692521149771554)

In [14]:
parameters = {"maxIter": [5, 10, 15], "regParam": [0.001, 0.01, 0.1]}
param_combinations = list(itertools.product(*parameters.values()))
tuning_parameters = [
    {"maxIter": maxIter, "regParam": regParam}
    for maxIter, regParam in param_combinations
]

In [15]:
corresponding_rmse, best_mae, best_parameters = float("inf"), float("inf"), None

for parameters_combination in tuning_parameters:
    print(f"Parameters: {parameters_combination}")
    model = train_model(**parameters_combination)
    rmse, mae = get_metrics(validation, model)
    print("-----------------------------------------")
    if mae < best_mae:
        best_mae = mae
        corresponding_rmse = rmse
        best_parameters = parameters_combination

Parameters: {'maxIter': 5, 'regParam': 0.001}


                                                                                

Predictions count: 35




Root-mean-square error = 3.4723268654666444


                                                                                

Mean absolute error = 2.6864193643842427
-----------------------------------------
Parameters: {'maxIter': 5, 'regParam': 0.01}


                                                                                

Predictions count: 35


                                                                                

Root-mean-square error = 3.472402990144845


                                                                                

Mean absolute error = 2.692521149771554
-----------------------------------------
Parameters: {'maxIter': 5, 'regParam': 0.1}


                                                                                

Predictions count: 35


                                                                                

Root-mean-square error = 3.4811227831561924


                                                                                

Mean absolute error = 2.7380862985338483
-----------------------------------------
Parameters: {'maxIter': 10, 'regParam': 0.001}


                                                                                

Predictions count: 35


                                                                                

Root-mean-square error = 3.472320518248987


                                                                                

Mean absolute error = 2.68704218183245
-----------------------------------------
Parameters: {'maxIter': 10, 'regParam': 0.01}


                                                                                

Predictions count: 35


                                                                                

Root-mean-square error = 3.472802718138904


                                                                                

Mean absolute error = 2.6977427005767822
-----------------------------------------
Parameters: {'maxIter': 10, 'regParam': 0.1}


                                                                                

Predictions count: 35


                                                                                

Root-mean-square error = 3.4884195025409297


                                                                                

Mean absolute error = 2.752918829236712
-----------------------------------------
Parameters: {'maxIter': 15, 'regParam': 0.001}


                                                                                

Predictions count: 35


                                                                                

Root-mean-square error = 3.4723185102317573


                                                                                

Mean absolute error = 2.687655360358102
-----------------------------------------
Parameters: {'maxIter': 15, 'regParam': 0.01}


                                                                                

Predictions count: 35


                                                                                

Root-mean-square error = 3.473400359535379


                                                                                

Mean absolute error = 2.702267060961042
-----------------------------------------
Parameters: {'maxIter': 15, 'regParam': 0.1}


                                                                                

Predictions count: 35
Root-mean-square error = 3.4929537529881416


[Stage 2323:>                                                       (0 + 2) / 2]

Mean absolute error = 2.759358583177839
-----------------------------------------


                                                                                

In [16]:
print(f"Best parameters: {best_parameters}")
print(f"Best MAE: {mae}")
print(f"RMSE corresponding to the best MAE: {corresponding_rmse}")

Best parameters: {'maxIter': 5, 'regParam': 0.001}
Best MAE: 2.759358583177839
RMSE corresponding to the best MAE: 3.4723268654666444
