In [30]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import StringIndexer, IndexToString
from pyspark.ml.recommendation import ALS
from pyspark.sql import SparkSession

from pyspark.sql.functions import col, rank, expr
from pyspark.sql.window import Window
import itertools

In [31]:
# Settings applied to the session builder, in this order: the hadoop-aws
# package coordinate, the s3a endpoint, and the s3a credentials provider class.
spark_settings = {
    "spark.jars.packages": "org.apache.hadoop:hadoop-aws:3.3.4",
    "fs.s3a.endpoint": "s3.us-east-2.amazonaws.com",
    "fs.s3a.aws.credentials.provider": "com.amazonaws.auth.DefaultAWSCredentialsProviderChain",
}

session_builder = SparkSession.builder.appName("Collaborative Filtering")  # type: ignore
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

# Reuse an already-running session if there is one, otherwise start a new one.
spark = session_builder.getOrCreate()

In [32]:
# Read the Parquet file
SAMPLE_PATH = "s3a://amazon-reviews-eafit/sample-for-model/"
data = spark.read.parquet(SAMPLE_PATH)

In [33]:
# Print the column names, types and nullability of the loaded frame.
data.printSchema()

root
 |-- customer_id: integer (nullable = true)
 |-- product_id: string (nullable = true)
 |-- star_rating: float (nullable = true)
 |-- category: string (nullable = true)
 |-- review_date: string (nullable = true)
 |-- verified_purchase: string (nullable = true)
 |-- review_id: string (nullable = true)
 |-- product_title: string (nullable = true)
 |-- product_category: string (nullable = true)



In [34]:
# Number of rows in the loaded sample.
data.count()

                                                                                

4184

In [35]:
# Where the reverse mapping (numeric item_id -> original product_id) is stored.
INVERTER_PATH = "./inverter"

# ALS is given "item_id" as its item column further down, so the string
# product_id is mapped to a numeric item_id first.
indexer = StringIndexer(outputCol="item_id", inputCol="product_id")
indexer_model = indexer.fit(data)

# The fitted labels are what lets IndexToString turn an item_id back into the
# product_id string it came from.
product_labels = indexer_model.labels
inverter = IndexToString(
    labels=product_labels,
    inputCol="item_id",
    outputCol="original_item_id",
)

# Persist the inverter, replacing any copy left by a previous run.
inverter.write().overwrite().save(INVERTER_PATH)

                                                                                

In [36]:
# Apply the fitted indexer so the frame carries the numeric "item_id" column.
# NOTE(review): this rebinds `data`, so re-running the cell feeds the
# already-indexed frame back into the indexer — confirm that is safe, or
# re-run the notebook from the top.
data = indexer_model.transform(data)

In [37]:
# Load the IndexToString stage back from the path it was saved to earlier.
loaded_inverter = IndexToString.load("./inverter")

## Split de la data

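Se hace un muestreo por customer id: para cada cliente se ordenan sus productos y se aparta una fracción de ellos (`percent_items_to_mask`) para el test; el resto de sus productos queda en train. En este notebook se usa un 10% en cada división.
Se repite el proceso para obtener el conjunto de validación partiendo de la data de training.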


In [38]:
def split_data(data, percent_items_to_mask=0.2):
    """Split a ratings frame per customer into a kept part and a held-out part.

    For every ``customer_id`` the rows are ranked by ``product_id`` in
    descending order. The rows whose rank is at most
    ``int(number_of_products * percent_items_to_mask)`` are held out; all
    other rows are kept. The split is therefore driven by ``product_id``
    order, not by random sampling.

    Args:
        data: DataFrame with at least ``customer_id`` and ``product_id``.
        percent_items_to_mask: Fraction of each customer's rows to hold out.

    Returns:
        Tuple ``(training, test)``. Both frames keep the helper columns
        ``number_of_products``, ``number_of_products_to_mask`` and
        ``product_rank``.
    """
    per_customer_desc = Window.partitionBy("customer_id").orderBy(
        col("product_id").desc()
    )
    mask_quota = (col("number_of_products") * percent_items_to_mask).cast("int")

    annotated = (
        data.withColumn(
            "number_of_products", expr("count(*) over (partition by customer_id)")
        )
        .withColumn("number_of_products_to_mask", mask_quota)
        .withColumn("product_rank", rank().over(per_customer_desc))
    )

    rank_col = col("product_rank")
    quota_col = col("number_of_products_to_mask")

    # Rows ranked within the quota are held out; everything past it is kept.
    held_out = annotated.filter(rank_col <= quota_col)
    kept = annotated.filter(rank_col > quota_col)

    return kept, held_out

In [39]:
# First split: hold out 10% of each customer's products as the test set.
training, test = split_data(data, percent_items_to_mask=0.1)
# Second split: take the validation set out of the remaining training rows
# with the same 10% rule.
training, validation = split_data(training, percent_items_to_mask=0.1)

# Row counts of the three resulting sets.
training_count = training.count()
validation_count = validation.count()
test_count = test.count()

print(f"Training count: {training_count}")
print(f"Validation count: {validation_count}")
print(f"Test count: {test_count}")



Training count: 3819
Validation count: 169
Test count: 196


                                                                                

In [40]:
# Show the distinct customer ids present in the training set.
training.select(col("customer_id")).distinct().show()



+-----------+
|customer_id|
+-----------+
|      76286|
|      93131|
|     205422|
|     337529|
|     583137|
|     722290|
|     890674|
|    1176852|
|    1498833|
|    1553047|
|    1671892|
|    1735873|
|    2008691|
|    2039136|
|    2040014|
|    2127453|
|    2156883|
|    2177134|
|    2531809|
|    2561699|
+-----------+
only showing top 20 rows



                                                                                

## Validación con un usuario

Validamos cuántos datos tiene un usuario en total, cuántos quedaron en training, test y validation.


In [41]:
# Rows that customer 76286 has in the full data set.
data.filter(col("customer_id") == 76286).count()

                                                                                

59

In [42]:
# Rows that customer 76286 has in the training set.
training.filter(col("customer_id") == 76286).count()

                                                                                

49

In [43]:
# Rows that customer 76286 has in the validation set.
validation.filter(col("customer_id") == 76286).count()

                                                                                

5

In [44]:
# Rows that customer 76286 has in the test set.
test.filter(col("customer_id") == 76286).count()

                                                                                

5

## Función para entrenar el modelo

Esta función recibe los hiperparámetros `maxIter` y `regParam`, ya que más adelante se hará un grid search para encontrar la mejor combinación de ellos.


In [45]:
def train_model(maxIter=5, regParam=0.1, rank=1, train_df=None):
    """Fit an ALS collaborative-filtering model.

    Args:
        maxIter: Value passed to ALS as ``maxIter``.
        regParam: Value passed to ALS as ``regParam``.
        rank: Value passed to ALS as ``rank``. Defaults to 1, the value that
            was previously hard-coded, so existing calls behave the same.
        train_df: DataFrame to fit on. When ``None`` (the default) the
            notebook-level ``training`` frame is used, as before.

    Returns:
        The fitted ALS model.
    """
    # Fall back to the notebook-level training split so that
    # train_model() and train_model(maxIter=..., regParam=...) keep working.
    if train_df is None:
        train_df = training

    als = ALS(
        maxIter=maxIter,
        regParam=regParam,
        userCol="customer_id",
        itemCol="item_id",
        ratingCol="star_rating",
        seed=42,  # fixed seed so repeated fits are comparable
        nonnegative=True,
        # The parameter is named like the ALS argument so it can be supplied
        # through **kwargs; it only shadows the imported rank() inside this
        # function, where rank() is not used.
        rank=rank,
        coldStartStrategy="drop",
    )
    return als.fit(train_df)

In [46]:
# Baseline model trained with the function's default arguments.
model = train_model()

24/05/18 15:32:56 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/05/18 15:32:56 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS


In [47]:
def get_metrics(dataset, model):
    """Score ``dataset`` with ``model`` and report RMSE and MAE.

    Prints the number of prediction rows, the RMSE and the MAE, in that
    order, comparing the ``prediction`` column against ``star_rating``.

    Args:
        dataset: DataFrame to score; must contain the columns the model needs
            plus ``star_rating``.
        model: Fitted model exposing ``transform``.

    Returns:
        Tuple ``(rmse, mae)``.
    """
    # The predictions are consumed by three separate actions (count and two
    # evaluations). Caching them avoids recomputing model.transform each time.
    predictions = model.transform(dataset).cache()
    try:
        print(f"Predictions count: {predictions.count()}")

        evaluator_rmse = RegressionEvaluator(
            metricName="rmse", labelCol="star_rating", predictionCol="prediction"
        )
        rmse = evaluator_rmse.evaluate(predictions)
        print(f"Root-mean-square error = {rmse}")

        evaluator_mae = RegressionEvaluator(
            metricName="mae", labelCol="star_rating", predictionCol="prediction"
        )
        mae = evaluator_mae.evaluate(predictions)
        print(f"Mean absolute error = {mae}")
    finally:
        # Release the cached predictions even if an evaluation fails.
        predictions.unpersist()

    return rmse, mae

In [48]:
# Metrics of the baseline model on the test set.
get_metrics(test, model)

Predictions count: 3
Root-mean-square error = 4.358898943540674
Mean absolute error = 4.333333333333333


                                                                                

(4.358898943540674, 4.333333333333333)

In [49]:
# Metrics of the baseline model on the validation set.
get_metrics(validation, model)

                                                                                

Predictions count: 3
Root-mean-square error = 2.998836596938872


[Stage 348:>                                                        (0 + 2) / 2]

Mean absolute error = 2.3298346201578775


                                                                                

(2.998836596938872, 2.3298346201578775)

In [50]:
# Hyperparameter grid: each key is a train_model keyword argument and each
# value is the list of candidates to try for it.
parameters = {"maxIter": [5, 10, 15], "regParam": [0.001, 0.01, 0.1]}

# Cartesian product of the candidate lists, in the key order of `parameters`.
param_combinations = list(itertools.product(*parameters.values()))

# One kwargs dict per combination. The keys come from `parameters` itself, so
# adding another hyperparameter to the grid needs no change here.
tuning_parameters = [
    dict(zip(parameters, combination)) for combination in param_combinations
]

In [51]:
# Grid search: train one model per combination and keep the one with the
# lowest MAE on the validation set.
corresponding_rmse, best_mae, best_parameters = float("inf"), float("inf"), None
# Fitted model for the best combination. `model` is overwritten on every
# iteration, so after the loop it is the last model trained, not the best.
best_model = None

for parameters_combination in tuning_parameters:
    print(f"Parameters: {parameters_combination}")
    model = train_model(**parameters_combination)
    rmse, mae = get_metrics(validation, model)
    print("-----------------------------------------")
    if mae < best_mae:
        best_mae = mae
        corresponding_rmse = rmse
        best_parameters = parameters_combination
        best_model = model

Parameters: {'maxIter': 5, 'regParam': 0.001}


                                                                                

Predictions count: 3


                                                                                

Root-mean-square error = 2.9999550755487627


                                                                                

Mean absolute error = 2.3331985473632812
-----------------------------------------
Parameters: {'maxIter': 5, 'regParam': 0.01}


                                                                                

Predictions count: 3


                                                                                

Root-mean-square error = 2.999589635741288


                                                                                

Mean absolute error = 2.3321011861165366
-----------------------------------------
Parameters: {'maxIter': 5, 'regParam': 0.1}


                                                                                

Predictions count: 3


                                                                                

Root-mean-square error = 2.998836596938872


                                                                                

Mean absolute error = 2.3298346201578775
-----------------------------------------
Parameters: {'maxIter': 10, 'regParam': 0.001}


                                                                                

Predictions count: 3


                                                                                

Root-mean-square error = 2.99984523680335


                                                                                

Mean absolute error = 2.3328688939412436
-----------------------------------------
Parameters: {'maxIter': 10, 'regParam': 0.01}


                                                                                

Predictions count: 3


                                                                                

Root-mean-square error = 2.998530045163354


                                                                                

Mean absolute error = 2.3289098739624023
-----------------------------------------
Parameters: {'maxIter': 10, 'regParam': 0.1}


                                                                                

Predictions count: 3


                                                                                

Root-mean-square error = 2.988458761076085


                                                                                

Mean absolute error = 2.297830899556478
-----------------------------------------
Parameters: {'maxIter': 15, 'regParam': 0.001}


                                                                                

Predictions count: 3


                                                                                

Root-mean-square error = 2.9997355542234994


                                                                                

Mean absolute error = 2.3325395584106445
-----------------------------------------
Parameters: {'maxIter': 15, 'regParam': 0.01}


                                                                                

Predictions count: 3


                                                                                

Root-mean-square error = 2.997462281379828


                                                                                

Mean absolute error = 2.3256794611612954
-----------------------------------------
Parameters: {'maxIter': 15, 'regParam': 0.1}


                                                                                

Predictions count: 3


                                                                                

Root-mean-square error = 2.9773260849300396
Mean absolute error = 2.2617225646972656
-----------------------------------------


                                                                                

In [52]:
# Report the winning combination. `best_mae` is the tracked minimum; `mae`
# only holds the value from the last grid-search iteration.
print(f"Best parameters: {best_parameters}")
print(f"Best MAE: {best_mae}")
print(f"RMSE corresponding to the best MAE: {corresponding_rmse}")

Best parameters: {'maxIter': 15, 'regParam': 0.1}
Best MAE: 2.2617225646972656
RMSE corresponding to the best MAE: 2.9773260849300396
