In [1]:
import sys
import pyspark
from pyspark.ml.recommendation import ALS
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import StringType, FloatType, IntegerType, LongType
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from recommenders.utils.timer import Timer
from recommenders.datasets import movielens
from recommenders.utils.notebook_utils import is_jupyter
from recommenders.datasets.spark_splitters import spark_random_split
from recommenders.evaluation.spark_evaluation import SparkRatingEvaluation, SparkRankingEvaluation
from recommenders.utils.spark_utils import start_or_get_spark

# Report the Python interpreter and PySpark versions used for this run.
print(f"System version: {sys.version}")
print(f"Spark version: {pyspark.__version__}")

System version: 3.9.15 | packaged by conda-forge | (main, Nov 22 2022, 08:48:25) 
[Clang 14.0.6 ]
Spark version: 3.3.1


In [2]:
# Number of top-ranked items (K) to recommend; also passed as the cutoff `k`
# to the ranking evaluation later in the notebook.
TOP_K = 10

<h1>Set up Spark context</h1>

In [3]:
# The following settings work well for debugging locally on a VM - change them when running on a cluster.
# Set up a giant single executor with many threads and specify the memory cap ("10g" here).
spark = start_or_get_spark("ALS PySpark", memory="10g")
# Turn off the analyzer check that fails queries containing ambiguous self-join column references.
# NOTE(review): presumably needed by the seen-item join later in this notebook, whose condition
# compares columns of DataFrames derived from the same source - confirm before removing.
spark.conf.set("spark.sql.analyzer.failAmbiguousSelfJoin", "false")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/02/21 05:09:12 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


<h1>Load the MovieLens dataset</h1>

In [4]:
# Names of the dataset columns, reused for the schema, the ALS estimator and the evaluators.
COL_USER, COL_ITEM, COL_RATING, COL_TIMESTAMP = (
    "UserId",
    "ItemId",
    "Rating",
    "Timestamp",
)

In [5]:
# Note: the DataFrame-based API for ALS currently only supports integer user and item ids,
# so both id columns are read as IntegerType.
schema = StructType(
    [
        StructField(col_name, col_type)
        for col_name, col_type in (
            (COL_USER, IntegerType()),
            (COL_ITEM, IntegerType()),
            (COL_RATING, FloatType()),
            (COL_TIMESTAMP, LongType()),
        )
    ]
)

# Both splits are comma-separated CSV files with a header row and the same explicit schema.
csv_options = dict(format="csv", header="true", sep=',', schema=schema)

train = spark.read.load('../data/sas/train.csv', **csv_options)
test = spark.read.load('../data/sas/test.csv', **csv_options)

In [6]:
# Mark each split as cached, then count its rows; the count() action triggers the read.
n_train = train.cache().count()
print("N train", n_train)

n_test = test.cache().count()
print("N test", n_test)

N train 67791
N test 22487


<h1>Train ALS</h1>

In [7]:
# Column-name arguments that tell ALS where to find users, items and ratings.
header = dict(userCol=COL_USER, itemCol=COL_ITEM, ratingCol=COL_RATING)

# Explicit-feedback ALS estimator with a fixed seed; coldStartStrategy='drop'
# is passed so that rows the model cannot score are dropped from its output.
als = ALS(
    **header,
    rank=10,
    maxIter=15,
    regParam=0.05,
    implicitPrefs=False,
    nonnegative=False,
    coldStartStrategy='drop',
    seed=42,
)

In [8]:
# Fit the ALS model on the training split and record how long the fit takes.
with Timer() as train_time:
    model = als.fit(train)

print(f"Took {train_time.interval} seconds for training.")

23/02/21 05:09:23 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
23/02/21 05:09:23 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS
23/02/21 05:09:23 WARN InstanceBuilder$NativeLAPACK: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK
Took 2.8991458329999986 seconds for training.


- In the movie recommendation use case, recommending movies that a user has already rated does not make sense, because the user has already seen them.

- Therefore, the rated movies are removed from the recommended items.

- In order to achieve this, we recommend all movies to all users, and then remove the user-movie pairs that exist in the training dataset.

In [9]:
with Timer() as test_time:

    # Build every (user, item) combination seen in training and score all of them.
    users = train.select(COL_USER).distinct()
    items = train.select(COL_ITEM).distinct()
    user_item = users.crossJoin(items)
    dfs_pred = model.transform(user_item)

    # Remove seen items with a left anti join keyed on the column *names*.
    # `dfs_pred` is derived from `train`, so a condition such as
    # `dfs_pred[COL_USER] == train[COL_USER]` compares a column with itself
    # (Spark warns about a "trivially true equals predicate") and only works
    # through Spark's self-join auto-resolution. Joining on names avoids that,
    # and the anti join keeps exactly the scored pairs that have no row in
    # `train`, without materialising a full outer join first.
    dfs_pred_exclude_train = dfs_pred.join(
        train.select(COL_USER, COL_ITEM),
        on=[COL_USER, COL_ITEM],
        how="left_anti",
    )

    # Keep only the columns the ranking evaluation needs.
    top_all = dfs_pred_exclude_train.select(COL_USER, COL_ITEM, "prediction")

    # Spark transformations are lazy: run an action so the scoring and the join
    # actually execute inside the timed block, and cache the result for reuse.
    top_all.cache().count()

print("Took {} seconds for prediction.".format(test_time.interval))

23/02/21 05:09:27 WARN Column: Constructing trivially true equals predicate, 'UserId#0 = UserId#0'. Perhaps you need to use aliases.




Took 16.707953791999998 seconds for prediction.


                                                                                

In [10]:
# Preview the first 20 rows of the unseen-item predictions.
top_all.show(n=20)

+------+------+----------+
|UserId|ItemId|prediction|
+------+------+----------+
|     0|   190|0.89229834|
|     0|   632| 1.0106795|
|     0|   716|0.94243544|
|     0|   915| 1.0021477|
|     0|  1218| 1.0117779|
|     0|  1237| 1.2197762|
|     0|  1265|0.94778615|
|     0|  1327| 1.8397956|
|     0|  1478|0.94713295|
|     0|  1578| 1.0124902|
|     0|  1761| 0.9968778|
|     0|  1790| 1.2600079|
|     0|  1866|0.92909086|
|     0|  2248| 1.0387272|
|     1|   587|  1.405763|
|     1|   869| 1.1937526|
|     1|  1208|0.24591506|
|     1|  1348| 3.1282556|
|     1|  1357| 1.7986736|
|     1|  1677| 1.3892138|
+------+------+----------+
only showing top 20 rows



<h1>Evaluate how well ALS performs</h1>

In [11]:
# The ranking evaluator needs every unseen-item prediction to compute the
# Top-K metrics (MAP, NDCG, precision, recall) against the test split.
rank_eval = SparkRankingEvaluation(
    test,
    top_all,
    k=TOP_K,
    relevancy_method="top_k",
    col_user=COL_USER,
    col_item=COL_ITEM,
    col_rating=COL_RATING,
    col_prediction="prediction",
)

                                                                                

In [12]:
# Compute the ranking metrics and print them one per line, tab-separated.
ranking_report = [
    "Model:\tALS",
    f"Top K:\t{rank_eval.k:d}",
    f"MAP:\t{rank_eval.map_at_k():f}",
    f"NDCG:\t{rank_eval.ndcg_at_k():f}",
    f"Precision@K:\t{rank_eval.precision_at_k():f}",
    f"Recall@K:\t{rank_eval.recall_at_k():f}",
]
print("\n".join(ranking_report))

[Stage 503:>                                                        (0 + 8) / 9]

Model:	ALS
Top K:	10
MAP:	0.001836
NDCG:	0.004587
Precision@K:	0.002993
Recall@K:	0.006066


                                                                                

<h1>Evaluate rating prediction</h1>

In [13]:
# Score the (user, item) pairs of the test split, cache the result for the
# rating evaluation, and preview the first rows.
prediction = model.transform(test).cache()
prediction.show()

                                                                                

+------+------+------+----------+----------+
|UserId|ItemId|Rating| Timestamp|prediction|
+------+------+------+----------+----------+
|  2580|   148|   1.0|1497895920| 1.4229584|
|  2572|   148|   1.0|1497838440| 1.1401534|
|  5490|   148|   1.0|1494433680| 1.4418218|
|  3287|   148|   1.0|1495538100| 1.5095295|
|  3587|   148|   3.0|1498161420| 3.8146977|
|  4526|   148|   1.0|1496456760| 1.0836066|
|   122|   148|   1.0|1497617580| 0.8089223|
|  4280|   148|   1.0|1496850360| 1.6755807|
|  4839|   148|   2.0|1498768860|  2.069603|
|  2824|   148|   1.0|1495063440|  1.634411|
|  2150|   148|   2.0|1497262380| 2.4815428|
|  1901|   148|   6.0|1496875560|  9.204268|
|  2019|   148|   1.0|1497797160| 0.9409647|
|  5470|   148|   4.0|1498070340|  2.799487|
|  1720|   148|   2.0|1496330760| 1.9125175|
|  2598|   148|   1.0|1496922600| 1.2166462|
|   900|   148|   6.0|1497556140| 5.4827785|
|   794|   148|   1.0|1498646520| 3.5094445|
|  3202|   148|   1.0|1498859820| 1.5548692|
|  4156|  

In [14]:
# Compare the predicted ratings with the true ratings of the test split.
rating_eval = SparkRatingEvaluation(
    test,
    prediction,
    col_user=COL_USER,
    col_item=COL_ITEM,
    col_rating=COL_RATING,
    col_prediction="prediction",
)

# Compute the rating metrics and print them one per line, tab-separated.
rating_report = [
    "Model:\tALS rating prediction",
    f"RMSE:\t{rating_eval.rmse():f}",
    f"MAE:\t{rating_eval.mae():f}",
    f"Explained variance:\t{rating_eval.exp_var():f}",
    f"R squared:\t{rating_eval.rsquared():f}",
]
print("\n".join(rating_report))

                                                                                

Model:	ALS rating prediction
RMSE:	1.841608
MAE:	0.765783
Explained variance:	0.476278
R squared:	0.464402


In [15]:
# NOTE(review): this branch runs only when is_jupyter() is false, yet the comment
# below says the results are recorded for papermill-driven tests - confirm that
# the negated condition is intentional.
if not is_jupyter():
    # Record results with papermill for tests
    import scrapbook as sb

    # Metric name -> bound method that computes it; each is evaluated and
    # recorded in this order.
    metric_getters = (
        ("map", rank_eval.map_at_k),
        ("ndcg", rank_eval.ndcg_at_k),
        ("precision", rank_eval.precision_at_k),
        ("recall", rank_eval.recall_at_k),
        ("rmse", rating_eval.rmse),
        ("mae", rating_eval.mae),
        ("exp_var", rating_eval.exp_var),
        ("rsquared", rating_eval.rsquared),
    )
    for metric_name, compute_metric in metric_getters:
        sb.glue(metric_name, compute_metric())

    # Wall-clock timings captured by the Timer blocks.
    sb.glue("train_time", train_time.interval)
    sb.glue("test_time", test_time.interval)

In [16]:
# Clean up: stop the Spark session now that all results have been computed.
spark.stop()