There are three main types of Recommender Systems:

1. Collaborative Filtering: Based on user-item interactions.
- User-based filtering: Recommends items liked by similar users.
- Item-based filtering: Recommends similar items to what a user has interacted with.
Techniques: k-Nearest Neighbors (k-NN), Matrix Factorization (SVD, ALS).
2. Content-Based Filtering: Recommends items based on item features (e.g., genre, description).
- Uses TF-IDF, word embeddings (e.g., Word2Vec, BERT), or deep learning.
3. Hybrid Approaches: Combines collaborative and content-based filtering.
Example: Netflix recommends movies based on both user preferences and movie metadata.

In [1]:
import sys
sys.executable

'/opt/bitnami/python/bin/python3'

In [2]:
# Choose the ipykernel to the jupyter running in your spark-master node
import sys

from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, posexplode, col
from pyspark.sql.functions import broadcast
from pyspark import SparkFiles
from pyspark.ml.recommendation import ALS
from pyspark.sql.types import (
    StructField,
    IntegerType,
    DoubleType,
    StringType,
    StructType,
)

In [3]:
spark = SparkSession.builder.appName("moviesOneMillion").master("spark://spark-master:7077").getOrCreate()
sc = spark.sparkContext

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/24 04:01:33 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Read data

Data is shared with the docker through shared volume mapping (data folder)

In [4]:
rating_schema = StructType([StructField("userId", IntegerType(), False), 
                            StructField("movieId", IntegerType(), True), 
                            StructField("rating" , IntegerType(), True),
                            StructField("timestamp", IntegerType(), True)])
rating = spark.read.csv("/data/ml-1m/ratings.dat", sep="::", schema=rating_schema)

rating.show(2)

                                                                                

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|   1193|     5|978300760|
|     1|    661|     3|978302109|
+------+-------+------+---------+
only showing top 2 rows



25/03/24 13:43:31 ERROR TaskSchedulerImpl: Lost executor 1 on 172.19.0.5: worker lost: Not receiving heartbeat for 60 seconds
25/03/24 13:43:31 ERROR TaskSchedulerImpl: Lost executor 0 on 172.19.0.4: worker lost: Not receiving heartbeat for 60 seconds
25/03/25 16:40:55 ERROR TaskSchedulerImpl: Lost executor 2 on 172.19.0.4: worker lost: Not receiving heartbeat for 60 seconds
25/03/25 16:40:55 ERROR TaskSchedulerImpl: Lost executor 3 on 172.19.0.5: worker lost: Not receiving heartbeat for 60 seconds


In [5]:
movies_schema = StructType(
    [
        StructField("id", IntegerType(), True),
        StructField("title", StringType(), True),
        StructField("genres", StringType(), True),
    ]
)
movies = spark.read.csv(f"/data/ml-1m/movies.dat", sep="::", schema=movies_schema)
movies.show(2)

+---+----------------+--------------------+
| id|           title|              genres|
+---+----------------+--------------------+
|  1|Toy Story (1995)|Animation|Childre...|
|  2|  Jumanji (1995)|Adventure|Childre...|
+---+----------------+--------------------+
only showing top 2 rows



In [None]:
users_schema = StructType([StructField("id", IntegerType(), False), 
                           StructField("gender", StringType(), True),
                           StructField("age", IntegerType(), True),
                           StructField("occupation", StringType(), True),
                           StructField("zip", IntegerType(), True)])
users = spark.read.schema(schema=users_schema).csv(
    f"/data/ml-1m//users.dat", sep="::", schema=users_schema)
users.show(2)

+---+------+---+----------+-----+
| id|gender|age|occupation|  zip|
+---+------+---+----------+-----+
|  1|     F|  1|        10|48067|
|  2|     M| 56|        16|70072|
+---+------+---+----------+-----+
only showing top 2 rows



                                                                                

# ALS Recommender
ALS aims to find two matrices, 
- User Matrix (U): Represents users as latent (hidden) feature vectors.
- Item Matrix (V): Represents items as latent feature vectors.</br>
their product (U * P) approximates the original user-item rating matrix (R) </br>
ALS fixes one matrix (e.g., U) and solves for the other (V), then alternates. 
ALS minimizes the error between predicted ratings (U × Vᵀ) and actual ratings

In [8]:
(train_data, test_data) = rating.randomSplit([0.8, 0.2], seed=42)

In [None]:
als = ALS(
    userCol="userId", 
    itemCol="movieId", 
    ratingCol="rating", 
    maxIter=10, # more iterations, more accurate, but slower and more prone to overfitting
    regParam=0.1, # regularization parameter to prevent overfitting
    rank=10, # number of latent factors
    coldStartStrategy="drop" # drop users and items with less than 20 ratings
)

model = als.fit(train_data) 

                                                                                

In [10]:
from pyspark.ml.evaluation import RegressionEvaluator

# Make predictions
predictions = model.transform(test_data)

# Evaluate RMSE
evaluator = RegressionEvaluator(
    metricName="rmse", 
    labelCol="rating", 
    predictionCol="prediction"
)
rmse = evaluator.evaluate(predictions)
print(f"RMSE: {rmse:.4f}")

                                                                                

RMSE: 0.8683


In [20]:
user_recs = model.recommendForAllUsers(3)
user_recs.select(
    "userId",
    col("recommendations")[0]["movieId"].alias("recommendation1"),
    col("recommendations")[0]["rating"].alias("recommendation1_rating"),
    col("recommendations")[1]["movieId"].alias("recommendation2"),
    col("recommendations")[1]["rating"].alias("recommendation2_rating"),
    col("recommendations")[2]["movieId"].alias("recommendation3"),
    col("recommendations")[2]["rating"].alias("recommendation3_rating"),
).show(3)



+------+---------------+----------------------+---------------+----------------------+---------------+----------------------+
|userId|recommendation1|recommendation1_rating|recommendation2|recommendation2_rating|recommendation3|recommendation3_rating|
+------+---------------+----------------------+---------------+----------------------+---------------+----------------------+
|     1|           3233|              4.598261|            128|             4.5023494|            527|             4.4881306|
|    12|           2309|             4.6853147|            598|             4.4963436|           1039|             4.4958973|
|    22|           2309|              4.366799|            989|             4.0954137|           1169|               4.07395|
+------+---------------+----------------------+---------------+----------------------+---------------+----------------------+
only showing top 3 rows



                                                                                

In [24]:
user_recs_ex = user_recs.select(
    "userId",
    posexplode("recommendations").alias("pos", "rec")
).select("userId", "pos", col("rec.movieId").alias("movieId"), col("rec.rating").alias("rating"))
user_recs_ex.show(6)




+------+---+-------+---------+
|userId|pos|movieId|   rating|
+------+---+-------+---------+
|     1|  0|   3233| 4.598261|
|     1|  1|    128|4.5023494|
|     1|  2|    527|4.4881306|
|    12|  0|   2309|4.6853147|
|    12|  1|    598|4.4963436|
|    12|  2|   1039|4.4958973|
+------+---+-------+---------+
only showing top 6 rows



                                                                                

In [None]:
user_rec_with_movie_names = user_recs_ex.join(
    broadcast(movies), # each node performs operations locally using broadcasted DF without shuffle data across the network
    col("movieId") == col("id"), "inner"
)
user_rec_with_movie_names.show(6)

                                                                                

+------+---+-------+---------+----+--------------------+-----------+
|userId|pos|movieId|   rating|  id|               title|     genres|
+------+---+-------+---------+----+--------------------+-----------+
|     1|  0|   3233| 4.598261|3233|Smashing Time (1967)|     Comedy|
|     1|  1|    128|4.5023494| 128|Jupiter's Wife (1...|Documentary|
|     1|  2|    527|4.4881306| 527|Schindler's List ...|  Drama|War|
|    12|  0|   2309|4.6853147|2309|Inheritors, The (...|      Drama|
|    12|  1|    598|4.4963436| 598|Window to Paris (...|     Comedy|
|    12|  2|   1039|4.4958973|1039|Synthetic Pleasur...|Documentary|
+------+---+-------+---------+----+--------------------+-----------+
only showing top 6 rows

