In [1]:
!pip install pyspark numpy

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
 .appName("MovieRecommender") \
 .master("spark://spark-master:7077") \
 .config("spark.jars.packages", "org.postgresql:postgresql:42.7.1") \
 .config("spark.driver.memory", "2g") \
 .config("spark.executor.memory", "3g") \
 .config("spark.driver.host", "recommender-jupyter") \
 .config("spark.driver.bindAddress", "0.0.0.0") \
 .getOrCreate()

jdbc_url = "jdbc:postgresql://postgres:5432/recommender"
properties = {
    "user": "recommender",
    "password": "recommender",
    "driver": "org.postgresql.Driver"
}

:: loading settings :: url = jar:file:/app/.venv/lib/python3.12/site-packages/pyspark/jars/ivy-2.5.3.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /root/.ivy2.5.2/cache
The jars for the packages stored in: /root/.ivy2.5.2/jars
org.postgresql#postgresql added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-a2e4e618-82ea-4514-8445-d95e541fdbc2;1.0
	confs: [default]
	found org.postgresql#postgresql;42.7.1 in central
	found org.checkerframework#checker-qual;3.41.0 in central
:: resolution report :: resolve 58ms :: artifacts dl 2ms
	:: modules in use:
	org.checkerframework#checker-qual;3.41.0 from central in [default]
	org.postgresql#postgresql;42.7.1 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	-----------------------------------------

In [3]:
# Load tables
ratings = spark.read.jdbc(
    jdbc_url, "movielens.ratings", properties=properties,
    column="user_id", lowerBound=1, upperBound=300000, numPartitions=10
)
movies = spark.read.jdbc(jdbc_url, "movielens.movies", properties=properties)

ratings.show(5)
movies.show(10)

[Stage 0:>                                                          (0 + 1) / 1]

+-------+--------+------+-------------------+
|user_id|movie_id|rating|   rating_timestamp|
+-------+--------+------+-------------------+
|    441|    5418|   3.5|2005-02-15 23:53:58|
|    441|    5464|   3.5|2005-07-05 19:14:13|
|    441|    5989|   3.5|2005-01-27 20:06:56|
|    441|    5995|   4.5|2005-01-27 19:56:22|
|    441|    6539|   2.5|2005-01-27 20:00:11|
+-------+--------+------+-------------------+
only showing top 5 rows
+--------+--------------------+--------------------+
|movie_id|               title|              genres|
+--------+--------------------+--------------------+
|       1|    Toy Story (1995)|Adventure|Animati...|
|       2|      Jumanji (1995)|Adventure|Childre...|
|       3|Grumpier Old Men ...|      Comedy|Romance|
|       4|Waiting to Exhale...|Comedy|Drama|Romance|
|       5|Father of the Bri...|              Comedy|
|       6|         Heat (1995)|Action|Crime|Thri...|
|       7|      Sabrina (1995)|      Comedy|Romance|
|       8| Tom and Huck (1995)| 

                                                                                

In [4]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

# Prepare data
(training, test) = ratings.randomSplit([0.7, 0.3], seed=42)

# Build ALS model
als = ALS(
    maxIter=10,
    regParam=0.1,
    userCol="user_id",
    itemCol="movie_id",
    ratingCol="rating",
    coldStartStrategy="drop"
)

model = als.fit(training)

# Evaluate
predictions = model.transform(test)
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction"
)
rmse = evaluator.evaluate(predictions)
print(f"RMSE: {rmse}") # jak interpretować wartość

[Stage 109:>                                                        (0 + 2) / 2]

RMSE: 0.8081274264270969


                                                                                

In [5]:
# Top 10 recommendations for each user
user_recs = model.recommendForAllUsers(10)
user_recs.show(5, truncate=False)

# jakie filmy ogladal wczesniej a jakie dostal rekomendacje

# Top 10 recommendations for a specific user
user_42_recs = model.recommendForUserSubset(
    spark.createDataFrame([(42,)], ["user_id"]), 10
)

# Join with movie titles
from pyspark.sql.functions import explode

user_42_recs_flat = user_42_recs.select(
    "user_id",
    explode("recommendations").alias("rec")
).select("user_id", "rec.movie_id", "rec.rating")

user_42_recs_flat.join(movies, "movie_id").show(truncate=False)

                                                                                

+-------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|user_id|recommendations                                                                                                                                                                                              |
+-------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|1      |[{120821, 5.371759}, {121029, 5.2287755}, {102119, 5.0840597}, {107623, 5.048971}, {109887, 5.0230913}, {98328, 5.0230913}, {26793, 4.984653}, {116899, 4.969732}, {86401, 4.9678717}, {96631, 4.948954}]    |
|3      |[{121029, 5.8692026}, {120821, 5.8110275}, {116899, 5.5893846}, {109887, 5.568152}, {98328, 5.568152}, {107623, 5.563184}, {102

                                                                                

+--------+-------+---------+--------------------------------------------------------------+--------------------+
|movie_id|user_id|rating   |title                                                         |genres              |
+--------+-------+---------+--------------------------------------------------------------+--------------------+
|121029  |42     |5.4180927|No Distance Left to Run (2010)                                |Documentary         |
|77736   |42     |5.376377 |Crazy Stone (Fengkuang de shitou) (2006)                      |Comedy|Crime        |
|56869   |42     |5.2917137|Drained (O cheiro do Ralo) (2006)                             |Comedy              |
|110173  |42     |5.182763 |Wolf (2013)                                                   |Crime|Drama|Thriller|
|130347  |42     |5.0884967|Bill Hicks: Sane Man (1989)                                   |Comedy              |
|108713  |42     |5.070996 |Tomorrow Night (1998)                                         |Comed

In [6]:
from pyspark.sql.functions import explode, col

movie_recs = model.recommendForItemSubset(
    spark.createDataFrame([(1,)], ["movie_id"]), 10
)

# Check the schema
movie_recs.printSchema()

# Explode correctly - it's user_id, not movie_id
movie_recs_flat = movie_recs.select(
    col("movie_id"),
    explode("recommendations").alias("rec")
).select(
    "movie_id",
    col("rec.user_id").alias("user_id"),
    col("rec.rating").alias("score")
)

movie_recs_flat.show()

root
 |-- movie_id: integer (nullable = false)
 |-- recommendations: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- user_id: integer (nullable = true)
 |    |    |-- rating: float (nullable = true)

+--------+-------+---------+
|movie_id|user_id|    score|
+--------+-------+---------+
|       1|  27735|  5.61217|
|       1|  99259|  5.54677|
|       1| 108993| 5.512836|
|       1|  76693| 5.466339|
|       1|  53212|5.4587703|
|       1| 129211| 5.441646|
|       1|  30542| 5.427482|
|       1| 119513|5.3881965|
|       1|  61498| 5.371231|
|       1| 110387| 5.367866|
+--------+-------+---------+



In [7]:
!pip install numpy

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [8]:
from pyspark.ml.feature import Normalizer
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType
from pyspark.ml.linalg import Vectors
import numpy as np

# Get item factors
item_factors = model.itemFactors

# Get Toy Story's factor vector
toy_story_factor = item_factors.filter(col("id") == 1).select("features").collect()[0][0]

# Calculate cosine similarity
def cosine_sim(v):
    a = np.array(toy_story_factor)
    b = np.array(v)
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

cosine_udf = udf(cosine_sim, DoubleType())

# Find similar movies
similar_movies = item_factors \
    .withColumn("similarity", cosine_udf(col("features"))) \
    .filter(col("id") != 1) \
    .orderBy(col("similarity").desc()) \
    .limit(10) \
    .withColumnRenamed("id", "movie_id")

# Join with movie titles
similar_movies.join(movies, "movie_id") \
    .select("movie_id", "title", "similarity") \
    .show(truncate=False)

26/01/29 05:45:41 WARN TaskSetManager: Lost task 0.0 in stage 384.0 (TID 517) (172.18.0.5 executor 0): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/opt/spark/python/lib/pyspark.zip/pyspark/worker.py", line 3369, in main
    func, profiler, deserializer, serializer = read_udfs(pickleSer, infile, eval_type)
                                               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/spark/python/lib/pyspark.zip/pyspark/worker.py", line 3276, in read_udfs
    read_single_udf(
  File "/opt/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1305, in read_single_udf
    f, return_type = read_command(pickleSer, infile)
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/spark/python/lib/pyspark.zip/pyspark/worker_util.py", line 64, in read_command
    command = serializer._read_with_length(file)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/spark/python/lib/pyspark.zip/pyspark/serializers.py", 

PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "/opt/spark/python/lib/pyspark.zip/pyspark/worker.py", line 3369, in main
    func, profiler, deserializer, serializer = read_udfs(pickleSer, infile, eval_type)
                                               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/spark/python/lib/pyspark.zip/pyspark/worker.py", line 3276, in read_udfs
    read_single_udf(
  File "/opt/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1305, in read_single_udf
    f, return_type = read_command(pickleSer, infile)
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/spark/python/lib/pyspark.zip/pyspark/worker_util.py", line 64, in read_command
    command = serializer._read_with_length(file)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 173, in _read_with_length
    return self.loads(obj)
           ^^^^^^^^^^^^^^^
  File "/opt/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 473, in loads
    return cloudpickle.loads(obj, encoding=encoding)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/spark/python/lib/pyspark.zip/pyspark/cloudpickle/cloudpickle.py", line 469, in subimport
    __import__(name)
ModuleNotFoundError: No module named 'numpy'
