In [1]:
!pip install pyspark



In [5]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("MovieRecommender") \
    .master("local[*]") \
    .config("spark.jars", "../dependencies/postgresql-42.7.1.jar") \
    .config("spark.driver.memory", "16g") \
    .config("spark.executor.memory", "16g") \
    .config("spark.driver.host", "127.0.0.1") \
    .config("spark.driver.bindAddress", "127.0.0.1") \
    .getOrCreate()

jdbc_url = "jdbc:postgresql://localhost:5432/recommender"
properties = {
    "user": "recommender",
    "password": "recommender",
    "driver": "org.postgresql.Driver"
}

# Load tables
ratings = spark.read.jdbc(jdbc_url, "movielens.ratings", properties=properties)
movies = spark.read.jdbc(jdbc_url, "movielens.movies", properties=properties)

ratings.show(5)
movies.show(10)

[Stage 3:>                                                          (0 + 1) / 1]

+-------+--------+------+-------------------+
|user_id|movie_id|rating|   rating_timestamp|
+-------+--------+------+-------------------+
|      1|       2|   3.5|2005-04-02 23:53:47|
|      1|      29|   3.5|2005-04-02 23:31:16|
|      1|      32|   3.5|2005-04-02 23:33:39|
|      1|      47|   3.5|2005-04-02 23:32:07|
|      1|      50|   3.5|2005-04-02 23:29:40|
+-------+--------+------+-------------------+
only showing top 5 rows
+--------+--------------------+--------------------+
|movie_id|               title|              genres|
+--------+--------------------+--------------------+
|       1|    Toy Story (1995)|Adventure|Animati...|
|       2|      Jumanji (1995)|Adventure|Childre...|
|       3|Grumpier Old Men ...|      Comedy|Romance|
|       4|Waiting to Exhale...|Comedy|Drama|Romance|
|       5|Father of the Bri...|              Comedy|
|       6|         Heat (1995)|Action|Crime|Thri...|
|       7|      Sabrina (1995)|      Comedy|Romance|
|       8| Tom and Huck (1995)| 

                                                                                

In [7]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

# Prepare data
(training, test) = ratings.randomSplit([0.7, 0.3], seed=42)

# Build ALS model
als = ALS(
    maxIter=10,
    regParam=0.1,
    userCol="user_id",
    itemCol="movie_id",
    ratingCol="rating",
    coldStartStrategy="drop"
)

model = als.fit(training)

# Evaluate
predictions = model.transform(test)
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction"
)
rmse = evaluator.evaluate(predictions)
print(f"RMSE: {rmse}") # jak interpretować wartość

[Stage 266:>                                                      (0 + 12) / 13]

RMSE: 0.8065987840837545


                                                                                

In [8]:
# Top 10 recommendations for each user
user_recs = model.recommendForAllUsers(10)
user_recs.show(5, truncate=False)

# jakie filmy ogladal wczesniej a jakie dostal rekomendacje

# Top 10 recommendations for a specific user
user_42_recs = model.recommendForUserSubset(
    spark.createDataFrame([(42,)], ["user_id"]), 10
)

# Join with movie titles
from pyspark.sql.functions import explode

user_42_recs_flat = user_42_recs.select(
    "user_id",
    explode("recommendations").alias("rec")
).select("user_id", "rec.movie_id", "rec.rating")

user_42_recs_flat.join(movies, "movie_id").show(truncate=False)

                                                                                

+-------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|user_id|recommendations                                                                                                                                                                                          |
+-------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|28     |[{107673, 5.993949}, {92161, 5.754004}, {126090, 5.691018}, {89347, 5.6727877}, {80619, 5.590352}, {91517, 5.5237346}, {81104, 5.517514}, {101862, 5.498435}, {96030, 5.436287}, {6026, 5.429839}]       |
|31     |[{120821, 6.4272594}, {61913, 6.0270467}, {107780, 5.68834}, {115708, 5.4932604}, {27092, 5.4616733}, {112809, 5.435225}, {6511, 5.3223877}, {8

                                                                                

+--------+-------+---------+-------------------------------------------------------------+--------------------+
|movie_id|user_id|rating   |title                                                        |genres              |
+--------+-------+---------+-------------------------------------------------------------+--------------------+
|77736   |42     |5.725844 |Crazy Stone (Fengkuang de shitou) (2006)                     |Comedy|Crime        |
|26793   |42     |5.602836 |Tito and Me (Tito i ja) (1992)                               |Comedy              |
|129536  |42     |5.4690504|Code Name Coq Rouge (1989)                                   |(no genres listed)  |
|112577  |42     |5.329436 |Willie & Phil (1980)                                         |Comedy|Drama|Romance|
|106353  |42     |5.2808037|Holes in My Shoes (2006)                                     |Documentary         |
|112473  |42     |5.2644615|Stuart: A Life Backward (2007)                               |Drama         

In [9]:
from pyspark.sql.functions import explode, col

movie_recs = model.recommendForItemSubset(
    spark.createDataFrame([(1,)], ["movie_id"]), 10
)

# Check the schema
movie_recs.printSchema()

# Explode correctly - it's user_id, not movie_id
movie_recs_flat = movie_recs.select(
    col("movie_id"),
    explode("recommendations").alias("rec")
).select(
    "movie_id",
    col("rec.user_id").alias("user_id"),
    col("rec.rating").alias("score")
)

movie_recs_flat.show()

root
 |-- movie_id: integer (nullable = false)
 |-- recommendations: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- user_id: integer (nullable = true)
 |    |    |-- rating: float (nullable = true)

+--------+-------+---------+
|movie_id|user_id|    score|
+--------+-------+---------+
|       1|  59879| 5.589778|
|       1| 108993| 5.534154|
|       1|  27735| 5.501471|
|       1|  76693|5.4741354|
|       1| 119513|5.4539137|
|       1|  61498|5.4523168|
|       1| 122277| 5.439764|
|       1| 108389| 5.433884|
|       1|  30542|5.4208245|
|       1| 104672|5.3739924|
+--------+-------+---------+



In [10]:
from pyspark.ml.feature import Normalizer
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType
from pyspark.ml.linalg import Vectors
import numpy as np

# Get item factors
item_factors = model.itemFactors

# Get Toy Story's factor vector
toy_story_factor = item_factors.filter(col("id") == 1).select("features").collect()[0][0]

# Calculate cosine similarity
def cosine_sim(v):
    a = np.array(toy_story_factor)
    b = np.array(v)
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

cosine_udf = udf(cosine_sim, DoubleType())

# Find similar movies
similar_movies = item_factors \
    .withColumn("similarity", cosine_udf(col("features"))) \
    .filter(col("id") != 1) \
    .orderBy(col("similarity").desc()) \
    .limit(10) \
    .withColumnRenamed("id", "movie_id")

# Join with movie titles
similar_movies.join(movies, "movie_id") \
    .select("movie_id", "title", "similarity") \
    .show(truncate=False)

+--------+--------------------------+------------------+
|movie_id|title                     |similarity        |
+--------+--------------------------+------------------+
|2355    |Bug's Life, A (1998)      |0.9882707793472743|
|3114    |Toy Story 2 (1999)        |0.9962658251924009|
|4886    |Monsters, Inc. (2001)     |0.984168171410898 |
|8827    |Bill Cosby, Himself (1983)|0.9889569138635115|
|8961    |Incredibles, The (2004)   |0.9841408640408783|
|48268   |Empire Falls (2005)       |0.9833830519464961|
|78499   |Toy Story 3 (2010)        |0.9861497811406748|
|81823   |Katalin Varga (2009)      |0.9871678567578743|
|102294  |Scapegoat, The (2012)     |0.9827218395442227|
|109423  |Your Friend the Rat (2007)|0.9829705384322079|
+--------+--------------------------+------------------+



                                                                                