In [3]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Reuse the active Spark session if one exists, otherwise start a new one
# registered under the application name "MovieRecommendations".
spark = (
    SparkSession.builder
    .appName("MovieRecommendations")
    .getOrCreate()
)

In [4]:
# Load the tab-separated ratings file. No header option is passed, so Spark
# auto-names the columns _c0, _c1, ...; column types are inferred from the data.
column_names = ["user", "item", "rating"]
movie_ratings = (
    spark.read.csv("../data/ml-100k/u.data", sep="\t", inferSchema=True)
    # keep only the first three columns; anything after them is dropped
    .select("_c0", "_c1", "_c2")
    # replace the auto-generated names with descriptive ones
    .toDF(*column_names)
)

movie_ratings.show(5)

+----+----+------+
|user|item|rating|
+----+----+------+
| 196| 242|     3|
| 186| 302|     3|
|  22| 377|     1|
| 244|  51|     2|
| 166| 346|     1|
+----+----+------+
only showing top 5 rows



In [5]:
from pyspark.ml.recommendation import ALS

# Alternating Least Squares collaborative filtering on the ratings frame.
# The column names are spelled out for readability; they are the same as the
# ALS defaults. All other hyperparameters stay at their defaults, and the
# fixed seed keeps the fit reproducible.
als = ALS(
    userCol="user",
    itemCol="item",
    ratingCol="rating",
    seed=42,
)
model = als.fit(movie_ratings)

In [6]:
# Ask the fitted model for the single best item per user. The next cell reads
# the result as a `recommendations` array whose elements expose `item` and
# `rating` fields.
user_recs = model.recommendForAllUsers(numItems=1)

In [7]:
# Flatten the one-element `recommendations` array into two plain columns.
# NOTE: this cell overwrites `user_recs` and drops `recommendations`, so it
# must be preceded by the recommendForAllUsers cell each time it is run.
first_rec = user_recs["recommendations"][0]
user_recs = (
    user_recs
    .withColumn("movie_id", first_rec["item"])
    .withColumn("predicted_rating", first_rec["rating"])
    # the nested array is no longer needed once its fields are extracted
    .drop("recommendations")
)

user_recs.show(5)



+----+--------+----------------+
|user|movie_id|predicted_rating|
+----+--------+----------------+
|   1|    1463|        5.122655|
|   2|    1643|       5.0821095|
|   3|    1463|        5.146367|
|   4|    1466|       5.5968733|
|   5|     114|       4.6203103|
+----+--------+----------------+
only showing top 5 rows



                                                                                

In [8]:
# Load the pipe-separated movie metadata file. The MovieLens 100k u.item file
# is encoded as ISO-8859-1 (Latin-1), so decode it explicitly; Spark's default
# UTF-8 would garble titles containing accented characters.
movie_info = spark.read.csv(
    "../data/ml-100k/u.item", sep="|", inferSchema=True, encoding="ISO-8859-1"
)
# we only care about the 1st and 2nd columns (movie id and title)
movie_info = movie_info.select(["_c0", "_c1"])
# The column '_c1' contains the movie name followed by "(<year>)"; we only want
# the name. Strip just the trailing four-digit year so that titles which
# themselves contain parentheses are not cut short. The pattern is a raw string
# so the backslashes reach the regex engine unchanged.
movie_info = movie_info.withColumn(
    "_c1", F.regexp_replace("_c1", r"\s*\(\d{4}\)\s*$", "")
)
# descriptive column names
column_names = ["movie_id", "movie_name"]
movie_info = movie_info.toDF(*column_names)

movie_info.show(5)

+--------+----------+
|movie_id|movie_name|
+--------+----------+
|       1| Toy Story|
|       2| GoldenEye|
|       3|Four Rooms|
|       4|Get Shorty|
|       5|   Copycat|
+--------+----------+
only showing top 5 rows



In [9]:
# Attach the movie name to each recommendation in user_recs. The left join
# keeps every recommendation row even if movie_info has no matching movie_id.
named_user_recs = (
    user_recs.join(movie_info, on="movie_id", how="left")
    # user first, then the recommended movie and its predicted rating
    .select("user", "movie_id", "movie_name", "predicted_rating")
)

named_user_recs.show(5)

                                                                                

+----+--------+--------------------+----------------+
|user|movie_id|          movie_name|predicted_rating|
+----+--------+--------------------+----------------+
|   1|    1463|           Boys, Les|        5.122655|
|   2|    1643|          Angel Baby|       5.0821095|
|   3|    1463|           Boys, Les|        5.146367|
|   4|    1466|   Margaret's Museum|       5.5968733|
|   5|     114|Wallace & Gromit:...|       4.6203103|
+----+--------+--------------------+----------------+
only showing top 5 rows



To analyze the results it may be useful to manually create users with predictable
behavior.
For example, you can create a user 0 with your own personal preferences
and see if the model's recommendations are any good.


If you try that, you will likely find that the recommendations are not very good.
Why? It's complicated: recommendation is a full ML problem in itself, and the model
above was fitted with default hyperparameters and never evaluated on held-out data.
So, never blindly trust the results of a black box.


In [10]:
# Shut down the Spark session created at the top of the notebook.
spark.stop()