In [140]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.appName("MovieRatings").getOrCreate()

In [141]:
# read csv
movie_ratings = spark.read.csv("../data/ml-100k/u.data", sep="\t", inferSchema=True)
# we only care about 2nd and 3rd column
movie_ratings = movie_ratings.select(["_c0", "_c1", "_c2"])
# descriptive column names
column_names = ["user_id", "movie_id", "rating"]
movie_ratings = movie_ratings.toDF(*column_names)

movie_ratings.show(5)

+-------+--------+------+
|user_id|movie_id|rating|
+-------+--------+------+
|    196|     242|     3|
|    186|     302|     3|
|     22|     377|     1|
|    244|      51|     2|
|    166|     346|     1|
+-------+--------+------+
only showing top 5 rows



In [142]:
target_movie_id = 1

# find all users who rated the movie
user_rows = (
    movie_ratings.filter(movie_ratings["movie_id"] == target_movie_id)
    .select("user_id")
    .collect()
)
users = [row["user_id"] for row in user_rows]

# the target ratings of a given user is the rating he gave to the target movie
target_ratings = (
    movie_ratings.filter(movie_ratings["movie_id"] == target_movie_id)
    .select(["user_id", "rating"])
    .withColumnRenamed("rating", "target_rating")
)

# df with the ratings given by the users who watched the target movie
users_ratings = movie_ratings.filter(movie_ratings["user_id"].isin(users))

# add target rating to this dataset
users_ratings = users_ratings.join(target_ratings, "user_id", "inner")

users_ratings.show(5)

+-------+--------+------+-------------+
|user_id|movie_id|rating|target_rating|
+-------+--------+------+-------------+
|    244|      51|     2|            4|
|    298|     474|     4|            5|
|    253|     465|     5|            5|
|    305|     451|     3|            5|
|      6|      86|     3|            4|
+-------+--------+------+-------------+
only showing top 5 rows



In [143]:
def rounded_rmse(col1, col2):
    mse = F.mean((col1 - col2) ** 2)
    rmse = F.sqrt(mse)
    return F.round(rmse, 3)


def rounded_cosine_sim(col1, col2):
    dot_product = F.sum(col1 * col2)
    norm_1 = F.sqrt(F.sum(col1**2))
    norm_2 = F.sqrt(F.sum(col2**2))
    cosine_sim = dot_product / (norm_1 * norm_2)
    return F.round(cosine_sim, 3)


recommendations = users_ratings.groupBy("movie_id").agg(
    rounded_cosine_sim(users_ratings["rating"], users_ratings["target_rating"]).alias(
        "cosine_sim"
    ),
    # (rounded_rmse(users_ratings["rating"], users_ratings["target_rating"])).alias("rmse"),
    F.count(users_ratings["rating"]).alias("num_of_ratings"),
)

recommendations = recommendations.filter(recommendations["num_of_ratings"] >= 20)

recommendations.sort("cosine_sim", ascending=False).show(5)

+--------+----------+--------------+
|movie_id|cosine_sim|num_of_ratings|
+--------+----------+--------------+
|       1|       1.0|           452|
|     923|      0.98|            27|
|     612|     0.979|            26|
|     921|     0.979|            26|
|     500|     0.978|            24|
+--------+----------+--------------+
only showing top 5 rows



In [144]:
# read csv
movie_info = spark.read.csv("../data/ml-100k/u.item", sep="|", inferSchema=True)
# we only care about 2nd and 3rd column
movie_info = movie_info.select(["_c0", "_c1"])
# The column '_c1' contains movie name and date, we only want the name
movie_info = movie_info.withColumn("_c1", F.split("_c1", pattern=" \(")[0])
# descriptive column names
column_names = ["movie_id", "movie_name"]
movie_info = movie_info.toDF(*column_names)

movie_info.show(5)

+--------+----------+
|movie_id|movie_name|
+--------+----------+
|       1| Toy Story|
|       2| GoldenEye|
|       3|Four Rooms|
|       4|Get Shorty|
|       5|   Copycat|
+--------+----------+
only showing top 5 rows



In [145]:
# join movie_info to movie_ratings so that movie_ratings now displays the movie name
named_recommendations = recommendations.join(movie_info, "movie_id", "left")
named_recommendations.sort("cosine_sim", ascending=False).show(10)

+--------+----------+--------------+--------------------+
|movie_id|cosine_sim|num_of_ratings|          movie_name|
+--------+----------+--------------+--------------------+
|       1|       1.0|           452|           Toy Story|
|     923|      0.98|            27|Raise the Red Lan...|
|     921|     0.979|            26|Farewell My Concu...|
|     612|     0.979|            26|        Lost Horizon|
|     500|     0.978|            24|       Fly Away Home|
|     524|     0.978|            26| Great Dictator, The|
|     499|     0.977|            32|Cat on a Hot Tin ...|
|    1051|     0.976|            32|      Associate, The|
|    1039|     0.975|            67|              Hamlet|
|     489|     0.975|            31|           Notorious|
+--------+----------+--------------+--------------------+
only showing top 10 rows



In [146]:
spark.stop()