In [22]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.appName("MovieRatings").getOrCreate()

In [23]:
# read csv
movie_ratings = spark.read.csv("../data/ml-100k/u.data", sep="\t", inferSchema=True)
# we only care about 2nd and 3rd column
movie_ratings = movie_ratings.select(["_c1", "_c2"])
# descriptive column names
column_names = ["movie_id", "rating"]
movie_ratings = movie_ratings.toDF(*column_names)

movie_ratings.show(5)

+--------+------+
|movie_id|rating|
+--------+------+
|     242|     3|
|     302|     3|
|     377|     1|
|      51|     2|
|     346|     1|
+--------+------+
only showing top 5 rows



In [24]:
# read csv
movie_info = spark.read.csv("../data/ml-100k/u.item", sep="|", inferSchema=True)
# we only care about 2nd and 3rd column
movie_info = movie_info.select(["_c0", "_c1"])
# The column '_c1' contains movie name and date, we only want the name
movie_info = movie_info.withColumn("_c1", F.split("_c1", pattern=" \(")[0])
# descriptive column names
column_names = ["movie_id", "movie_name"]
movie_info = movie_info.toDF(*column_names)

movie_info.show(5)

+--------+----------+
|movie_id|movie_name|
+--------+----------+
|       1| Toy Story|
|       2| GoldenEye|
|       3|Four Rooms|
|       4|Get Shorty|
|       5|   Copycat|
+--------+----------+
only showing top 5 rows



In [25]:
# join movie_info to movie_ratings so that movie_ratings now displays the movie name
movie_ratings = movie_ratings.join(movie_info, "movie_id", "left")
movie_ratings.show(5)

+--------+------+-------------------+
|movie_id|rating|         movie_name|
+--------+------+-------------------+
|     242|     3|              Kolya|
|     302|     3|  L.A. Confidential|
|     377|     1|       Heavyweights|
|      51|     2|Legends of the Fall|
|     346|     1|       Jackie Brown|
+--------+------+-------------------+
only showing top 5 rows



In [26]:
rating_counts = movie_ratings.groupBy(["movie_id", "movie_name"]).agg(
    F.count("rating").alias("num_of_ratings")
)
ordered_rating_counts = rating_counts.orderBy("num_of_ratings", ascending=False)
ordered_rating_counts.show(10)

+--------+--------------------+--------------+
|movie_id|          movie_name|num_of_ratings|
+--------+--------------------+--------------+
|      50|           Star Wars|           583|
|     258|             Contact|           509|
|     100|               Fargo|           508|
|     181|  Return of the Jedi|           507|
|     294|           Liar Liar|           485|
|     286|English Patient, The|           481|
|     288|              Scream|           478|
|       1|           Toy Story|           452|
|     300|       Air Force One|           431|
|     121|    Independence Day|           429|
+--------+--------------------+--------------+
only showing top 10 rows



In [27]:
spark.stop()