# Predicting Movie Ratings 

Define locations of files

In [1]:
import os
dbfs_dir = '/user/root/ml-20m'
ratings_filename = dbfs_dir + '/ratings.csv'
movies_filename = dbfs_dir + '/movies.csv'

# for windows
if os.path.sep != '/':
    ratings_filename = ratings_filename.replace('/', os.path.sep)
    movies_filename = movies_filename.replace('/', os.path.sep)

## Part 0: Preliminaries 

### CPU vs I/O tradeoff

In [2]:
from pyspark.sql.types import *

ratings_df_schema = StructType(
    [StructField('userId', IntegerType()),
     StructField('movieId', IntegerType()),
     StructField('rating', DoubleType())]
)

movies_df_schema = StructType(
    [StructField('ID', IntegerType()),
     StructField('title', StringType())]
)

### Load and Cache

In [3]:
from pyspark.sql.functions import regexp_extract
from pyspark.sql.types import *

raw_ratings_df = (sqlContext
                  .read.format('com.databricks.spark.csv')
                  .options(header=True, inferSchema=False)
                  .schema(ratings_df_schema)
                  .load(ratings_filename))
ratings_df = raw_ratings_df.drop('Timestamp')

raw_movies_df = (sqlContext
                 .read.format('com.databricks.spark.csv')
                 .options(header=True, inferSchema=False)
                 .schema(movies_df_schema)
                 .load(movies_filename))
movies_df = raw_movies_df.drop('Genres').withColumnRenamed('movieID', 'ID')

ratings_df.cache()
movies_df.cache()
assert ratings_df.is_cached
assert movies_df.is_cached
raw_ratings_count = raw_ratings_df.count()
ratings_count = ratings_df.count()
raw_movies_count = raw_movies_df.count()
movies_count = movies_df.count()

print('There are {0} ratings and {1} movies in the datasets'.format(ratings_count, movies_count))
print('Ratings: ')
ratings_df.show(3)
print('Movies: ')
movies_df.show(n=3, truncate=False)

assert raw_ratings_count == ratings_count
assert raw_movies_count == movies_count

There are 20000263 ratings and 27278 movies in the datasets
Ratings: 
+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|     1|      2|   3.5|
|     1|     29|   3.5|
|     1|     32|   3.5|
+------+-------+------+
only showing top 3 rows

Movies: 
+---+-----------------------+
|ID |title                  |
+---+-----------------------+
|1  |Toy Story (1995)       |
|2  |Jumanji (1995)         |
|3  |Grumpier Old Men (1995)|
+---+-----------------------+
only showing top 3 rows



A quick verification of data.

In [4]:
assert ratings_count == 20000263
assert movies_count == 27278
assert movies_df.filter(movies_df['title'] == 'Toy Story (1995)').count() == 1
assert ratings_df.filter((ratings_df['userId'] == 6) & (ratings_df['movieId'] == 1) & (ratings_df['rating'] == 5.0)).count() == 1

## Part 1: Basic Recommendations
### (1a) Movies with Highest Average Ratings

In [5]:
from pyspark.sql import functions as F

movie_ids_with_avg_ratings_df = (ratings_df
                                 .groupBy('movieId')
                                 .agg(F.count(ratings_df['rating']).alias('count'), F.avg(ratings_df['rating']).alias('average')))
#movie_ids_with_avg_ratings_df = ratings_df.groupBy('movieId').agg(F.count(ratings_df.rating).alias("count"), F.avg(ratings_df.rating).alias("average"))
print('movie_ids_with_avg_ratings_df:')
movie_ids_with_avg_ratings_df.show(n=3, truncate=False)

movie_names_df = movie_ids_with_avg_ratings_df.join(movies_df, F.col('movieId') == F.col('ID'))
movie_ids_with_avg_ratings_df = movie_names_df.select(F.col('average'), F.col('title'), F.col('count'), F.col('movieId'))

print('movie_names_with_avg_ratings_df:')
movie_ids_with_avg_ratings_df.show(n=3, truncate=False)

movie_ids_with_avg_ratings_df:
+-------+-----+------------------+
|movieId|count|average           |
+-------+-----+------------------+
|3997   |2047 |2.0703468490473864|
|1580   |35580|3.55831928049466  |
|3918   |1246 |2.918940609951846 |
+-------+-----+------------------+
only showing top 3 rows

movie_names_with_avg_ratings_df:
+------------------+--------------------------------+-----+-------+
|average           |title                           |count|movieId|
+------------------+--------------------------------+-----+-------+
|2.0703468490473864|Dungeons & Dragons (2000)       |2047 |3997   |
|3.55831928049466  |Men in Black (a.k.a. MIB) (1997)|35580|1580   |
|2.918940609951846 |Hellbound: Hellraiser II (1988) |1246 |3918   |
+------------------+--------------------------------+-----+-------+
only showing top 3 rows



In [6]:
assert movie_ids_with_avg_ratings_df.count() == 26744
movie_ids_with_ratings_take_ordered = movie_ids_with_avg_ratings_df.orderBy('MovieId').take(3)
result = [(r['average'], r['title'], r['count'], r['movieId']) for r in movie_ids_with_ratings_take_ordered]
print(result)
movie_ids_with_ratings_take_ordered = movie_ids_with_avg_ratings_df.orderBy('average', 'title').take(3)
result = [(r['average'], r['title'], r['count'], r['movieId']) for r in movie_ids_with_ratings_take_ordered]
print(result)

[(3.921239561324077, u'Toy Story (1995)', 49695, 1), (3.2119768016904193, u'Jumanji (1995)', 22243, 2), (3.1510404397330194, u'Grumpier Old Men (1995)', 12735, 3)]
[(0.5, u'13 Fighting Men (1960)', 1, 109355), (0.5, u'20 Years After (2008)', 1, 131062), (0.5, u'3 Holiday Tails (Golden Christmas 2: The Second Tail, A) (2011)', 1, 111040)]


### (1b) Movies with Highest Average Ratings and at least 500 Reviews

In [7]:
movies_with_500_ratings_or_more = movie_ids_with_avg_ratings_df.where('count >= 500').orderBy('average',ascending=False)
print('Movies with 500 ratings or more:')
movies_with_500_ratings_or_more.show(truncate=False)

Movies with 500 ratings or more:
+------------------+---------------------------------------------------------------------------+-----+-------+
|average           |title                                                                      |count|movieId|
+------------------+---------------------------------------------------------------------------+-----+-------+
|4.446990499637029 |Shawshank Redemption, The (1994)                                           |63366|318    |
|4.364732196832306 |Godfather, The (1972)                                                      |41355|858    |
|4.334372207803259 |Usual Suspects, The (1995)                                                 |47006|50     |
|4.310175010988133 |Schindler's List (1993)                                                    |50054|527    |
|4.275640557704942 |Godfather: Part II, The (1974)                                             |27398|1221   |
|4.2741796572216   |Seven Samurai (Shichinin no samurai) (1954)                

## Part 2: Collaborative Filtering

### (2a) Creating a Training Set

In [8]:
seed = 1800009193L
(split_60_df, split_a_20_df, split_b_20_df) = ratings_df.randomSplit([0.6, 0.2, 0.2], seed)
training_df = split_60_df.cache()
validation_df = split_a_20_df.cache()
test_df = split_b_20_df.cache()

In [9]:
print('Training: {0}, validation: {1}, test: {2}'.format(training_df.count(), validation_df.count(), test_df.count()))

Training: 11998949, validation: 4003888, test: 3997426


In [10]:
training_df.show(3)
validation_df.show(3)
test_df.show(3)

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|     1|      2|   3.5|
|     1|     29|   3.5|
|     1|     47|   3.5|
+------+-------+------+
only showing top 3 rows

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|     1|     32|   3.5|
|     1|    253|   4.0|
|     1|    293|   4.0|
+------+-------+------+
only showing top 3 rows

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|     1|    112|   3.5|
|     1|    151|   4.0|
|     1|    318|   4.0|
+------+-------+------+
only showing top 3 rows



### (2b) Alternating Least Squares

In [11]:
from pyspark.ml.recommendation import ALS
als = ALS()
als.setMaxIter(5) \
   .setSeed(seed) \
   .setRegParam(0.1) \
   .setUserCol('userId') \
   .setItemCol('movieId') \
   .setRatingCol('rating')
    
from pyspark.ml.evaluation import RegressionEvaluator

reg_eval = RegressionEvaluator(predictionCol='prediction', labelCol='rating', metricName='rmse')
ranks = [4, 8, 12]
errors = [0, 0, 0]
models = [0, 0, 0]
err = 0
min_error = float('inf')
best_rank = -1
for rank in ranks:
    als.setRank(rank)
    model = als.fit(training_df)
    predict_df = model.transform(validation_df)
    predicted_ratings_df = predict_df.filter(predict_df['prediction'] != float('nan'))
    error = reg_eval.evaluate(predicted_ratings_df)
    errors[err] = error
    models[err] = model
    print('For rank {0} the RMSE is {1}'.format(rank, error))
    if error < min_error:
        min_error = error
        best_rank = err
    err += 1
als.setRank(ranks[best_rank])
print('The best model was trained with rank {0}'.format(ranks[best_rank]))
my_model = models[best_rank]

For rank 4 the RMSE is 0.828345386773
For rank 8 the RMSE is 0.816184007672
For rank 12 the RMSE is 0.809510159004
The best model was trained with rank 12


### (2c) Testing Your Model

In [12]:
predict_df = my_model.transform(test_df)
predicted_test_df = predict_df.filter(predict_df['prediction'] != float('nan'))
test_RMSE = reg_eval.evaluate(predicted_test_df)
print('The model had a RMSE on the test set of {0}'.format(test_RMSE))

The model had a RMSE on the test set of 0.809435932161


### (2d) Comparing Your Model

In [13]:
avg_rating_df = training_df.groupBy().avg('rating')
training_avg_rating = avg_rating_df.collect()[0][0]
print('The average rating for movies in the training set is {0}'.format(training_avg_rating))
test_for_avg_df = test_df.withColumn('prediction', F.lit(training_avg_rating))
test_avg_RMSE = reg_eval.evaluate(test_for_avg_df)
print('The RMSE on the average set is {0}'.format(test_avg_RMSE))

The average rating for movies in the training set is 3.52569066674
The RMSE on the average set is 1.05264453981


## Part 3: Predictions for Yourself

### (3a) Your Movie Ratings

In [14]:
print('Most rated movies')
print('average rating, movie name, number of reviews, movie ID')
movies_with_500_ratings_or_more.show(100, truncate=False)

Most rated movies
average rating, movie name, number of reviews, movie ID
+------------------+----------------------------------------------------------------------------------------------------+-----+-------+
|average           |title                                                                                               |count|movieId|
+------------------+----------------------------------------------------------------------------------------------------+-----+-------+
|4.446990499637029 |Shawshank Redemption, The (1994)                                                                    |63366|318    |
|4.364732196832306 |Godfather, The (1972)                                                                               |41355|858    |
|4.334372207803259 |Usual Suspects, The (1995)                                                                          |47006|50     |
|4.310175010988133 |Schindler's List (1993)                                                                   

In [15]:
from pyspark.sql import Row
my_user_id = 0
my_rated_movies = [
    (my_user_id, 858, 5),
    (my_user_id, 1221, 5),
    (my_user_id, 7502, 5),
    (my_user_id, 2571, 5),
    (my_user_id, 79132, 5),
    (my_user_id, 7153, 5),
    (my_user_id, 318, 5),
    (my_user_id, 527, 5),
    (my_user_id, 912, 5),
    (my_user_id, 6016, 5)
]

my_ratings_df = sqlContext.createDataFrame(my_rated_movies, ['userId', 'movieId', 'rating'])
print('My movie ratings:')
my_ratings_df.show(10)

My movie ratings:
+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|     0|    858|     5|
|     0|   1221|     5|
|     0|   7502|     5|
|     0|   2571|     5|
|     0|  79132|     5|
|     0|   7153|     5|
|     0|    318|     5|
|     0|    527|     5|
|     0|    912|     5|
|     0|   6016|     5|
+------+-------+------+



### (3b) Add Your Movies to Training Dataset

In [16]:
training_with_my_ratings_df = training_df.unionAll(my_ratings_df)
print('The training dataset now has {0} more entries than the original training dataset'.format(training_with_my_ratings_df.count()-training_df.count()))
assert (training_with_my_ratings_df.count()-training_df.count() == my_ratings_df.count())

The training dataset now has 10 more entries than the original training dataset


### (3c) Train a Model with Your Ratings

In [17]:
als.setPredictionCol('prediction') \
   .setMaxIter(5) \
   .setSeed(seed) \
   .setUserCol('userId') \
   .setItemCol('movieId') \
   .setRatingCol('rating')
    
my_ratings_model = als.fit(training_with_my_ratings_df)

### (3d) Check RMSE for the New Model with Your Ratings

In [18]:
my_predict_df = my_ratings_model.transform(test_df)
predicted_test_my_ratings_df = my_predict_df.filter(my_predict_df['prediction'] != float('nan'))
test_RMSE_my_ratings = reg_eval.evaluate(predicted_test_my_ratings_df)
print('The model had a RMSE on the test set of {0}'.format(test_RMSE_my_ratings))

The model had a RMSE on the test set of 0.810968800507


### (3e) Predict Your Ratings

In [19]:
my_rated_movie_ids = [x[1] for x in my_rated_movies]
not_rated_df = movies_df.filter(~ F.col('ID').isin(my_rated_movie_ids))
my_unrated_movies_df = not_rated_df.withColumnRenamed('ID', 'movieId').withColumn('userId', F.lit(my_user_id))
raw_predicted_ratings_df = my_ratings_model.transform(my_unrated_movies_df)
predicted_ratings_df = raw_predicted_ratings_df.filter(raw_predicted_ratings_df['prediction'] != float('nan'))
predicted_with_counts_df = predicted_ratings_df.join(movies_df, F.col('movieId')==F.col('ID'))
predicted_highest_rated_movies_df = predicted_with_counts_df.sort(F.col('prediction').desc())
print('My 25 highest rated movies as predicted (for movies with more than 75 reviews:)')
predicted_highest_rated_movies_df.show(25,False)

My 25 highest rated movies as predicted (for movies with more than 75 reviews:)
+-------+-----------------------------------------------+------+----------+------+-----------------------------------------------+
|movieId|title                                          |userId|prediction|ID    |title                                          |
+-------+-----------------------------------------------+------+----------+------+-----------------------------------------------+
|77736  |Crazy Stone (Fengkuang de shitou) (2006)       |0     |6.0034633 |77736 |Crazy Stone (Fengkuang de shitou) (2006)       |
|121029 |No Distance Left to Run (2010)                 |0     |5.9715695 |121029|No Distance Left to Run (2010)                 |
|3226   |Hellhounds on My Trail (1999)                  |0     |5.9426756 |3226  |Hellhounds on My Trail (1999)                  |
|120134 |Doggiewoggiez! Poochiewoochiez! (2012)         |0     |5.8386703 |120134|Doggiewoggiez! Poochiewoochiez! (2012)         |
|13