## Movie Recommendation Engine

### Import Libraries and Start Spark Session

In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

In [2]:
spark = SparkSession.builder.appName("Week 11 Assignment").getOrCreate()

### a) Prepare Data

In [3]:
ratings = spark.read.format("csv").options(header="true", inferSchema="true").load("data/movielens/ratings.csv")

In [4]:
ratings.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)



In [5]:
ratings.show()

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
|     1|      6|   4.0|964982224|
|     1|     47|   5.0|964983815|
|     1|     50|   5.0|964982931|
|     1|     70|   3.0|964982400|
|     1|    101|   5.0|964980868|
|     1|    110|   4.0|964982176|
|     1|    151|   5.0|964984041|
|     1|    157|   5.0|964984100|
|     1|    163|   5.0|964983650|
|     1|    216|   5.0|964981208|
|     1|    223|   3.0|964980985|
|     1|    231|   5.0|964981179|
|     1|    235|   4.0|964980908|
|     1|    260|   5.0|964981680|
|     1|    296|   3.0|964982967|
|     1|    316|   3.0|964982310|
|     1|    333|   5.0|964981179|
|     1|    349|   4.0|964982563|
+------+-------+------+---------+
only showing top 20 rows



In [6]:
movies = spark.read.format("csv").options(header="true", inferSchema="true").load("data/movielens/movies.csv")

In [7]:
movies.printSchema()

root
 |-- movieId: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)



In [8]:
movies.show()

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
|      6|         Heat (1995)|Action|Crime|Thri...|
|      7|      Sabrina (1995)|      Comedy|Romance|
|      8| Tom and Huck (1995)|  Adventure|Children|
|      9| Sudden Death (1995)|              Action|
|     10|    GoldenEye (1995)|Action|Adventure|...|
|     11|American Presiden...|Comedy|Drama|Romance|
|     12|Dracula: Dead and...|       Comedy|Horror|
|     13|        Balto (1995)|Adventure|Animati...|
|     14|        Nixon (1995)|               Drama|
|     15|Cutthroat Island ...|Action|Adventure|...|
|     16|       Casino (1995)|         Crime|Drama|
|     17|Sen

In [9]:
movies.write.saveAsTable("movies")

In [10]:
dataset = ratings.join(movies, "movieId")

In [11]:
dataset.printSchema()

root
 |-- movieId: integer (nullable = true)
 |-- userId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)



In [12]:
dataset.show()

+-------+------+------+---------+--------------------+--------------------+
|movieId|userId|rating|timestamp|               title|              genres|
+-------+------+------+---------+--------------------+--------------------+
|      1|     1|   4.0|964982703|    Toy Story (1995)|Adventure|Animati...|
|      3|     1|   4.0|964981247|Grumpier Old Men ...|      Comedy|Romance|
|      6|     1|   4.0|964982224|         Heat (1995)|Action|Crime|Thri...|
|     47|     1|   5.0|964983815|Seven (a.k.a. Se7...|    Mystery|Thriller|
|     50|     1|   5.0|964982931|Usual Suspects, T...|Crime|Mystery|Thr...|
|     70|     1|   3.0|964982400|From Dusk Till Da...|Action|Comedy|Hor...|
|    101|     1|   5.0|964980868|Bottle Rocket (1996)|Adventure|Comedy|...|
|    110|     1|   4.0|964982176|   Braveheart (1995)|    Action|Drama|War|
|    151|     1|   5.0|964984041|      Rob Roy (1995)|Action|Drama|Roma...|
|    157|     1|   5.0|964984100|Canadian Bacon (1...|          Comedy|War|
|    163|   

In [13]:
dataset.count()

100836

### b) Train Recommender

In [14]:
training, test = dataset.randomSplit([0.8, 0.2])

In [15]:
als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating", coldStartStrategy="drop")

In [16]:
model = als.fit(training)

In [17]:
predictions = model.transform(test)

In [18]:
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")

In [19]:
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 1.1008608174442225


### c) Generate top 10 movie recommendations

In [20]:
userRecs = model.recommendForAllUsers(10)

In [21]:
userRecs.show()

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|   471|[[80693, 9.948429...|
|   463|[[49274, 7.499418...|
|   496|[[33880, 7.319961...|
|   148|[[82378, 6.378358...|
|   540|[[7008, 7.315785]...|
|   392|[[3846, 10.095111...|
|   243|[[4721, 12.150444...|
|    31|[[1866, 8.642999]...|
|   516|[[1295, 9.3343725...|
|   580|[[54004, 7.087000...|
|   251|[[49274, 8.073708...|
|   451|[[28, 7.4293823],...|
|    85|[[4678, 7.9644423...|
|   137|[[1354, 5.757489]...|
|    65|[[49274, 7.577062...|
|   458|[[7318, 8.994628]...|
|   481|[[2290, 7.5823965...|
|    53|[[135861, 8.23651...|
|   255|[[45880, 10.21800...|
|   588|[[7318, 11.491583...|
+------+--------------------+
only showing top 20 rows



#### User 127 Reccomendations

In [22]:
user127 = userRecs[userRecs.userId == "127"]

In [23]:
user127.show(truncate=False)

+------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|userId|recommendations                                                                                                                                                                         |
+------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|127   |[[3980, 10.235895], [3646, 10.147677], [2135, 9.8488655], [1327, 9.6707115], [64575, 9.500293], [67923, 9.258942], [4735, 9.208395], [1354, 9.09548], [232, 9.001869], [1866, 8.577561]]|
+------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+



In [24]:
user127_list = user127.select("recommendations").collect()[0][0]

In [25]:
print("Top 10 Movie Recommendations for User 127")
print("--------------------------------------------")
for movie in user127_list:
    movieTitle = spark.sql("SELECT title FROM movies WHERE movieId = {}".format(movie[0])).collect()
    print(movie[0], "-", movieTitle[0][0])

Top 10 Movie Recommendations for User 127
--------------------------------------------
3980 - Men of Honor (2000)
3646 - Big Momma's House (2000)
2135 - Doctor Dolittle (1967)
1327 - Amityville Horror, The (1979)
64575 - Doubt (2008)
67923 - Fast & Furious (Fast and the Furious 4, The) (2009)
4735 - Ghosts of Mars (2001)
1354 - Breaking the Waves (1996)
232 - Eat Drink Man Woman (Yin shi nan nu) (1994)
1866 - Big Hit, The (1998)


#### User 151 Reccomendations

In [26]:
user151 = userRecs[userRecs.userId == "151"]

In [27]:
user151.show(truncate=False)

+------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|userId|recommendations                                                                                                                                                                              |
+------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|151   |[[1916, 7.7776775], [932, 6.9448237], [4356, 6.8649716], [2936, 6.8236713], [1096, 6.816572], [176371, 6.7171946], [1277, 6.6627345], [3424, 6.64536], [6818, 6.6003714], [71899, 6.5888443]]|
+------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+



In [28]:
user151_list = user151.select("recommendations").collect()[0][0]

In [29]:
print("Top 10 Movie Recommendations for User 151")
print("--------------------------------------------")
for movie in user151_list:
    movieTitle = spark.sql("SELECT title FROM movies WHERE movieId = {}".format(movie[0])).collect()
    print(movie[0], "-", movieTitle[0][0])

Top 10 Movie Recommendations for User 151
--------------------------------------------
1916 - Buffalo '66 (a.k.a. Buffalo 66) (1998)
932 - Affair to Remember, An (1957)
4356 - Gentlemen Prefer Blondes (1953)
2936 - Sullivan's Travels (1941)
1096 - Sophie's Choice (1982)
176371 - Blade Runner 2049 (2017)
1277 - Cyrano de Bergerac (1990)
3424 - Do the Right Thing (1989)
6818 - Come and See (Idi i smotri) (1985)
71899 - Mary and Max (2009)


#### User 300 Reccomendations

In [30]:
user300 = userRecs[userRecs.userId == "300"]

In [31]:
user300.show(truncate=False)

+------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|userId|recommendations                                                                                                                                                                             |
+------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|300   |[[89118, 7.1090584], [932, 6.934544], [176371, 6.400304], [81591, 6.1858225], [4467, 6.160431], [6283, 6.1164017], [6659, 6.0193977], [1237, 6.0088854], [27156, 5.97076], [1945, 5.962268]]|
+------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+



In [32]:
user300_list = user300.select("recommendations").collect()[0][0]

In [33]:
print("Top 10 Movie Recommendations for User 300")
print("--------------------------------------------")
for movie in user300_list:
    movieTitle = spark.sql("SELECT title FROM movies WHERE movieId = {}".format(movie[0])).collect()
    print(movie[0], "-", movieTitle[0][0])

Top 10 Movie Recommendations for User 300
--------------------------------------------
89118 - Skin I Live In, The (La piel que habito) (2011)
932 - Affair to Remember, An (1957)
176371 - Blade Runner 2049 (2017)
81591 - Black Swan (2010)
4467 - Adventures of Baron Munchausen, The (1988)
6283 - Cowboy Bebop: The Movie (Cowboy Bebop: Tengoku no Tobira) (2001)
6659 - Tremors (1990)
1237 - Seventh Seal, The (Sjunde inseglet, Det) (1957)
27156 - Neon Genesis Evangelion: The End of Evangelion (Shin seiki Evangelion Gekijô-ban: Air/Magokoro wo, kimi ni) (1997)
1945 - On the Waterfront (1954)
