In [None]:
#!pip install pyspark

In [None]:
#!pip install setuptools

In [1]:
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName('Recommendations') \
    .master('local') \
    .config('spark.driver.memory', '2g') \
    .config('spark.executor.memory', '4g') \
    .getOrCreate()
    # .config('spark.executor.instances', '2') \

In [3]:
import pandas as pd
import numpy as np

ratings = spark.read.csv('hdfs://localhost:50054/test/ratings.csv', inferSchema=True, header=True)
ratings.limit(5).toPandas()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
movies = spark.read.csv('hdfs://localhost:50054/test/movies.csv', inferSchema=True, header=True)
movies.limit(5).toPandas()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
user_movie_ratings = ratings.join(movies, on='movieId', how='inner')
user_movie_ratings.limit(5).toPandas()

Unnamed: 0,movieId,userId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,3,1,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance
2,6,1,4.0,964982224,Heat (1995),Action|Crime|Thriller
3,47,1,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,50,1,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [7]:
user_movie_ratings.printSchema()

root
 |-- movieId: integer (nullable = true)
 |-- userId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)



In [8]:
train, test = user_movie_ratings.randomSplit([0.8, 0.2])

In [9]:
from pyspark.ml.recommendation import ALS

rec = ALS(maxIter=10, 
          regParam=0.01, 
          userCol='userId', 
          itemCol='movieId', 
          ratingCol='rating', 
          nonnegative=True,
          coldStartStrategy='drop')

rec_model = rec.fit(train)

In [10]:
predicted_ratings = rec_model.transform(test)
predicted_ratings.limit(5).toPandas()

Unnamed: 0,movieId,userId,rating,timestamp,title,genres,prediction
0,296,463,4.0,1145460490,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,4.505833
1,527,471,4.5,1496671869,Schindler's List (1993),Drama|War,3.877167
2,720,496,1.0,1415165412,Wallace & Gromit: The Best of Aardman Animatio...,Adventure|Animation|Comedy,4.731792
3,858,496,5.0,1415166629,"Godfather, The (1972)",Crime|Drama,4.379451
4,1088,463,3.5,1145460096,Dirty Dancing (1987),Drama|Musical|Romance,3.416477


In [11]:
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(metricName='rmse',
                                predictionCol='prediction',
                                labelCol='rating')

rmse = evaluator.evaluate(predicted_ratings)
print(rmse)

1.016815227011378


In [23]:
def test_ALS(user, n):
    user = user
    user_movies = user_movie_ratings.filter(user_movie_ratings.userId == user)
    print(f'User {user} has rated {user_movies.select('movieId').distinct().count()} movies')
    recommendations = rec_model.transform(user_movies)
    recommendations = recommendations.sort('prediction', ascending=False)
    recommendations = recommendations.join(movies, on='movieId', how='inner')
    return recommendations.limit(n).toPandas()

In [24]:
user = 599
n = 10
test_ALS(user, n)


User 599 has rated 2478 movies


Unnamed: 0,movieId,userId,rating,timestamp,title,genres,prediction,title.1,genres.1
0,1,599,3.0,1498524204,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.090849,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,599,2.5,1498514085,Jumanji (1995),Adventure|Children|Fantasy,2.739783,Jumanji (1995),Adventure|Children|Fantasy
2,3,599,1.5,1498505071,Grumpier Old Men (1995),Comedy|Romance,3.006966,Grumpier Old Men (1995),Comedy|Romance
3,6,599,4.5,1498539623,Heat (1995),Action|Crime|Thriller,3.61775,Heat (1995),Action|Crime|Thriller
4,7,599,2.5,1498514161,Sabrina (1995),Comedy|Romance,2.398053,Sabrina (1995),Comedy|Romance
5,9,599,1.5,1498504960,Sudden Death (1995),Action,1.63083,Sudden Death (1995),Action
6,10,599,3.5,1498500281,GoldenEye (1995),Action|Adventure|Thriller,3.024278,GoldenEye (1995),Action|Adventure|Thriller
7,11,599,2.5,1498516445,"American President, The (1995)",Comedy|Drama|Romance,2.746863,"American President, The (1995)",Comedy|Drama|Romance
8,12,599,1.5,1519181787,Dracula: Dead and Loving It (1995),Comedy|Horror,1.412271,Dracula: Dead and Loving It (1995),Comedy|Horror
9,15,599,1.5,1519239842,Cutthroat Island (1995),Action|Adventure|Romance,2.121464,Cutthroat Island (1995),Action|Adventure|Romance


In [58]:
from pyspark.sql.functions import lit

# Recommend to the user movies he has not rated yet
def recommend(user, n):
    
    # Create a dataframe with all the movies not rated by the user
    user_movies = user_movie_ratings.filter(user_movie_ratings.userId == user)
    user_rated_movies = user_movies.select('movieId').distinct()
    all_movies = user_movie_ratings.select('movieId').distinct()
    movies_to_recommend = all_movies.subtract(user_rated_movies)
    
    # Create a dataframe with the user and the movies to recommend
    user_to_recommend = movies_to_recommend.withColumn('userId', lit(user))
    print(f'User {user} has not seen {user_to_recommend.select("movieId").distinct().count()} movies')

    # Get the recommendations
    recommendations = rec_model.transform(user_to_recommend)
    print(type(recommendations))

    # Join the recommendations with the movies dataframe
    recommendations = recommendations.join(movies, on='movieId', how='inner')

    # Sort the recommendations by the highest prediction
    recommendations = recommendations.sort('prediction', ascending=False)
    # recommendations = recommendations.orderBy('prediction', ascending=False)

    return recommendations.limit(n).toPandas()

In [59]:
recommend(user, n)

User 599 has not seen 7246 movies
<class 'pyspark.sql.dataframe.DataFrame'>


Unnamed: 0,movieId,userId,prediction,title,genres
0,955,599,4.450887,Bringing Up Baby (1938),Comedy|Romance
1,25771,599,4.426465,"Andalusian Dog, An (Chien andalou, Un) (1929)",Fantasy
2,7034,599,4.311224,Show Me Love (Fucking Åmål) (1998),Drama|Romance
3,2693,599,4.248898,Trekkies (1997),Documentary
4,3814,599,4.244041,Love and Death (1975),Comedy
5,176,599,4.224238,Living in Oblivion (1995),Comedy
6,6380,599,4.196019,Capturing the Friedmans (2003),Documentary
7,1241,599,4.148584,Dead Alive (Braindead) (1992),Comedy|Fantasy|Horror
8,1272,599,4.104945,Patton (1970),Drama|War
9,2132,599,4.07864,Who's Afraid of Virginia Woolf? (1966),Drama


In [60]:
spark.stop()

: 