In [1]:
#!pip install pyspark

In [2]:
#!pip install setuptools

In [3]:
# se usiamo master/worker locali non usare

# import findspark
# findspark.init()

In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName('RecommendationsBig') \
    .master('spark://192.168.56.1:7077') \
    .config('spark.driver.memory', '2g') \
    .config('spark.executor.instances', '2') \
    .config('spark.executor.memory', '3g') \
    .config("spark.locality.wait.node", "0") \
    .config("spark.hadoop.fs.defaultFS", "hdfs://localhost:50054") \
    .getOrCreate()
    # .config('spark.executor.instances', '2') \

In [2]:
import pandas as pd
import numpy as np

# ratings = spark.read.csv('hdfs://localhost:50054/test/ratings.csv', inferSchema=True, header=True)
ratings = spark.read.csv('hdfs://localhost:50054/test/ratingsBig.csv', inferSchema=True, header=True)
ratings.limit(5).toPandas()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,1225734739
1,1,110,4.0,1225865086
2,1,158,4.0,1225733503
3,1,260,4.5,1225735204
4,1,356,5.0,1225735119


In [3]:
# movies = spark.read.csv('hdfs://localhost:50054/test/movies.csv', inferSchema=True, header=True)
movies = spark.read.csv('hdfs://localhost:50054/test/moviesBig.csv', inferSchema=True, header=True)
movies.limit(5).toPandas()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
user_movie_ratings = ratings.join(movies, on='movieId', how='inner')
user_movie_ratings.limit(5).toPandas()

Unnamed: 0,movieId,userId,rating,timestamp,title,genres
0,1,1,4.0,1225734739,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,110,1,4.0,1225865086,Braveheart (1995),Action|Drama|War
2,158,1,4.0,1225733503,Casper (1995),Adventure|Children
3,260,1,4.5,1225735204,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi
4,356,1,5.0,1225735119,Forrest Gump (1994),Comedy|Drama|Romance|War


In [5]:
user_movie_ratings.printSchema()

root
 |-- movieId: integer (nullable = true)
 |-- userId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)



In [6]:
# Optimizing the data
ratings = 1
# From the ALS documentation, genres and timestamps do not have an impact on the recommendations
user_movie_ratings = user_movie_ratings.drop('genres')
user_movie_ratings = user_movie_ratings.drop('timestamp')
# user_movie_ratings = user_movie_ratings.repartition(20).cache()

In [17]:
# Print out all the rows of a certain user
test_user = 2500
user_movie_ratings.filter(user_movie_ratings.userId == test_user).show()
print(user_movie_ratings.filter(user_movie_ratings.userId == test_user).collect())
print(len(user_movie_ratings.filter(user_movie_ratings.userId == test_user).collect()))

+-------+------+------+--------------------+
|movieId|userId|rating|               title|
+-------+------+------+--------------------+
|      5|  2500|   3.0|Father of the Bri...|
|     50|  2500|   5.0|Usual Suspects, T...|
|    145|  2500|   5.0|     Bad Boys (1995)|
|    196|  2500|   4.0|      Species (1995)|
|    260|  2500|   5.0|Star Wars: Episod...|
|    318|  2500|   5.0|Shawshank Redempt...|
|    333|  2500|   5.0|    Tommy Boy (1995)|
|    527|  2500|   4.0|Schindler's List ...|
|    593|  2500|   5.0|Silence of the La...|
|    858|  2500|   5.0|Godfather, The (1...|
|    912|  2500|   5.0|   Casablanca (1942)|
|   1193|  2500|   4.0|One Flew Over the...|
|   1196|  2500|   5.0|Star Wars: Episod...|
|   1213|  2500|   5.0|   Goodfellas (1990)|
|   1221|  2500|   5.0|Godfather: Part I...|
|   1302|  2500|   5.0|Field of Dreams (...|
|   1918|  2500|   4.0|Lethal Weapon 4 (...|
|   1954|  2500|   5.0|        Rocky (1976)|
|   2001|  2500|   4.0|Lethal Weapon 2 (...|
|   2003| 

In [7]:
train, test = user_movie_ratings.randomSplit([0.8, 0.2], seed=40) # <- manca seed

In [8]:
from pyspark.ml.recommendation import ALS

rec = ALS(maxIter=10, 
          regParam=0.01, 
          userCol='userId', 
          itemCol='movieId', 
          ratingCol='rating', 
          nonnegative=True,
          coldStartStrategy='drop')

rec_model = rec.fit(train)
# 50s

In [9]:
predicted_ratings = rec_model.transform(test)
predicted_ratings.limit(5).toPandas()
# Tasks 10/236
# 26s

Unnamed: 0,movieId,userId,rating,title,prediction
0,135,28,1.0,Down Periscope (1996),1.35368
1,593,28,4.0,"Silence of the Lambs, The (1991)",3.236991
2,3033,28,3.0,Spaceballs (1987),2.201962
3,3516,28,3.0,"Bell, Book and Candle (1958)",4.537934
4,6,31,2.0,Heat (1995),2.934491


In [10]:
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(metricName='rmse',
                                predictionCol='prediction',
                                labelCol='rating')

rmse = evaluator.evaluate(predicted_ratings)
print(rmse)
# Tasks 10/236
# 18s

0.8195305709045241


In [13]:
# user_movie_ratings = user_movie_ratings.withColumn("userId", user_movie_ratings["userId"].cast("int"))
# user_movie_ratings = user_movie_ratings.repartition("userId")

In [11]:
def test_ALS(user, n):
    # print(user_movie_ratings.count(), len(user_movie_ratings.columns)) # -> 33832162 6
    user_movies = user_movie_ratings.filter(user_movie_ratings.userId == user) # <- questo filtraggio non funziona correttamente con il dataset grande
    # capire se dipende da esecutori, parallelizzazione, boh altro
    
    # print(user_movies.count(), len(user_movies.columns)) # <- per utente 599 sono 36, anche se dovrebbero essere almeno 800
    # print(type(user_movies)) # -> <class 'pyspark.sql.dataframe.DataFrame'>
    
    print(f'User {user} has rated {user_movies.select('movieId').distinct().count()} movies') # <- is not counting all the movies rated by the user
    recommendations = rec_model.transform(user_movies)
    recommendations = recommendations.sort('prediction', ascending=False)
    recommendations = recommendations.join(movies, on='movieId', how='inner')
    return recommendations.limit(n).toPandas()

In [18]:
user = 600
n = 10
test_ALS(user, n)
# 14s

57 4
User 600 has rated 57 movies


Unnamed: 0,movieId,userId,rating,title,prediction,title.1,genres
0,110,600,3.0,Braveheart (1995),3.317973,Braveheart (1995),Action|Drama|War
1,260,600,2.0,Star Wars: Episode IV - A New Hope (1977),3.353302,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi
2,318,600,5.0,"Shawshank Redemption, The (1994)",4.105093,"Shawshank Redemption, The (1994)",Crime|Drama
3,527,600,3.0,Schindler's List (1993),4.031648,Schindler's List (1993),Drama|War
4,1036,600,5.0,Die Hard (1988),3.687278,Die Hard (1988),Action|Crime|Thriller
5,1196,600,2.0,Star Wars: Episode V - The Empire Strikes Back...,3.391547,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi
6,1198,600,5.0,Raiders of the Lost Ark (Indiana Jones and the...,3.684068,Raiders of the Lost Ark (Indiana Jones and the...,Action|Adventure
7,1210,600,1.5,Star Wars: Episode VI - Return of the Jedi (1983),3.501064,Star Wars: Episode VI - Return of the Jedi (1983),Action|Adventure|Sci-Fi
8,1259,600,2.0,Stand by Me (1986),3.447681,Stand by Me (1986),Adventure|Drama
9,1291,600,5.0,Indiana Jones and the Last Crusade (1989),3.513201,Indiana Jones and the Last Crusade (1989),Action|Adventure


In [19]:
from pyspark.sql.functions import lit

# Recommend to the user movies he has not rated yet
def recommend(user, n):
    
    # Create a dataframe with all the movies not rated by the user
    user_movies = user_movie_ratings.filter(user_movie_ratings.userId == user)
    user_rated_movies = user_movies.select('movieId').distinct()
    all_movies = user_movie_ratings.select('movieId').distinct()
    movies_to_recommend = all_movies.subtract(user_rated_movies)
    
    # Create a dataframe with the user and the movies to recommend
    user_to_recommend = movies_to_recommend.withColumn('userId', lit(user))
    print(f'User {user} has not seen {user_to_recommend.select("movieId").distinct().count()} movies')

    # Get the recommendations
    recommendations = rec_model.transform(user_to_recommend)
    # print(type(recommendations))

    # Join the recommendations with the movies dataframe
    recommendations = recommendations.join(movies, on='movieId', how='inner')

    # Sort the recommendations by the highest prediction
    recommendations = recommendations.sort('prediction', ascending=False)
    # recommendations = recommendations.orderBy('prediction', ascending=False)

    return recommendations.limit(n).toPandas()

In [None]:
recommend(user, n)
#22s

User 600 has not seen 83182 movies


Unnamed: 0,movieId,userId,prediction,title,genres
0,229047,600,30.561968,Trial by Fire (2008),Action|Drama|Thriller
1,287013,600,25.975834,Elso,(no genres listed)
2,288811,600,21.435333,Food and Country (2023),Documentary
3,288805,600,19.291801,King Coal (2023),Documentary
4,152711,600,17.96475,Who Killed Chea Vichea? (2010),Documentary
5,189325,600,17.458689,The Change,Drama
6,288803,600,17.148268,Firebrand (2023),Drama
7,288807,600,17.148268,Monster (2023),Drama
8,182759,600,17.133924,Sleep Has Her House (2017),Mystery
9,280052,600,16.320442,Prince (2022),Comedy


In [21]:
spark.stop()