In [1]:
#!pip install pyspark

In [2]:
#!pip install setuptools

In [3]:
# se usiamo master/worker locali non usare

# import findspark
# findspark.init()

In [4]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName('RecommendationsBig') \
    .master('spark://192.168.56.1:7077') \
    .config('spark.driver.memory', '1g') \
    .config('spark.executor.instances', '2') \
    .config('spark.executor.memory', '3g') \
    .config("spark.locality.wait.node", "0") \
    .config("spark.hadoop.fs.defaultFS", "hdfs://localhost:50054") \
    .getOrCreate()
    # .config('spark.executor.instances', '2') \

In [5]:
import pandas as pd
import numpy as np

ratings = spark.read.csv('hdfs://localhost:50054/test/ratings.csv', inferSchema=True, header=True)
# ratings = spark.read.csv('hdfs://localhost:50054/test/ratingsBig.csv', inferSchema=True, header=True)
ratings.limit(5).toPandas()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [6]:
movies = spark.read.csv('hdfs://localhost:50054/test/movies.csv', inferSchema=True, header=True)
# movies = spark.read.csv('hdfs://localhost:50054/test/moviesBig.csv', inferSchema=True, header=True)
movies.limit(5).toPandas()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [7]:
user_movie_ratings = ratings.join(movies, on='movieId', how='inner')
user_movie_ratings.limit(5).toPandas()

Unnamed: 0,movieId,userId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,3,1,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance
2,6,1,4.0,964982224,Heat (1995),Action|Crime|Thriller
3,47,1,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,50,1,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [8]:
user_movie_ratings.printSchema()

root
 |-- movieId: integer (nullable = true)
 |-- userId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)



In [9]:
train, test = user_movie_ratings.randomSplit([0.8, 0.2], seed=40) # <- manca seed

In [10]:
from pyspark.ml.recommendation import ALS

rec = ALS(maxIter=10, 
          regParam=0.01, 
          userCol='userId', 
          itemCol='movieId', 
          ratingCol='rating', 
          nonnegative=True,
          coldStartStrategy='drop')

rec_model = rec.fit(train) # <- serve mettere collect sul train?
# 50s

In [11]:
predicted_ratings = rec_model.transform(test)
predicted_ratings.limit(5).toPandas()
# Tasks 10/236
# 26s

Unnamed: 0,movieId,userId,rating,timestamp,title,genres,prediction
0,296,463,4.0,1145460490,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,4.246272
1,527,463,4.0,1145460304,Schindler's List (1993),Drama|War,3.961171
2,3668,496,3.0,1415165655,Romeo and Juliet (1968),Drama|Romance,3.542641
3,4191,496,2.5,1415166210,Alfie (1966),Comedy|Drama|Romance,2.626679
4,4223,463,4.0,1145460361,Enemy at the Gates (2001),Drama|War,3.600543


In [12]:
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(metricName='rmse',
                                predictionCol='prediction',
                                labelCol='rating')

rmse = evaluator.evaluate(predicted_ratings)
print(rmse)
# Tasks 10/236
# 18s

1.0278105937720876


In [13]:
# user_movie_ratings = user_movie_ratings.withColumn("userId", user_movie_ratings["userId"].cast("int"))
# user_movie_ratings = user_movie_ratings.repartition("userId")

In [14]:
# user = 599
# n = 10

# # print(user_movie_ratings.count(), len(user_movie_ratings.columns)) # -> 33832162 6
# user_movies = user_movie_ratings.filter(user_movie_ratings.userId == user)
# print(user_movies.count(), len(user_movies.columns))
# # print(type(user_movies)) # -> <class 'pyspark.sql.dataframe.DataFrame'>

# # user_movies = user_movie_ratings.filter(user_movie_ratings.userId == user).collect()
# # print(type(user_movies)) -> <class 'list'>
# print(user_movies.select('movieId').count())
# print(user_movies.select('movieId').distinct())
# print(f'User {user} has rated {user_movies.select('movieId').distinct().count()} movies') # <- is not counting all the movies rated by the user
# recommendations = rec_model.transform(user_movies)
# recommendations = recommendations.sort('prediction', ascending=False)
# recommendations = recommendations.join(movies, on='movieId', how='inner')
# recommendations.limit(n).toPandas()

In [15]:
def test_ALS(user, n):
    # print(user_movie_ratings.count(), len(user_movie_ratings.columns)) # -> 33832162 6
    user_movies = user_movie_ratings.filter(user_movie_ratings.userId == user) # <- questo filtraggio non funziona correttamente con il dataset grande
    # capire se dipende da esecutori, parallelizzazione, boh altro
    print(user_movies.count(), len(user_movies.columns))
    # print(type(user_movies)) # -> <class 'pyspark.sql.dataframe.DataFrame'>
    
    # user_movies = user_movie_ratings.filter(user_movie_ratings.userId == user).collect()
    # print(type(user_movies)) -> <class 'list'>
    
    print(f'User {user} has rated {user_movies.select('movieId').distinct().count()} movies') # <- is not counting all the movies rated by the user
    recommendations = rec_model.transform(user_movies)
    recommendations = recommendations.sort('prediction', ascending=False)
    recommendations = recommendations.join(movies, on='movieId', how='inner')
    return recommendations.limit(n).toPandas()

In [16]:
user = 599
n = 10
test_ALS(user, n)

2478 6
User 599 has rated 2478 movies


Unnamed: 0,movieId,userId,rating,timestamp,title,genres,prediction,title.1,genres.1
0,471,599,2.5,1498518822,"Hudsucker Proxy, The (1994)",Comedy,2.546143,"Hudsucker Proxy, The (1994)",Comedy
1,833,599,1.5,1519330029,High School High (1996),Comedy,1.48271,High School High (1996),Comedy
2,1088,599,2.5,1498515232,Dirty Dancing (1987),Drama|Musical|Romance,2.055951,Dirty Dancing (1987),Drama|Musical|Romance
3,1342,599,2.5,1498516773,Candyman (1992),Horror|Thriller,2.855042,Candyman (1992),Horror|Thriller
4,1580,599,3.0,1498525900,Men in Black (a.k.a. MIB) (1997),Action|Comedy|Sci-Fi,3.107757,Men in Black (a.k.a. MIB) (1997),Action|Comedy|Sci-Fi
5,1591,599,2.5,1498514765,Spawn (1997),Action|Adventure|Sci-Fi|Thriller,2.179976,Spawn (1997),Action|Adventure|Sci-Fi|Thriller
6,1645,599,3.0,1498525684,The Devil's Advocate (1997),Drama|Mystery|Thriller,2.31422,The Devil's Advocate (1997),Drama|Mystery|Thriller
7,1959,599,3.0,1519139634,Out of Africa (1985),Drama|Romance,3.258852,Out of Africa (1985),Drama|Romance
8,2366,599,3.0,1519122035,King Kong (1933),Action|Adventure|Fantasy|Horror,3.625356,King Kong (1933),Action|Adventure|Fantasy|Horror
9,3175,599,3.0,1498522138,Galaxy Quest (1999),Adventure|Comedy|Sci-Fi,3.313814,Galaxy Quest (1999),Adventure|Comedy|Sci-Fi


In [17]:
from pyspark.sql.functions import lit

# Recommend to the user movies he has not rated yet
def recommend(user, n):
    
    # Create a dataframe with all the movies not rated by the user
    user_movies = user_movie_ratings.filter(user_movie_ratings.userId == user)
    user_rated_movies = user_movies.select('movieId').distinct()
    all_movies = user_movie_ratings.select('movieId').distinct()
    movies_to_recommend = all_movies.subtract(user_rated_movies)
    
    # Create a dataframe with the user and the movies to recommend
    user_to_recommend = movies_to_recommend.withColumn('userId', lit(user))
    print(f'User {user} has not seen {user_to_recommend.select("movieId").distinct().count()} movies')

    # Get the recommendations
    recommendations = rec_model.transform(user_to_recommend)
    # print(type(recommendations))

    # Join the recommendations with the movies dataframe
    recommendations = recommendations.join(movies, on='movieId', how='inner')

    # Sort the recommendations by the highest prediction
    recommendations = recommendations.sort('prediction', ascending=False)
    # recommendations = recommendations.orderBy('prediction', ascending=False)

    return recommendations.limit(n).toPandas()

In [18]:
recommend(user, n)

User 599 has not seen 7246 movies


Unnamed: 0,movieId,userId,prediction,title,genres
0,2843,599,4.812122,"Black Cat, White Cat (Crna macka, beli macor) ...",Comedy|Romance
1,306,599,4.639181,Three Colors: Red (Trois couleurs: Rouge) (1994),Drama
2,3451,599,4.627817,Guess Who's Coming to Dinner (1967),Drama
3,905,599,4.390348,It Happened One Night (1934),Comedy|Romance
4,945,599,4.205558,Top Hat (1935),Comedy|Musical|Romance
5,7063,599,4.183002,"Aguirre: The Wrath of God (Aguirre, der Zorn G...",Adventure|Drama
6,1683,599,4.172557,"Wings of the Dove, The (1997)",Drama|Romance
7,3083,599,4.167095,All About My Mother (Todo sobre mi madre) (1999),Drama
8,3983,599,4.162676,You Can Count on Me (2000),Drama|Romance
9,417,599,4.15489,Barcelona (1994),Comedy|Romance


In [19]:
spark.stop()