In [1]:
from pyspark.sql import SparkSession
# Attach to the already-running Spark session, or start a new one if there is none.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

# Register a checkpoint directory on the SparkContext.
# Rationale is in the linked answer: https://stackoverflow.com/a/31484461
spark.sparkContext.setCheckpointDir('checkpoint/')

In [None]:
!wget http://files.grouplens.org/datasets/movielens/ml-latest-small.zip
!unzip ml-latest-small.zip

In [2]:
!head -10 'ml-latest-small/ratings.csv'

userId,movieId,rating,timestamp
1,1,4.0,964982703
1,3,4.0,964981247
1,6,4.0,964982224
1,47,5.0,964983815
1,50,5.0,964982931
1,70,3.0,964982400
1,101,5.0,964980868
1,110,4.0,964982176
1,151,5.0,964984041


In [3]:
!head -10 'ml-latest-small/movies.csv'

movieId,title,genres
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy
6,Heat (1995),Action|Crime|Thriller
7,Sabrina (1995),Comedy|Romance
8,Tom and Huck (1995),Adventure|Children
9,Sudden Death (1995),Action


In [4]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder
from pyspark.sql.types import *
import pyspark.sql.functions as F


# Explicit schema for ratings.csv. The column names declared here (user, movie)
# are the ones the loaded DataFrame exposes, not the CSV header's userId/movieId.
ratings_schema = (
    StructType()
    .add('user', IntegerType())
    .add('movie', IntegerType())
    .add('rating', DoubleType())
    .add('timestamp', LongType())
)

# Explicit schema for movies.csv.
movies_schema = (
    StructType()
    .add('movie', IntegerType())
    .add('title', StringType())
    .add('genres', StringType())
)


# Ratings: read with the schema above and keep everything except the timestamp.
ratings = (
    spark.read
    .schema(ratings_schema)
    .option('header', True)
    .csv('ml-latest-small/ratings.csv')
    .select('user', 'movie', 'rating')
)

# Movies: read with the schema above; all three columns are kept.
movies = (
    spark.read
    .schema(movies_schema)
    .option('header', True)
    .csv('ml-latest-small/movies.csv')
    .select('movie', 'title', 'genres')
)

# Quick visual sanity check of both frames (first 20 rows each).
ratings.show()
movies.show()

+----+-----+------+
|user|movie|rating|
+----+-----+------+
|   1|    1|   4.0|
|   1|    3|   4.0|
|   1|    6|   4.0|
|   1|   47|   5.0|
|   1|   50|   5.0|
|   1|   70|   3.0|
|   1|  101|   5.0|
|   1|  110|   4.0|
|   1|  151|   5.0|
|   1|  157|   5.0|
|   1|  163|   5.0|
|   1|  216|   5.0|
|   1|  223|   3.0|
|   1|  231|   5.0|
|   1|  235|   4.0|
|   1|  260|   5.0|
|   1|  296|   3.0|
|   1|  316|   3.0|
|   1|  333|   5.0|
|   1|  349|   4.0|
+----+-----+------+
only showing top 20 rows

+-----+--------------------+--------------------+
|movie|               title|              genres|
+-----+--------------------+--------------------+
|    1|    Toy Story (1995)|Adventure|Animati...|
|    2|      Jumanji (1995)|Adventure|Childre...|
|    3|Grumpier Old Men ...|      Comedy|Romance|
|    4|Waiting to Exhale...|Comedy|Drama|Romance|
|    5|Father of the Bri...|              Comedy|
|    6|         Heat (1995)|Action|Crime|Thri...|
|    7|      Sabrina (1995)|      Comedy|Rom

In [6]:
# Parameter tuning, the held-out test and cross-validation live in a separate file;
# this cell only refits the model on the complete `ratings` frame.

# Settings passed to ALS alongside `best_params`: column names, cold-start
# handling, the non-negativity constraint and the random seed.
param_fixed = {
    'userCol'          : 'user',
    'itemCol'          : 'movie',
    'ratingCol'        : 'rating',
    'coldStartStrategy': 'drop',
    'nonnegative'      : True,
    # Pin the seed so that Restart & Run All refits the same model; without an
    # explicit seed the fit is not guaranteed to repeat across kernel restarts.
    'seed'             : 42,
}

# Hard-coded hyper-parameters (presumably the winners of the tuning run mentioned above).
best_params = {'rank': 7, 'maxIter': 20, 'regParam': 0.18}

# NOTE: despite its name, `als` is the fitted model returned by .fit(), not the ALS estimator.
als = ALS(**best_params, **param_fixed).fit(ratings)

In [7]:
def prettify_watched(user, movies_df, ratings_df):
    """Return the ratings given by one user, enriched with movie metadata.

    Parameters
    ----------
    user
        Value compared for equality against ``ratings_df.user``.
    movies_df
        DataFrame providing the ``movie``, ``title`` and ``genres`` columns.
    ratings_df
        DataFrame providing the ``user``, ``movie`` and ``rating`` columns.

    Returns
    -------
    DataFrame
        Columns ``user, rating, movie, title, genres``, sorted by ``rating``
        in descending order. Rows are matched to ``movies_df`` on ``movie``.
    """
    # Keep only the rows that belong to the requested user.
    user_ratings = ratings_df.filter(ratings_df.user == user)

    # Attach title/genres through the shared `movie` column.
    with_titles = user_ratings.join(movies_df, 'movie')

    wanted_columns = [
        user_ratings.user,
        user_ratings.rating,
        user_ratings.movie,
        movies_df.title,
        movies_df.genres,
    ]
    return with_titles.select(*wanted_columns).orderBy('rating', ascending=False)



def prettify_recommendation(r, movies_df):
    """Flatten a recommendations frame into one readable row per suggested movie.

    Parameters
    ----------
    r
        DataFrame with a ``user`` column and a ``recommendations`` array column
        whose elements expose ``movie`` and ``rating`` fields.
    movies_df
        DataFrame joined on ``movie`` to attach its remaining columns
        (title and genres in this notebook).

    Returns
    -------
    DataFrame
        One row per (user, recommended movie) carrying ``movie``, ``user``,
        ``rating_prediction`` and the columns of ``movies_df``, sorted by
        ``user`` and then by ``rating_prediction`` in descending order.
    """
    # One output row per element of the `recommendations` array.
    exploded = r.withColumn('recommendations', F.explode(r.recommendations))

    # Pull the struct fields up into plain top-level columns.
    flat = exploded.select(
        exploded.user,
        exploded.recommendations.movie.alias('movie'),
        exploded.recommendations.rating.alias('rating_prediction'),
    )

    # A join gives no ordering guarantee, so sort explicitly: best prediction
    # first within each user, mirroring how prettify_watched sorts by rating.
    return flat.join(movies_df, 'movie').orderBy(
        ['user', 'rating_prediction'], ascending=[True, False]
    )
    

def recommend(model, user, movies_df, n_movies=10):
    """Recommend movies for a single user and return them in readable form.

    Parameters
    ----------
    model
        Object exposing ``recommendForUserSubset(dataset=..., numItems=...)``;
        in this notebook, the fitted ALS model.
    user
        Id placed in the one-row ``user`` frame handed to the model.
    movies_df
        Movie metadata forwarded to ``prettify_recommendation``.
    n_movies
        Number of items requested from the model (default 10).

    Returns
    -------
    DataFrame
        Whatever ``prettify_recommendation`` produces for the model's output.

    Notes
    -----
    Uses the notebook-level ``spark`` session to build the one-row user frame.
    """
    # Single-row, single-column frame naming the user to score.
    single_user = spark.createDataFrame(data=[[user]], schema=['user'])

    raw_recommendations = model.recommendForUserSubset(
        dataset=single_user,
        numItems=n_movies,
    )
    return prettify_recommendation(raw_recommendations, movies_df)

In [8]:
# Example user to inspect.
user = 34

# What this user has already rated, highest ratings first.
prettify_watched(
    user=user,
    ratings_df=ratings,
    movies_df=movies,
).show(truncate=25)

# 30 recommendations for the same user from the fitted model.
recommend(
    model=als,
    user=user,
    movies_df=movies,
    n_movies=30,
).show(truncate=25)

+----+------+-----+-------------------------+-------------------------+
|user|rating|movie|                    title|                   genres|
+----+------+-----+-------------------------+-------------------------+
|  34|   5.0|   10|         GoldenEye (1995)|Action|Adventure|Thriller|
|  34|   5.0| 5349|        Spider-Man (2002)|Action|Adventure|Sci-F...|
|  34|   5.0| 3996|Crouching Tiger, Hidde...|     Action|Drama|Romance|
|  34|   5.0| 2273|         Rush Hour (1998)|Action|Comedy|Crime|Th...|
|  34|   5.0| 6874| Kill Bill: Vol. 1 (2003)|    Action|Crime|Thriller|
|  34|   5.0| 8636|      Spider-Man 2 (2004)|Action|Adventure|Sci-F...|
|  34|   5.0|27801|Ong-Bak: The Thai Warr...|          Action|Thriller|
|  34|   5.0|45499|X-Men: The Last Stand ...|   Action|Sci-Fi|Thriller|
|  34|   5.0| 6333|  X2: X-Men United (2003)|Action|Adventure|Sci-F...|
|  34|   5.0| 6539|Pirates of the Caribbe...|Action|Adventure|Comed...|
|  34|   5.0| 3793|             X-Men (2000)|  Action|Adventure|

In [None]:
# TODO: try the batch variant, e.g. als.recommendForAllUsers(10), to get recommendations for every user at once.