In [1]:
import os
os.environ['PYSPARK_PYTHON']        = '/usr/local/bin/python3'
os.environ['PYSPARK_DRIVER_PYTHON'] = '/usr/local/bin/python3'
os.environ['JAVA_HOME']             = '/Library/Java/JavaVirtualMachines/jdk1.8.0_181.jdk/Contents/Home' # Java 10 is problematic

In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
spark.sparkContext.setCheckpointDir('checkpoint/') # https://stackoverflow.com/a/31484461

In [14]:
from pyspark.ml.recommendation import ALS
import pyspark.sql.functions as F

from helpers import make
# from helpers_my import make

In [6]:
ratings, movies = make('ratings', 'movies', spark=spark)
ratings.show(10)
movies.show(10)

+----+-----+------+
|user|movie|rating|
+----+-----+------+
|   1|    1|   4.0|
|   1|    3|   4.0|
|   1|    6|   4.0|
|   1|   47|   5.0|
|   1|   50|   5.0|
|   1|   70|   3.0|
|   1|  101|   5.0|
|   1|  110|   4.0|
|   1|  151|   5.0|
|   1|  157|   5.0|
+----+-----+------+
only showing top 10 rows

+-----+--------------------+--------------------+
|movie|               title|              genres|
+-----+--------------------+--------------------+
|    1|    Toy Story (1995)|Adventure|Animati...|
|    2|      Jumanji (1995)|Adventure|Childre...|
|    3|Grumpier Old Men ...|      Comedy|Romance|
|    4|Waiting to Exhale...|Comedy|Drama|Romance|
|    5|Father of the Bri...|              Comedy|
|    6|         Heat (1995)|Action|Crime|Thri...|
|    7|      Sabrina (1995)|      Comedy|Romance|
|    8| Tom and Huck (1995)|  Adventure|Children|
|    9| Sudden Death (1995)|              Action|
|   10|    GoldenEye (1995)|Action|Adventure|...|
+-----+--------------------+----------------

In [9]:
param_fixed = {
    'userCol'          : 'user'  , 
    'itemCol'          : 'movie' ,
    'ratingCol'        : 'rating',
    'coldStartStrategy': 'drop'  ,
    'nonnegative'      : True    ,
    'implicitPrefs'    : True    ,
    'maxIter'          : 20      ,
}

# some params: (see params tuning, CV in param-tuning.ipynb)
best_params = {'rank': 7, 'regParam': 0.18, 'alpha': 1.0}
als = ALS(**best_params, **param_fixed).fit(ratings)

In [74]:
user = 34

user_history = ratings[ratings.user == user] \
    .join(movies, on='movie')                \
    .orderBy('rating', ascending=False)

user_history.show(truncate=25)

+-----+----+------+-------------------------+-------------------------+
|movie|user|rating|                    title|                   genres|
+-----+----+------+-------------------------+-------------------------+
|   10|  34|   5.0|         GoldenEye (1995)|Action|Adventure|Thriller|
| 5349|  34|   5.0|        Spider-Man (2002)|Action|Adventure|Sci-F...|
| 3996|  34|   5.0|Crouching Tiger, Hidde...|     Action|Drama|Romance|
| 2273|  34|   5.0|         Rush Hour (1998)|Action|Comedy|Crime|Th...|
| 6874|  34|   5.0| Kill Bill: Vol. 1 (2003)|    Action|Crime|Thriller|
| 8636|  34|   5.0|      Spider-Man 2 (2004)|Action|Adventure|Sci-F...|
|27801|  34|   5.0|Ong-Bak: The Thai Warr...|          Action|Thriller|
|45499|  34|   5.0|X-Men: The Last Stand ...|   Action|Sci-Fi|Thriller|
| 6333|  34|   5.0|  X2: X-Men United (2003)|Action|Adventure|Sci-F...|
| 6539|  34|   5.0|Pirates of the Caribbe...|Action|Adventure|Comed...|
| 3793|  34|   5.0|             X-Men (2000)|  Action|Adventure|

In [68]:
def recommend(als_model, user, user_history, n_movies=10):
    
    # # recommendForAllUsers API
    # r = als.recommendForAllUsers(numItems=30)
    # r = r[r.user == user]
    
    # recommendForUserSubset API (pyspark 2.3+)

        dataset = spark.createDataFrame(
            data   = [[user]], 
            schema = ['user']
        ), 
        numItems = n_movies
    )

    r_exp = r.withColumn('recommendations', F.explode(r.recommendations))
    return r_exp.select(
        r_exp.user,
        r_exp.recommendations.movie.alias('movie'),
        r_exp.recommendations.rating.alias('rating_prediction'),
    )

    useless_recs = user_recs \
        .join(user_history, on='movie') \
        .select(user_recs.user, user_recs.movie, user_recs.rating_prediction)
    return user_recs.subtract(useless_recs)

In [75]:
user_recs = recommend(als_model=als, user=user, user_history=user_history, n_movies=30)

In [76]:
user_recs.join(movies, on='movie').show(truncate=25)

+-----+----+-----------------+-------------------------+-------------------------+
|movie|user|rating_prediction|                    title|                   genres|
+-----+----+-----------------+-------------------------+-------------------------+
| 2571|  34|         0.715771|       Matrix, The (1999)|   Action|Sci-Fi|Thriller|
| 2028|  34|       0.64082247|Saving Private Ryan (1...|         Action|Drama|War|
| 1196|  34|       0.63664347|Star Wars: Episode V -...|  Action|Adventure|Sci-Fi|
|  260|  34|        0.6326851|Star Wars: Episode IV ...|  Action|Adventure|Sci-Fi|
| 1198|  34|        0.6230483|Raiders of the Lost Ar...|         Action|Adventure|
| 2959|  34|       0.62268955|        Fight Club (1999)|Action|Crime|Drama|Thr...|
| 4993|  34|       0.61654496|Lord of the Rings: The...|        Adventure|Fantasy|
| 3578|  34|       0.61596286|         Gladiator (2000)|   Action|Adventure|Drama|
| 1210|  34|        0.6075755|Star Wars: Episode VI ...|  Action|Adventure|Sci-Fi|
| 59