In [13]:
import os
os.environ['PYSPARK_PYTHON']        = '/usr/local/bin/python3'
os.environ['PYSPARK_DRIVER_PYTHON'] = '/usr/local/bin/python3'
os.environ['JAVA_HOME']             = '/Library/Java/JavaVirtualMachines/jdk1.8.0_181.jdk/Contents/Home' # Java 10 is problematic

In [14]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
spark.sparkContext.setCheckpointDir('checkpoint/') # https://stackoverflow.com/a/31484461

In [15]:
from pyspark.ml.recommendation import ALS
import pyspark.sql.functions as F

In [16]:
from helpers import make
# from helpers_my import make

In [17]:
ratings, movies = make('ratings', 'movies', spark=spark)
ratings.show(10)
movies.show(10)

+----+-----+------+
|user|movie|rating|
+----+-----+------+
|   1|    1|   4.0|
|   1|    3|   4.0|
|   1|    6|   4.0|
|   1|   47|   5.0|
|   1|   50|   5.0|
|   1|   70|   3.0|
|   1|  101|   5.0|
|   1|  110|   4.0|
|   1|  151|   5.0|
|   1|  157|   5.0|
+----+-----+------+
only showing top 10 rows

+-----+--------------------+--------------------+
|movie|               title|              genres|
+-----+--------------------+--------------------+
|    1|    Toy Story (1995)|Adventure|Animati...|
|    2|      Jumanji (1995)|Adventure|Childre...|
|    3|Grumpier Old Men ...|      Comedy|Romance|
|    4|Waiting to Exhale...|Comedy|Drama|Romance|
|    5|Father of the Bri...|              Comedy|
|    6|         Heat (1995)|Action|Crime|Thri...|
|    7|      Sabrina (1995)|      Comedy|Romance|
|    8| Tom and Huck (1995)|  Adventure|Children|
|    9| Sudden Death (1995)|              Action|
|   10|    GoldenEye (1995)|Action|Adventure|...|
+-----+--------------------+----------------

In [18]:
param_fixed = {
    'userCol'          : 'user'  , 
    'itemCol'          : 'movie' ,
    'ratingCol'        : 'rating',
    'coldStartStrategy': 'drop'  ,
    'nonnegative'      : True    ,
    'implicitPrefs'    : True    ,
    'maxIter'          : 20      ,
}

# some params: (see params tuning, CV in param-tuning.ipynb)
best_params = {'rank': 7, 'regParam': 0.18, 'alpha': 1.0}
als = ALS(**best_params, **param_fixed).fit(ratings)

In [23]:
user = 34
# user = 2177929

user_history = ratings[ratings.user == user] \
    .join(movies, on='movie')                \
    .orderBy('rating', ascending=False)

user_history.show(truncate=25)

+-----+----+------+-------------------------+-------------------------+
|movie|user|rating|                    title|                   genres|
+-----+----+------+-------------------------+-------------------------+
|   10|  34|   5.0|         GoldenEye (1995)|Action|Adventure|Thriller|
| 5349|  34|   5.0|        Spider-Man (2002)|Action|Adventure|Sci-F...|
| 3996|  34|   5.0|Crouching Tiger, Hidde...|     Action|Drama|Romance|
| 2273|  34|   5.0|         Rush Hour (1998)|Action|Comedy|Crime|Th...|
| 6874|  34|   5.0| Kill Bill: Vol. 1 (2003)|    Action|Crime|Thriller|
| 8636|  34|   5.0|      Spider-Man 2 (2004)|Action|Adventure|Sci-F...|
|27801|  34|   5.0|Ong-Bak: The Thai Warr...|          Action|Thriller|
|45499|  34|   5.0|X-Men: The Last Stand ...|   Action|Sci-Fi|Thriller|
| 6333|  34|   5.0|  X2: X-Men United (2003)|Action|Adventure|Sci-F...|
| 6539|  34|   5.0|Pirates of the Caribbe...|Action|Adventure|Comed...|
| 3793|  34|   5.0|             X-Men (2000)|  Action|Adventure|

In [24]:
def recommend(als_model, user, user_history, n_movies=10):
    
    # # recommendForAllUsers API
    # r = als.recommendForAllUsers(numItems=30)
    # r = r[r.user == user]
    
    # recommendForUserSubset API (pyspark 2.3+)
    r = als_model.recommendForUserSubset(
        dataset = spark.createDataFrame(
            data   = [[user]], 
            schema = ['user']
        ), 
        numItems = n_movies
    )

    r_exp = r.withColumn('recommendations', F.explode(r.recommendations))
    return r_exp.select(
        r_exp.user,
        r_exp.recommendations.movie.alias('movie'),
        r_exp.recommendations.rating.alias('rating_prediction'),
    )

    useless_recs = user_recs \
        .join(user_history, on='movie') \
        .select(user_recs.user, user_recs.movie, user_recs.rating_prediction)
    return user_recs.subtract(useless_recs)

In [25]:
user_recs = recommend(als_model=als, user=user, user_history=user_history, n_movies=30)

In [26]:
user_recs.join(movies, on='movie').show(truncate=25)

+-----+----+-----------------+-------------------------+-------------------------+
|movie|user|rating_prediction|                    title|                   genres|
+-----+----+-----------------+-------------------------+-------------------------+
| 4993|  34|        0.5404797|Lord of the Rings: The...|        Adventure|Fantasy|
| 5445|  34|       0.53983843|   Minority Report (2002)|Action|Crime|Mystery|S...|
| 6874|  34|        0.5381136| Kill Bill: Vol. 1 (2003)|    Action|Crime|Thriller|
| 5952|  34|       0.53787386|Lord of the Rings: The...|        Adventure|Fantasy|
| 5418|  34|       0.53430873|Bourne Identity, The (...|  Action|Mystery|Thriller|
| 7153|  34|       0.53185725|Lord of the Rings: The...|Action|Adventure|Drama...|
| 6539|  34|        0.5316936|Pirates of the Caribbe...|Action|Adventure|Comed...|
| 5989|  34|        0.5278688|Catch Me If You Can (2...|              Crime|Drama|
| 4306|  34|       0.52497464|             Shrek (2001)|Adventure|Animation|Ch...|
| 29