In [1]:
# Importing model
from pyspark.ml.recommendation import ALS
# Importing evaluator
from pyspark.ml.evaluation import RegressionEvaluator

In [2]:
# Read the MovieLens ratings CSV into a Spark DataFrame.
# The first row is used as the header and column types are inferred from the data.
ratings_path = '/FileStore/tables/movielens_ratings.csv'
csv_reader_options = dict(inferSchema=True, header=True)
df = spark.read.csv(ratings_path, **csv_reader_options)

In [3]:
# Preview the first five rows of the loaded ratings
df.show(n=5)

In [4]:
# Splitting data into train (80%) and test (20%) sets.
# A fixed seed makes the random split repeatable, so the reported metric and
# the example recommendations further down do not change on every re-run.
SPLIT_SEED = 42
(train_set, test_set) = df.randomSplit([0.8, 0.2], seed=SPLIT_SEED)

In [5]:
# Creating and fitting the ALS model on the training split.
# NOTE: `als` holds the *fitted* model returned by .fit(), not the estimator.
#
# coldStartStrategy='drop': because the split is random, the test set can
# contain users or movies that never appear in the training set. With the
# default strategy ('nan') those rows get a NaN prediction, which turns the
# RMSE computed over the predictions into NaN. 'drop' removes such rows from
# the output of transform() instead.
#
# seed: fixes the random initialisation of the factor matrices so that
# repeated fits on the same data give the same model.
als = ALS(
    maxIter=5,
    regParam=0.01,
    userCol='userId',
    itemCol='movieId',
    ratingCol='rating',
    coldStartStrategy='drop',
    seed=42,
).fit(train_set)

In [6]:
# Score the held-out test split with the fitted model;
# the result is the test rows plus a 'prediction' column.
predictions = als.transform(dataset=test_set)

In [7]:
# Preview the first ten scored rows
predictions.show(n=10)

In [8]:
# Evaluator that compares the true 'rating' column against the model's
# 'prediction' column using root-mean-square error.
evaluator = RegressionEvaluator(
    labelCol='rating',
    predictionCol='prediction',
    metricName='rmse',
)

In [9]:
# Compute the RMSE of the model's predictions on the test split
rmse = evaluator.evaluate(dataset=predictions)

In [10]:
# Report the test-set error
rmse_report = 'RMSE: {}'.format(rmse)
print(rmse_report)

In [11]:
# Keep only the test rows belonging to user 11, and only the two columns the
# model needs as input (the true rating is deliberately left out).
is_selected_user = test_set['userId'] == 11
single_user_rows = test_set.filter(is_selected_user)
single_user = single_user_rows.select(['movieId', 'userId'])

In [12]:
# Preview up to five (movieId, userId) pairs for this user from the test split
single_user.show(n=5)

In [13]:
# Predict this user's rating for each of the selected movies
recommendations = als.transform(dataset=single_user)

In [14]:
# Display the user's movies ranked from highest to lowest predicted rating
ranked_recommendations = recommendations.orderBy('prediction', ascending=False)
ranked_recommendations.show()

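# The basic idea: the model estimates how much this user is likely to enjoy each movie, based on the ratings other users gave and on similar items. Movies at the top of the table have the highest predicted rating and movies at the bottom the lowest. For example, in the run this note was written for, the user was likely to enjoy movieId 23 and unlikely to enjoy movieId 51; the exact movies can differ between runs because the train/test split is random.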