Install Apache Spark:

$ pip install pyspark

Initialize spark session:

In [2]:
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession

sc = SparkContext('local')
spark = SparkSession(sc)

File "sample_movielens_ratings.txt" contains rows with content:
userId::movieId::rating::timestamp

For 29::9::1::1424380312 example:
userId=29
movieId=9
rating=1
timestamp=1424380312

Read and parse dataset:

In [6]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

lines = spark.read.text("sample_movielens_ratings.txt").rdd
parts = lines.map(lambda row: row.value.split("::"))
ratingsRDD = parts.map(lambda p: Row(userId=int(p[0]), movieId=int(p[1]),
                                     rating=float(p[2]), timestamp=float(p[3])))
ratings = spark.createDataFrame(ratingsRDD)

In [7]:
#Split dataset to training and test:
(training, test) = ratings.randomSplit([0.8, 0.2])

Important features while using ALS:
- userCol - column with user id identifier
- itemCol - column with identifier of an object
- ratingCol - column of rating, this could be explicite rating or implicite (for example kind of behaviour), in this second case implicitPrefs=True should be use for better results
- coldStartStrategy - strategy for cold start problem, there are 2 solutions in Apache: drop - drop nan values, and nan - return nan values, other strategies are in development

In [8]:
# Build the recommendation model using ALS on the training data
# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop")
model = als.fit(training)

In [9]:
# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 1.7991273222740998


In [17]:
# Generate top 10 movie recommendations for each user
userRecs = model.recommendForAllUsers(10)
userRecs.toPandas().head(3)

Unnamed: 0,userId,recommendations
0,28,"[(85, 5.939967155456543), (92, 5.1181178092956..."
1,26,"[(62, 5.905642986297607), (32, 5.5628299713134..."
2,27,"[(85, 5.086740016937256), (34, 4.4568171501159..."


In [18]:
# Generate top 10 user recommendations for each movie
movieRecs = model.recommendForAllItems(10)
movieRecs.toPandas().head(3)

Unnamed: 0,movieId,recommendations
0,31,"[(20, 4.2785234451293945), (12, 3.455365180969..."
1,85,"[(28, 5.939967155456543), (27, 5.0867400169372..."
2,65,"[(11, 3.328084945678711), (20, 1.9836831092834..."


In [33]:
recommendations_for_users = userRecs.select("userId", "recommendations.movieId")
recommendations_for_users.collect()

[Row(userId=28, movieId=[85, 92, 81, 12, 2, 89, 82, 47, 76, 11]),
 Row(userId=26, movieId=[62, 32, 7, 94, 23, 24, 22, 64, 60, 75]),
 Row(userId=27, movieId=[85, 34, 47, 80, 51, 33, 75, 19, 27, 83]),
 Row(userId=12, movieId=[46, 17, 64, 27, 30, 22, 32, 68, 16, 36]),
 Row(userId=22, movieId=[22, 94, 74, 30, 75, 51, 46, 7, 32, 62]),
 Row(userId=1, movieId=[55, 17, 83, 64, 68, 10, 46, 66, 8, 85]),
 Row(userId=13, movieId=[39, 93, 70, 83, 29, 72, 74, 8, 77, 53]),
 Row(userId=6, movieId=[34, 83, 64, 41, 47, 43, 74, 63, 67, 85]),
 Row(userId=16, movieId=[19, 51, 90, 54, 75, 71, 29, 27, 58, 47]),
 Row(userId=3, movieId=[51, 75, 22, 27, 80, 85, 77, 88, 39, 83]),
 Row(userId=20, movieId=[27, 52, 22, 30, 31, 17, 77, 96, 88, 53]),
 Row(userId=5, movieId=[17, 55, 27, 90, 30, 10, 46, 49, 68, 32]),
 Row(userId=19, movieId=[46, 90, 94, 98, 71, 74, 51, 54, 30, 19]),
 Row(userId=15, movieId=[46, 1, 53, 4, 3, 74, 31, 61, 98, 77]),
 Row(userId=17, movieId=[90, 46, 55, 17, 94, 30, 68, 32, 10, 64]),
 Row(us

In [38]:
json_rdd = recommendations_for_users.toJSON()
json_rdd.collect()

['{"userId":28,"movieId":[85,92,81,12,2,89,82,47,76,11]}',
 '{"userId":26,"movieId":[62,32,7,94,23,24,22,64,60,75]}',
 '{"userId":27,"movieId":[85,34,47,80,51,33,75,19,27,83]}',
 '{"userId":12,"movieId":[46,17,64,27,30,22,32,68,16,36]}',
 '{"userId":22,"movieId":[22,94,74,30,75,51,46,7,32,62]}',
 '{"userId":1,"movieId":[55,17,83,64,68,10,46,66,8,85]}',
 '{"userId":13,"movieId":[39,93,70,83,29,72,74,8,77,53]}',
 '{"userId":6,"movieId":[34,83,64,41,47,43,74,63,67,85]}',
 '{"userId":16,"movieId":[19,51,90,54,75,71,29,27,58,47]}',
 '{"userId":3,"movieId":[51,75,22,27,80,85,77,88,39,83]}',
 '{"userId":20,"movieId":[27,52,22,30,31,17,77,96,88,53]}',
 '{"userId":5,"movieId":[17,55,27,90,30,10,46,49,68,32]}',
 '{"userId":19,"movieId":[46,90,94,98,71,74,51,54,30,19]}',
 '{"userId":15,"movieId":[46,1,53,4,3,74,31,61,98,77]}',
 '{"userId":17,"movieId":[90,46,55,17,94,30,68,32,10,64]}',
 '{"userId":9,"movieId":[49,18,7,32,87,79,47,67,43,27]}',
 '{"userId":4,"movieId":[41,52,70,72,93,83,64,87,63,40