In [1]:
from pyspark.sql import SparkSession

In [2]:
# Obtain the Spark session for this notebook under the application name 'rec'
# (getOrCreate hands back an already-running session when there is one).
spark = (
    SparkSession.builder
    .appName('rec')
    .getOrCreate()
)

In [3]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

In [4]:
# Load the ratings CSV: the first line supplies the column names and the
# column types are inferred from the values.
data = spark.read.csv(
    'movielens_ratings.csv',
    header=True,
    inferSchema=True,
)

In [5]:
# Peek at the first record to confirm the columns were parsed as expected.
data.first()

Row(movieId=2, rating=3.0, userId=0)

In [6]:
# Summary statistics (count / mean / stddev / min / max) for every column.
ratings_summary = data.describe()
ratings_summary.show()

+-------+------------------+------------------+------------------+
|summary|           movieId|            rating|            userId|
+-------+------------------+------------------+------------------+
|  count|              1501|              1501|              1501|
|   mean| 49.40572951365756|1.7741505662891406|14.383744170552964|
| stddev|28.937034065088994| 1.187276166124803| 8.591040424293272|
|    min|                 0|               1.0|                 0|
|    max|                99|               5.0|                29|
+-------+------------------+------------------+------------------+



In [7]:
# 80/20 train/test split. The seed is fixed so that a fresh
# "Restart Kernel -> Run All" yields the same partitions, and therefore the
# same model, metrics and per-user tables further down the notebook.
training, test = data.randomSplit([0.8, 0.2], seed=42)

In [8]:
# Configure and fit the ALS collaborative-filtering model on the training split.
#
# coldStartStrategy='drop': a user or movie that ended up only in the test
#   split has no learned factors, so transform() would emit NaN for it and the
#   RMSE computed afterwards would itself be NaN. Dropping those rows keeps the
#   metric well defined.
#   NOTE(review): this parameter needs Spark 2.2+ -- confirm the runtime version.
# seed=42: fixes the random factor initialisation so repeated runs on the same
#   training data produce the same model.
# NOTE(review): predictions are not clamped to the 1-5 rating scale (negative
#   values are possible); consider nonnegative=True or post-clipping if that
#   matters downstream.
als = ALS(
    maxIter=5,
    regParam=0.01,
    userCol='userId',
    itemCol='movieId',
    ratingCol='rating',
    coldStartStrategy='drop',
    seed=42,
)
model = als.fit(training)

In [9]:
# Score the held-out ratings; transform() appends a 'prediction' column.
predictions = model.transform(dataset=test)

In [10]:
# Show the first 20 scored rows (actual rating next to the model's prediction).
predictions.show(n=20, truncate=True)

+-------+------+------+-----------+
|movieId|rating|userId| prediction|
+-------+------+------+-----------+
|     31|   1.0|    27|  1.6656848|
|     31|   1.0|     5|-0.29639706|
|     31|   1.0|     4|  1.7770865|
|     31|   3.0|     8|  2.0436308|
|     31|   2.0|    25|   3.166287|
|     85|   1.0|    12|  2.0919423|
|     85|   3.0|     6|  3.2639544|
|     85|   2.0|    20|  1.3479931|
|     85|   5.0|     8|  3.9999328|
|     65|   2.0|     3|  1.5706865|
|     53|   1.0|    12|-0.19964105|
|     78|   1.0|    17|  1.0990571|
|     81|   1.0|    16|   1.847206|
|     81|   2.0|     5|  2.1065845|
|     81|   1.0|    19|  1.0157254|
|     81|   1.0|     7|  2.3927019|
|     81|   2.0|    29|  3.2766488|
|     28|   1.0|    14|  1.3510399|
|     28|   1.0|     0|  3.7026272|
|     28|   5.0|    18|-0.46800438|
+-------+------+------+-----------+
only showing top 20 rows



In [11]:
# Measure prediction quality on the test split as root-mean-square error
# between the true 'rating' and the model's 'prediction'.
evaluator = RegressionEvaluator(
    labelCol='rating',
    predictionCol='prediction',
    metricName='rmse',
)
rmse = evaluator.evaluate(dataset=predictions)
print('Root-mean-square error = ' + str(rmse))

Root-mean-square error = 1.8929858472668317


In [12]:
# Candidate (movieId, userId) pairs for user 11, taken from the held-out test
# split. The column is referenced by its exact schema name 'userId'; the
# previous spelling 'userID' only resolved because Spark's analyzer is
# case-insensitive by default (spark.sql.caseSensitive=false) and would fail
# with that setting enabled.
single_user = test.filter(test['userId'] == 11).select('movieId', 'userId')

In [13]:
# List the movies that will be scored for this user (up to 20 rows).
single_user.show(n=20, truncate=True)

+-------+------+
|movieId|userId|
+-------+------+
|      0|    11|
|      9|    11|
|     10|    11|
|     12|    11|
|     13|    11|
|     21|    11|
|     50|    11|
|     61|    11|
|     69|    11|
|     75|    11|
|     79|    11|
|     89|    11|
+-------+------+



In [14]:
# Predict a rating for each of the selected user's candidate movies.
# (Variable name kept as spelled: a later cell reads it.)
reccomendations = model.transform(dataset=single_user)

In [15]:
# Rank the user's candidate movies from highest to lowest predicted rating.
ranked = reccomendations.orderBy(reccomendations['prediction'].desc())
ranked.show()

+-------+------+-----------+
|movieId|userId| prediction|
+-------+------+-----------+
|     50|    11|  3.4985473|
|     12|    11|    3.26057|
|     13|    11|  2.4041662|
|     75|    11|   2.115788|
|     79|    11|   2.001026|
|     10|    11|  1.2954662|
|     69|    11|  1.0468949|
|     21|    11|  1.0035304|
|     61|    11| 0.93098325|
|      0|    11|  0.6141173|
|      9|    11|-0.45555624|
|     89|    11| -1.3895831|
+-------+------+-----------+

