In [3]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import SparkSession
spark=SparkSession.builder.appName("rec").getOrCreate()
from pyspark.ml.recommendation import ALS

In [4]:
data = spark.read.csv('movielens_ratings.csv',inferSchema=True,header=True)
data.describe().show()

+-------+------------------+------------------+------------------+
|summary|           movieId|            rating|            userId|
+-------+------------------+------------------+------------------+
|  count|              1501|              1501|              1501|
|   mean| 49.40572951365756|1.7741505662891406|14.383744170552964|
| stddev|28.937034065088994| 1.187276166124803| 8.591040424293272|
|    min|                 0|               1.0|                 0|
|    max|                99|               5.0|                29|
+-------+------------------+------------------+------------------+



In [6]:
train,test=data.randomSplit([0.8,0.2])
als=ALS(maxIter=5,regParam=0.01,userCol='userId',itemCol='movieId',ratingCol='rating')
model=als.fit(train)

In [7]:
predictions=model.transform(test)

In [8]:
predictions.show()

+-------+------+------+-----------+
|movieId|rating|userId| prediction|
+-------+------+------+-----------+
|     31|   1.0|    26| -1.0115454|
|     31|   1.0|    13|  1.8605084|
|     31|   2.0|    25|  1.3190647|
|     31|   1.0|    24|  4.8860526|
|     31|   1.0|    29|  0.7075072|
|     31|   3.0|    14|    4.28348|
|     31|   1.0|     0|  4.6426578|
|     85|   3.0|     1|  1.3994323|
|     85|   5.0|    16| 0.10381964|
|     85|   4.0|     7|  4.1721296|
|     85|   1.0|    29|-0.39343032|
|     85|   1.0|     2|  0.7572837|
|     65|   1.0|    28| -1.9311928|
|     65|   2.0|     5|    2.25643|
|     65|   2.0|    15|  1.6717451|
|     65|   1.0|     4|  0.5875854|
|     53|   1.0|     6| -1.4423766|
|     53|   3.0|    20|    1.27243|
|     53|   5.0|     8|  0.8222858|
|     53|   1.0|    23| 0.64139014|
+-------+------+------+-----------+
only showing top 20 rows



In [11]:
evaluator=RegressionEvaluator(predictionCol='prediction',labelCol='rating',metricName='rmse')
rmse=evaluator.evaluate(predictions)
rmse

1.6927579164790842

In [13]:
#supply recommendation to user
single_user=test.filter(test['userId']==11).select('movieId','userId')
single_user.show()

+-------+------+
|movieId|userId|
+-------+------+
|      9|    11|
|     16|    11|
|     18|    11|
|     19|    11|
|     22|    11|
|     38|    11|
|     41|    11|
|     47|    11|
|     51|    11|
|     78|    11|
|     86|    11|
+-------+------+



In [15]:
recommendations=model.transform(single_user)
recommendations.orderBy('prediction').show()

+-------+------+-----------+
|movieId|userId| prediction|
+-------+------+-----------+
|     47|    11|-0.59421873|
|     86|    11|   0.626647|
|     78|    11| 0.68670565|
|     41|    11|  2.1891606|
|     22|    11|   2.404005|
|     19|    11|  2.4945083|
|     18|    11|  3.1545737|
|     16|    11|  3.6887546|
|     38|    11|  3.7741508|
|      9|    11|  3.8208942|
|     51|    11|   4.177466|
+-------+------+-----------+



In [21]:
#another method
predictions.filter(test['userId']==11).orderBy('prediction').select('movieId','userId','prediction').show()

+-------+------+-----------+
|movieId|userId| prediction|
+-------+------+-----------+
|     47|    11|-0.59421873|
|     86|    11|   0.626647|
|     78|    11| 0.68670565|
|     41|    11|  2.1891606|
|     22|    11|   2.404005|
|     19|    11|  2.4945083|
|     18|    11|  3.1545737|
|     16|    11|  3.6887546|
|     38|    11|  3.7741508|
|      9|    11|  3.8208942|
|     51|    11|   4.177466|
+-------+------+-----------+

