In [11]:
import findspark
findspark.init('/home/nick/spark-3.0.1-bin-hadoop2.7')

from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS # Alternating least Squares
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import countDistinct

spark = SparkSession.builder.appName("recommender system").getOrCreate()

In [12]:
movie_data = spark.read.csv('Recommender_Systems/movielens_ratings.csv', inferSchema=True, header=True)

In [17]:
movie_data.show()
movie_data.groupby('rating').count().show()
movie_data.select(countDistinct('userId')).show()
movie_data.describe().show()

+-------+------+------+
|movieId|rating|userId|
+-------+------+------+
|      2|   3.0|     0|
|      3|   1.0|     0|
|      5|   2.0|     0|
|      9|   4.0|     0|
|     11|   1.0|     0|
|     12|   2.0|     0|
|     15|   1.0|     0|
|     17|   1.0|     0|
|     19|   1.0|     0|
|     21|   1.0|     0|
|     23|   1.0|     0|
|     26|   3.0|     0|
|     27|   1.0|     0|
|     28|   1.0|     0|
|     29|   1.0|     0|
|     30|   1.0|     0|
|     31|   1.0|     0|
|     34|   1.0|     0|
|     37|   1.0|     0|
|     41|   2.0|     0|
+-------+------+------+
only showing top 20 rows

+------+-----+
|rating|count|
+------+-----+
|   1.0|  941|
|   4.0|   99|
|   3.0|  179|
|   2.0|  207|
|   5.0|   75|
+------+-----+

+----------------------+
|count(DISTINCT userId)|
+----------------------+
|                    30|
+----------------------+

+-------+------------------+------------------+------------------+
|summary|           movieId|            rating|            userId|
+-

In [39]:
train_data, test_data = movie_data.randomSplit([0.8,0.2])

In [40]:
als = ALS(maxIter=10, regParam=0.01,  userCol='userId', itemCol='movieId', ratingCol='rating')

In [41]:
model = als.fit(train_data)

In [42]:
predictions = model.transform(test_data)

In [43]:
predictions.show()

+-------+------+------+-----------+
|movieId|rating|userId| prediction|
+-------+------+------+-----------+
|     31|   1.0|    13|   0.612427|
|     31|   1.0|     4|  0.6098734|
|     31|   3.0|     8|  1.8484888|
|     31|   3.0|    14| 0.79039633|
|     31|   1.0|    18|  0.9149922|
|     85|   1.0|    28|-0.67487264|
|     85|   1.0|    13|  2.5853167|
|     85|   5.0|    16|  4.8508406|
|     85|   1.0|    23|  2.2312508|
|     85|   1.0|    25|   5.131492|
|     85|   1.0|     2| -0.6372662|
|     65|   1.0|    28| -1.3168619|
|     53|   3.0|    13|   1.686669|
|     53|   1.0|     6|  2.0852034|
|     53|   3.0|    20|  1.3630657|
|     53|   5.0|    21|  2.2151296|
|     78|   1.0|    28|   1.169109|
|     78|   1.0|     4|    1.18008|
|     34|   3.0|     3|-0.44644502|
|     34|   1.0|    17| -1.7349743|
+-------+------+------+-----------+
only showing top 20 rows



In [44]:
evaluator = RegressionEvaluator(metricName='rmse', labelCol='rating', predictionCol='prediction')

In [45]:
rmse = evaluator.evaluate(predictions)
rmse

1.7977467704453953

In [55]:
single_user = test_data.filter(test_data['userId']==11).select(['movieId','userId'])
single_user.show()

+-------+------+
|movieId|userId|
+-------+------+
|     13|    11|
|     19|    11|
|     20|    11|
|     39|    11|
|     48|    11|
|     50|    11|
|     51|    11|
|     70|    11|
|     81|    11|
|     90|    11|
+-------+------+



In [56]:
recommendations = model.transform(single_user)

In [57]:
recommendations.orderBy('prediction', ascending=False).show()

+-------+------+------------+
|movieId|userId|  prediction|
+-------+------+------------+
|     90|    11|    5.386259|
|     39|    11|    2.104312|
|     81|    11|   1.7430375|
|     13|    11|   1.6104313|
|     19|    11|   1.4317605|
|     48|    11|   1.0173107|
|     70|    11|  0.21902251|
|     50|    11|-0.016211554|
|     20|    11|   -1.353533|
|     51|    11|  -2.4360964|
+-------+------+------------+

