Dataset availability: https://www.kaggle.com/datasets/shubhammehta21/movie-lens-small-latest-dataset

### 1. Import libraries

In [3]:
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

### 2. Spark session

In [14]:
spark = SparkSession.builder.appName('recommender').getOrCreate()
df = spark.read.csv('ratings.csv', inferSchema= True, header = True)
df.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)



In [15]:
df.show(3)

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
|     1|      6|   4.0|964982224|
+------+-------+------+---------+
only showing top 3 rows



In [16]:
df.describe().show()

+-------+------------------+----------------+------------------+--------------------+
|summary|            userId|         movieId|            rating|           timestamp|
+-------+------------------+----------------+------------------+--------------------+
|  count|            100836|          100836|            100836|              100836|
|   mean|326.12756356856676|19435.2957177992| 3.501556983616962|1.2059460873684695E9|
| stddev| 182.6184914635004|35530.9871987003|1.0425292390606342|2.1626103599513078E8|
|    min|                 1|               1|               0.5|           828124615|
|    max|               610|          193609|               5.0|          1537799250|
+-------+------------------+----------------+------------------+--------------------+



### 3. Machine learning

In [17]:
train, test = df.randomSplit([0.8, 0.2])

In [18]:
als = ALS(maxIter=5, regParam=0.01, userCol='userId', itemCol='movieId', ratingCol='rating')

In [19]:
model = als.fit(train)
predictions = model.transform(test)
predictions.show()

+------+-------+------+----------+----------+
|userId|movieId|rating| timestamp|prediction|
+------+-------+------+----------+----------+
|   191|    148|   5.0| 829760897|       NaN|
|   597|    471|   2.0| 941558175|     4.256|
|   602|    471|   4.0| 840876085| 4.9549994|
|   372|    471|   3.0| 874415126|  1.998034|
|   182|    471|   4.5|1054779644| 3.5710642|
|   500|    471|   1.0|1005528017| 2.3568454|
|    32|    471|   3.0| 856737165| 3.7859452|
|   469|    471|   5.0| 965425364| 3.4218524|
|   104|    471|   4.5|1238111129|  3.223849|
|   307|    833|   1.0|1186172725| 2.1758733|
|   307|   1088|   3.0|1186162146| 2.3771904|
|   391|   1088|   1.0|1030824424| 2.9887533|
|   509|   1088|   3.0|1435992808|  3.408918|
|   414|   1088|   3.0| 961514273| 3.3374028|
|   200|   1088|   4.0|1229887977| 3.2450008|
|   525|   1088|   4.5|1476478367| 4.0509048|
|   325|   1342|   4.0|1039396702|  3.736366|
|    19|   1342|   2.0| 965704952| 1.0655906|
|   387|   1342|   3.0|1131269228|

In [13]:
evaluator = RegressionEvaluator(metricName = 'rmse', labelCol = 'rating', predictionCol = 'prediction')
rmse = evaluator.evaluate(predictions)
print('RMSE:', rmse)

RMSE: nan


In [20]:
this_user = test.filter(test['userId'] == 12).select('userId', 'movieId')
this_user.show()

+------+-------+
|userId|movieId|
+------+-------+
|    12|    543|
|    12|   1265|
|    12|   1357|
|    12|   1405|
|    12|   2100|
|    12|   2694|
|    12|   2717|
|    12|   3967|
|    12|   5620|
|    12|   8533|
+------+-------+



In [21]:
recommendation_this_user = model.transform(this_user)
recommendation_this_user.show()

+------+-------+----------+
|userId|movieId|prediction|
+------+-------+----------+
|    12|   1265|  4.798801|
|    12|   5620| 3.6503742|
|    12|   2694| 3.5698948|
|    12|   3967|  4.673934|
|    12|   1357| 4.3636947|
|    12|   8533|  5.194949|
|    12|   2717|  3.195086|
|    12|    543|  3.890683|
|    12|   1405| 2.5773635|
|    12|   2100| 3.6491144|
+------+-------+----------+



In [22]:
recommendation_this_user.orderBy('prediction', ascending=False).show()

+------+-------+----------+
|userId|movieId|prediction|
+------+-------+----------+
|    12|   8533|  5.194949|
|    12|   1265|  4.798801|
|    12|   3967|  4.673934|
|    12|   1357| 4.3636947|
|    12|    543|  3.890683|
|    12|   5620| 3.6503742|
|    12|   2100| 3.6491144|
|    12|   2694| 3.5698948|
|    12|   2717|  3.195086|
|    12|   1405| 2.5773635|
+------+-------+----------+

