In [3]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.4.0.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.0-py2.py3-none-any.whl size=311317145 sha256=dcf50e60bf87361cd730acae35d617a6c0fb8930ddf872ddc9bf4301d3f3501c
  Stored in directory: /root/.cache/pip/wheels/9f/34/a4/159aa12d0a510d5ff7c8f0220abbea42e5d81ecf588c4fd884
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.0


In [4]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row, SparkSession

In [5]:
spark = SparkSession.builder \
        .master("local") \
        .appName("myApp") \
        .config("spark.sql.session.timeZone", "UTC") \
        .config("spark.sql.session.timeout", "48h") \
        .getOrCreate()

In [7]:
lines = spark.read.text("./sample_data/sample_movielens_ratings.txt").rdd
parts = lines.map(lambda row: row.value.split("::"))
ratingsRDD = parts.map(lambda p: Row(userId=int(p[0]), movieId=int(p[1]),
                                     rating=float(p[2]), timestamp=int(p[3])))
ratings = spark.createDataFrame(ratingsRDD)
(training, test) = ratings.randomSplit([0.8, 0.2])

In [8]:
# maxIter and regParam values to be tested
max_iters = [5, 10, 20]
reg_params = [0.1, 0.5, 1.0]

results = {}

In [9]:
# Loop for every maxIter and regParam value
for max_iter in max_iters:
    for reg_param in reg_params:
        als = ALS(maxIter=max_iter, regParam=reg_param, userCol="userId", itemCol="movieId", ratingCol="rating",
                  coldStartStrategy="drop")
        model = als.fit(training)

        predictions = model.transform(test)
        evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                        predictionCol="prediction")
        rmse = evaluator.evaluate(predictions)

        results[(max_iter, reg_param)] = rmse
        print(f"Root-mean-square error for maxIter={max_iter}, regParam={reg_param} = {rmse}")

Root-mean-square error for maxIter=5, regParam=0.1 = 0.9688205675431153
Root-mean-square error for maxIter=5, regParam=0.5 = 1.1297708518341942
Root-mean-square error for maxIter=5, regParam=1.0 = 1.3691869187312151
Root-mean-square error for maxIter=10, regParam=0.1 = 0.9235508437296419
Root-mean-square error for maxIter=10, regParam=0.5 = 1.128258748527846
Root-mean-square error for maxIter=10, regParam=1.0 = 1.3691738236594744
Root-mean-square error for maxIter=20, regParam=0.1 = 0.91753869584624
Root-mean-square error for maxIter=20, regParam=0.5 = 1.1280810932816783
Root-mean-square error for maxIter=20, regParam=1.0 = 1.3691738213469784


In [10]:
# The less the RMSE value, the better
best_params = min(results, key=results.get)
best_rmse = results[best_params]
print(f"\nBest hyperparameters: maxIter={best_params[0]}, regParam={best_params[1]} with RMSE={best_rmse}")


Best hyperparameters: maxIter=20, regParam=0.1 with RMSE=0.91753869584624


In [11]:
# Build the recommendation model using ALS on the training data with the best value for maxIter and regParam
als = ALS(maxIter=best_params[0], regParam=best_params[1], userCol="userId", itemCol="movieId", ratingCol="rating",
               coldStartStrategy="drop")
model = als.fit(training)

In [12]:
# Generate top 10 movie recommendations for each user
userRecs = model.recommendForAllUsers(10)
userRecs.show()

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|    20|[{22, 3.450938}, ...|
|    10|[{2, 3.2609217}, ...|
|     0|[{92, 3.2232068},...|
|     1|[{62, 3.134501}, ...|
|    21|[{29, 4.0552034},...|
|    11|[{27, 4.761196}, ...|
|    12|[{55, 4.374193}, ...|
|    22|[{75, 4.611357}, ...|
|     2|[{93, 4.6672235},...|
|    13|[{93, 3.3360484},...|
|     3|[{51, 4.277628}, ...|
|    23|[{32, 4.7151017},...|
|     4|[{2, 3.3353186}, ...|
|    24|[{69, 4.4623394},...|
|    14|[{29, 4.540037}, ...|
|     5|[{46, 4.235295}, ...|
|    15|[{46, 3.8849373},...|
|    25|[{47, 3.216118}, ...|
|    26|[{88, 4.6131635},...|
|     6|[{25, 4.015304}, ...|
+------+--------------------+
only showing top 20 rows



In [13]:
# Generate top 10 user recommendations for each movie
movieRecs = model.recommendForAllItems(10)
movieRecs.show()

+-------+--------------------+
|movieId|     recommendations|
+-------+--------------------+
|     20|[{17, 3.737557}, ...|
|     40|[{2, 3.4441624}, ...|
|     10|[{17, 3.5838134},...|
|     50|[{23, 3.75266}, {...|
|     80|[{11, 2.8122354},...|
|     70|[{8, 3.1574395}, ...|
|     60|[{8, 2.6789083}, ...|
|     90|[{17, 4.612008}, ...|
|     30|[{11, 4.652803}, ...|
|      0|[{28, 2.365326}, ...|
|     31|[{8, 2.7695265}, ...|
|     81|[{28, 4.0993695},...|
|     91|[{12, 2.7423503},...|
|      1|[{25, 2.0068498},...|
|     41|[{8, 3.4813125}, ...|
|     61|[{6, 2.1295094}, ...|
|     51|[{22, 4.5763817},...|
|     21|[{22, 2.3358488},...|
|     11|[{7, 1.4372313}, ...|
|     71|[{25, 2.955316}, ...|
+-------+--------------------+
only showing top 20 rows



In [14]:
# Generate top 10 movie recommendations for a specific set of users
users = ratings.select(als.getUserCol()).distinct().limit(3)
userSubsetRecs = model.recommendForUserSubset(users, 10)
userSubsetRecs.show()

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|    26|[{88, 4.6131635},...|
|    19|[{94, 3.33503}, {...|
|    29|[{46, 4.4626}, {9...|
+------+--------------------+



In [15]:
# Generate top 10 user recommendations for a specific set of movies
movies = ratings.select(als.getItemCol()).distinct().limit(3)
movieSubSetRecs = model.recommendForItemSubset(movies, 10)
movieSubSetRecs.show()

+-------+--------------------+
|movieId|     recommendations|
+-------+--------------------+
|     65|[{23, 4.0775037},...|
|     26|[{0, 2.1388037}, ...|
|     29|[{8, 4.7866273}, ...|
+-------+--------------------+



**Conclusion**
In conclusion, regarding the maxIter and regParam value, which value for each that made the lowest Root Mean Square Error (RMSE) value possible, the maxIter value of 20 and regParam value of 0.1 are the best values. 