# Recommendation Systems using ALS

## Initialization

In [1]:
# import findspark
import findspark
findspark.init()

In [2]:
# import functions
from pyspark.sql import functions as F

In [3]:
# import SparkSession
from pyspark.sql import SparkSession

# create session
spark = SparkSession \
    .builder \
    .appName("Python Spark Recommendation Systems Example") \
    .getOrCreate()

print(spark)

<pyspark.sql.session.SparkSession object at 0x000000C2F99FBD68>


In [4]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

## Preprocessing

In [5]:
ratings = spark.read.csv("item_ratings.csv", header=True, inferSchema=True)
ratings.show()

+----------+-------+------+-------------------+
|    userId|movieId|rating|          timestamp|
+----------+-------+------+-------------------+
|39032b0f7a|      1|   3.0|1997-11-06 06:36:16|
|39032b0f7a|     17|   3.0|1997-11-06 05:26:18|
|39032b0f7a|     21|   3.0|2000-10-02 07:45:30|
|39032b0f7a|     34|   4.0|1997-11-06 05:45:08|
|39032b0f7a|     62|   4.5|2003-06-18 04:09:17|
|39032b0f7a|    140|   4.0|1998-11-18 22:04:03|
|39032b0f7a|    144|   4.0|1997-11-06 06:46:48|
|39032b0f7a|    150|   4.0|2005-03-20 08:25:29|
|39032b0f7a|    237|   4.0|1997-11-06 05:57:16|
|39032b0f7a|    260|   4.0|1998-11-18 22:04:03|
|39032b0f7a|    281|   3.0|1997-11-06 05:56:10|
|39032b0f7a|    318|   5.0|1998-11-18 21:52:09|
|39032b0f7a|    329|   3.0|1998-07-26 21:58:28|
|39032b0f7a|    339|   3.0|2001-08-15 09:59:00|
|39032b0f7a|    349|   3.5|2014-01-14 04:37:55|
|39032b0f7a|    356|   5.0|1997-11-09 01:32:51|
|39032b0f7a|    357|   4.0|1997-11-09 01:43:58|
|39032b0f7a|    361|   4.0|2002-02-17 03

In [6]:
ratings.createOrReplaceTempView("movies")

In [7]:
new_id = spark.sql("SELECT DISTINCT userId FROM movies")
new_id.show()

+----------+
|    userId|
+----------+
|9757c7c989|
|3d197661f6|
|9b2762e59a|
|c52afe8592|
|e7cec00b33|
|dc4860fa10|
|540152b69d|
|d731228ef7|
|ad1d76d5f3|
|5266f376e5|
|14a97ae2c1|
|fa1c25f4ed|
|820fd9f6ab|
|dbcd17c098|
|8fd865c0a9|
|cc683ef8f7|
|c7f3b4f1f9|
|02f9cfbf86|
|7fdcecb392|
|9e95a9af82|
+----------+
only showing top 20 rows



In [8]:
new_id.createOrReplaceTempView("newId")

In [9]:
new_user_id = spark.sql("SELECT userId, ROW_NUMBER() OVER (ORDER BY userId) AS user_id FROM newId")
new_user_id.show()

+----------+-------+
|    userId|user_id|
+----------+-------+
|0011a9baea|      1|
|00be0326d6|      2|
|00d8a48617|      3|
|0116910e54|      4|
|0165b89cac|      5|
|01da6d8da5|      6|
|01ebe04c6d|      7|
|01f5bfd446|      8|
|0228ccbd55|      9|
|022e78bf03|     10|
|02839371ac|     11|
|02f9cfbf86|     12|
|02fe0bb9bb|     13|
|0331949b45|     14|
|0484a2df5c|     15|
|050c1a5781|     16|
|052ed67c6f|     17|
|0544a57e31|     18|
|055188f003|     19|
|05f38ec2d0|     20|
+----------+-------+
only showing top 20 rows



In [10]:
ratings = ratings.join(new_user_id, ratings.userId == new_user_id.userId)
ratings.show()

+----------+-------+------+-------------------+----------+-------+
|    userId|movieId|rating|          timestamp|    userId|user_id|
+----------+-------+------+-------------------+----------+-------+
|3d197661f6|      1|   4.0|2015-09-08 15:56:12|3d197661f6|    230|
|3d197661f6|      2|   3.0|2015-09-08 03:57:58|3d197661f6|    230|
|3d197661f6|      6|   3.5|2015-09-08 03:08:02|3d197661f6|    230|
|3d197661f6|     10|   3.0|2015-09-08 14:46:10|3d197661f6|    230|
|3d197661f6|     13|   3.0|2015-09-08 15:02:32|3d197661f6|    230|
|3d197661f6|     16|   4.0|2015-09-08 02:54:44|3d197661f6|    230|
|3d197661f6|     32|   4.0|2015-09-08 04:52:08|3d197661f6|    230|
|3d197661f6|     34|   3.5|2015-09-08 15:33:18|3d197661f6|    230|
|3d197661f6|     44|   2.0|2015-09-08 13:44:46|3d197661f6|    230|
|3d197661f6|     47|   4.0|2015-09-08 04:48:31|3d197661f6|    230|
|3d197661f6|     48|   3.0|2015-09-08 14:56:13|3d197661f6|    230|
|3d197661f6|     50|   4.0|2015-09-08 15:54:46|3d197661f6|    

In [11]:
ratings = ratings.select('user_id', 'movieId', 'rating')

In [12]:
ratings = ratings.withColumnRenamed('user_id', 'userId')
ratings.show()

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|   230|      1|   4.0|
|   230|      2|   3.0|
|   230|      6|   3.5|
|   230|     10|   3.0|
|   230|     13|   3.0|
|   230|     16|   4.0|
|   230|     32|   4.0|
|   230|     34|   3.5|
|   230|     44|   2.0|
|   230|     47|   4.0|
|   230|     48|   3.0|
|   230|     50|   4.0|
|   230|     62|   3.0|
|   230|    110|   3.5|
|   230|    111|   5.0|
|   230|    150|   3.5|
|   230|    153|   2.5|
|   230|    158|   2.5|
|   230|    215|   4.0|
|   230|    216|   1.5|
+------+-------+------+
only showing top 20 rows



In [13]:
ratings.count()

458970

## Training Model

In [14]:
(training, test) = ratings.randomSplit([0.8, 0.2])

In [15]:
# build the recommendation model using ALS on the training data
# note that we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop")
model = als.fit(training)

In [16]:
# evaluate the model by computing the RMSE on the test data
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 0.8542321969167205


In [17]:
# generate top 10 movie recommendations for each user
userRecs = model.recommendForAllUsers(10)
userRecs.show()

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|   471|[[6896, 7.4664564...|
|   463|[[4272, 7.322896]...|
|   833|[[26578, 7.628948...|
|   496|[[8753, 7.0246115...|
|   148|[[4624, 10.078892...|
|   540|[[4256, 6.3422856...|
|   392|[[4256, 7.5171585...|
|   243|[[4624, 8.467705]...|
|   623|[[4278, 6.685036]...|
|   737|[[66596, 6.249056...|
|    31|[[146327, 8.65950...|
|   516|[[4256, 6.5958443...|
|   580|[[147326, 5.93386...|
|   251|[[4256, 8.071559]...|
|   451|[[6452, 6.499717]...|
|    85|[[47084, 5.214213...|
|   137|[[4256, 7.368866]...|
|   808|[[4278, 6.868102]...|
|    65|[[4664, 8.059678]...|
|   458|[[3645, 9.144505]...|
+------+--------------------+
only showing top 20 rows



In [18]:
# generate top 10 user recommendations for each movie
movieRecs = model.recommendForAllItems(10)
movieRecs.show()

+-------+--------------------+
|movieId|     recommendations|
+-------+--------------------+
|   1580|[[776, 5.290246],...|
|   4900|[[387, 5.982214],...|
|   5300|[[123, 5.9196143]...|
|   6620|[[98, 5.716596], ...|
|   7340|[[443, 5.2419133]...|
|  30970|[[783, 8.108435],...|
|  32460|[[153, 7.053493],...|
|  54190|[[87, 5.987054], ...|
|  57370|[[776, 7.081197],...|
|  83250|[[801, 6.047843],...|
|    471|[[757, 5.6838865]...|
|   1591|[[776, 5.812383],...|
|   1342|[[776, 7.763584],...|
|   2122|[[165, 7.81], [26...|
|   2142|[[757, 8.350384],...|
|   7982|[[757, 10.738288]...|
|  33722|[[757, 8.286456],...|
|  44022|[[774, 5.0817885]...|
| 141422|[[87, 9.044266], ...|
| 144522|[[776, 5.3771777]...|
+-------+--------------------+
only showing top 20 rows



## References

1. https://grouplens.org/datasets/learning-from-sets-of-items-2019/
2. https://spark.apache.org/docs/latest/ml-collaborative-filtering.html