In [1]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem at a named mount point.
# NOTE(review): the ratings file loaded later in this notebook is read from
# /content/, not from this mount — confirm whether the mount is still needed.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
!sudo apt update

!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# Check this site for the latest download link
# https://www.apache.org/dyn/closer.lua/spark
!wget -q https://dlcdn.apache.org/spark/spark-3.5.2/spark-3.5.2-bin-hadoop3.tgz
!tar xf spark-3.5.2-bin-hadoop3.tgz
!pip install -q findspark
!pip install pyspark
!pip install py4j

[33m0% [Working][0m            Hit:1 http://archive.ubuntu.com/ubuntu jammy InRelease
[33m0% [Connecting to security.ubuntu.com (185.125.190.82)] [Waiting for headers] [[0m                                                                               Get:2 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
                                                                               Get:3 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
[33m0% [2 InRelease 15.6 kB/128 kB 12%] [Waiting for headers] [Waiting for headers][0m[33m0% [Waiting for headers] [Waiting for headers] [Waiting for headers] [Connectin[0m                                                                               Get:4 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
[33m0% [Waiting for headers] [4 InRelease 14.2 kB/129 kB 11%] [Waiting for headers][0m                                                                              

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import split
# Start (or reuse) a Spark session on the local master.
spark = (
    SparkSession.builder
    .master("local")
    .appName("ALS-Recommendation")
    .getOrCreate()
)

# Keep a handle on the SparkContext and restrict logging to ERROR level.
sc = spark.sparkContext
sc.setLogLevel("ERROR")

# Read the ratings file as raw text lines and split each line on "::".
raw_lines = spark.read.text("/content/sample_movielens_ratings.txt")
fields = split(raw_lines["value"], "::")

# Project fields 0-2 as integer columns; field 3 (the timestamp) and the raw
# "value" column are not carried forward.
movies_df = raw_lines.select(
    fields.getItem(0).cast("int").alias("userID"),
    fields.getItem(1).cast("int").alias("movieID"),
    fields.getItem(2).cast("int").alias("rating"),
)

movies_df.show(n=10, truncate=False)

+------+-------+------+
|userID|movieID|rating|
+------+-------+------+
|0     |2      |3     |
|0     |3      |1     |
|0     |5      |2     |
|0     |9      |4     |
|0     |11     |1     |
|0     |12     |2     |
|0     |15     |1     |
|0     |17     |1     |
|0     |19     |1     |
|0     |21     |1     |
+------+-------+------+
only showing top 10 rows



In [5]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

# ALS settings: 10 iterations, regularization 0.01, rank-5 factors.
# The seed is fixed explicitly: without it PySpark falls back to a default
# derived from a Python string hash, which changes between interpreter
# processes, so the fitted factors (and every number downstream) would differ
# after each kernel restart.
als = ALS(
    maxIter=10,
    regParam=0.01,
    rank=5,
    userCol="userID",
    itemCol="movieID",
    ratingCol="rating",
    seed=42,
)

# Fit the factorization on the full ratings DataFrame.
model = als.fit(movies_df)

# Top-3 recommended items for every user.
userRecs = model.recommendForAllUsers(numItems=3)

In [6]:
# RMSE evaluator comparing the "rating" column with the "prediction" column.
evaluator = RegressionEvaluator(
    labelCol="rating",
    predictionCol="prediction",
    metricName="rmse",
)

# Score movies_df with the fitted model. This is the same DataFrame the model
# was fitted on, so any error computed from these predictions is a
# training-set figure, not a held-out estimate.
predictions = model.transform(movies_df)
predictions.show(truncate=False)

+------+-------+------+----------+
|userID|movieID|rating|prediction|
+------+-------+------+----------+
|0     |2      |3     |2.7966778 |
|0     |3      |1     |0.9393283 |
|0     |5      |2     |1.3681284 |
|0     |9      |4     |2.0185423 |
|0     |11     |1     |1.6565835 |
|0     |12     |2     |2.0901403 |
|0     |15     |1     |0.86459875|
|0     |17     |1     |0.80446655|
|0     |19     |1     |1.454981  |
|0     |21     |1     |1.4233297 |
|0     |23     |1     |1.3722543 |
|0     |26     |3     |2.080988  |
|0     |27     |1     |0.51804197|
|0     |28     |1     |1.7100291 |
|0     |29     |1     |1.8323538 |
|0     |30     |1     |0.6214356 |
|0     |31     |1     |1.3846567 |
|0     |34     |1     |0.86154556|
|0     |37     |1     |1.4897687 |
|0     |41     |2     |2.1819613 |
+------+-------+------+----------+
only showing top 20 rows



In [7]:
# Root-mean-square error between the "rating" and "prediction" columns of the
# predictions DataFrame, as configured on the evaluator.
rmse = evaluator.evaluate(dataset=predictions)
print(rmse)

0.5183831336575359
