In [1]:
!sudo apt update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# Check this site for the latest download link
# https://www.apache.org/dyn/closer.lua/spark
!wget -q https://dlcdn.apache.org/spark/spark-3.5.2/spark-3.5.2-bin-hadoop3.tgz
!tar xf spark-3.5.2-bin-hadoop3.tgz
!pip install -q findspark
!pip install pyspark
!pip install py4j

[33m0% [Working][0m            Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
[33m0% [Connecting to archive.ubuntu.com (185.125.190.82)] [Waiting for headers] [1[0m[33m0% [Connecting to archive.ubuntu.com (185.125.190.82)] [Waiting for headers] [C[0m                                                                               Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
[33m0% [Connecting to archive.ubuntu.com (185.125.190.82)] [Waiting for headers] [W[0m                                                                               Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:4 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Hit:7 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:8 https

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, FloatType, IntegerType, StringType
from pyspark.sql.functions import col,isnan, when, count

# Create Spark Session in localhost
spark = SparkSession.builder.master("local").\
    appName("ITESO-ALS").\
    getOrCreate()

# Set spark context
sc = spark.sparkContext
sc.setLogLevel("ERROR")


movies_schema = StructType([
    StructField("userId", IntegerType(), True),
    StructField("movieId", IntegerType(), True),
    StructField("rating", IntegerType(), True),
])

movies_df = spark.read.format("csv").\
    option("delimiter", "::").\
    option("mode", "permissive").\
    option("path", "/content/sample_movielens_ratings.txt").\
    schema(movies_schema).\
    load()

movies_df.printSchema()
movies_df.show()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: integer (nullable = true)

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|     0|      2|     3|
|     0|      3|     1|
|     0|      5|     2|
|     0|      9|     4|
|     0|     11|     1|
|     0|     12|     2|
|     0|     15|     1|
|     0|     17|     1|
|     0|     19|     1|
|     0|     21|     1|
|     0|     23|     1|
|     0|     26|     3|
|     0|     27|     1|
|     0|     28|     1|
|     0|     29|     1|
|     0|     30|     1|
|     0|     31|     1|
|     0|     34|     1|
|     0|     37|     1|
|     0|     41|     2|
+------+-------+------+
only showing top 20 rows



In [4]:
from pyspark.ml.recommendation import ALS

# Configure ALS model
als = ALS(
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    maxIter=10,
    regParam=0.1,
    rank=5,
    coldStartStrategy="drop"
)

# Train the model
model = als.fit(movies_df)

# Generate recommendations for each user
user_recommendations = model.recommendForAllUsers(numItems=3)
user_recommendations.show(truncate=False)

+------+---------------------------------------------------+
|userId|recommendations                                    |
+------+---------------------------------------------------+
|20    |[{22, 3.4345348}, {51, 3.23853}, {75, 3.02112}]    |
|10    |[{93, 2.7857873}, {2, 2.4791286}, {25, 2.4388287}] |
|0     |[{93, 2.2207298}, {29, 2.2032633}, {25, 2.1398427}]|
|1     |[{22, 2.8081403}, {90, 2.6688783}, {51, 2.4775007}]|
|21    |[{52, 4.1361566}, {29, 4.1086993}, {2, 3.5641809}] |
|11    |[{32, 5.1655707}, {49, 4.9204955}, {23, 4.6741686}]|
|12    |[{46, 5.717836}, {55, 4.7345004}, {90, 4.419613}]  |
|22    |[{75, 4.4798894}, {51, 4.458965}, {22, 4.1424575}] |
|2     |[{93, 4.215927}, {8, 3.9464414}, {83, 3.833647}]   |
|13    |[{93, 2.7292414}, {74, 2.6835606}, {2, 2.659275}]  |
|3     |[{51, 3.9032552}, {75, 3.8359852}, {30, 3.7657852}]|
|23    |[{46, 5.5694184}, {55, 4.775144}, {90, 4.617609}]  |
|4     |[{29, 3.2664168}, {52, 3.2151625}, {2, 3.1123965}] |
|24    |[{52, 4.379621},

In [5]:
from pyspark.ml.evaluation import RegressionEvaluator

predictions = model.transform(movies_df)
predictions.show(truncate=False)

# Set up evaluator to compute RMSE
evaluator = RegressionEvaluator(
metricName="rmse", labelCol="rating", predictionCol="prediction")

# Calculate RMSE
rmse = evaluator.evaluate(predictions)
print(rmse)


+------+-------+------+----------+
|userId|movieId|rating|prediction|
+------+-------+------+----------+
|28    |0      |3     |2.0495927 |
|28    |1      |1     |1.0879441 |
|28    |2      |4     |3.208639  |
|28    |3      |1     |0.82585263|
|28    |6      |1     |0.96035266|
|28    |7      |1     |2.007586  |
|28    |12     |5     |2.5029192 |
|28    |13     |2     |1.8718919 |
|28    |14     |1     |1.4563401 |
|28    |15     |1     |1.296754  |
|28    |17     |1     |1.0391803 |
|28    |19     |3     |2.8770504 |
|28    |20     |1     |0.78387463|
|28    |23     |3     |2.5050588 |
|28    |24     |3     |2.1648526 |
|28    |27     |1     |1.8152664 |
|28    |29     |1     |1.8170805 |
|28    |33     |1     |1.8228741 |
|28    |34     |1     |2.2528431 |
|28    |36     |1     |1.1669419 |
+------+-------+------+----------+
only showing top 20 rows

0.5869116255989768
