In [1]:
# Attach Google Drive to the Colab runtime; the ratings file read later lives
# under the mounted MyDrive folder.
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
!sudo apt update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# Check this site for the latest download link
# https://www.apache.org/dyn/closer.lua/spark
!wget -q https://dlcdn.apache.org/spark/spark-3.5.2/spark-3.5.2-bin-hadoop3.tgz
!tar xf spark-3.5.2-bin-hadoop3.tgz
!pip install -q findspark
!pip install pyspark
!pip install py4j

[33m0% [Working][0m            Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
[33m0% [Connecting to archive.ubuntu.com (185.125.190.82)] [Connecting to security.[0m[33m0% [Connecting to archive.ubuntu.com (185.125.190.82)] [Connecting to security.[0m                                                                               Hit:2 http://archive.ubuntu.com/ubuntu jammy InRelease
[33m0% [Waiting for headers] [Waiting for headers] [Waiting for headers] [Connected[0m                                                                               Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
[33m0% [Waiting for headers] [3 InRelease 14.2 kB/129 kB 11%] [Waiting for headers][0m                                                                               Hit:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Get:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease 

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, FloatType, IntegerType
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression

# Start (or reuse) a local-mode Spark session for this notebook.
spark = (
    SparkSession.builder
    .master("local")
    .appName("ITESO-ALS")
    .getOrCreate()
)

# Keep a handle on the SparkContext and only surface ERROR-level log lines.
sc = spark.sparkContext
sc.setLogLevel("ERROR")

# The ratings file is read as "::"-delimited text with no header row; column
# types are inferred and the four columns are then named positionally.
file_path = "/content/drive/MyDrive/sample_movielens_ratings.txt"
ratings_df = (
    spark.read.format("csv")
    .option("delimiter", "::")
    .option("inferSchema", "true")
    .option("header", "false")
    .load(file_path)
    .toDF("userId", "movieId", "rating", "timestamp")
)

# Preview: first 10 rows untruncated, then the inferred schema, then 5 rows.
ratings_df.show(n=10, truncate=False)
ratings_df.printSchema()
ratings_df.show(5)

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: integer (nullable = true)

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|0     |2      |3     |
|0     |3      |1     |
|0     |5      |2     |
|0     |9      |4     |
|0     |11     |1     |
|0     |12     |2     |
|0     |15     |1     |
|0     |17     |1     |
|0     |19     |1     |
|0     |21     |1     |
+------+-------+------+
only showing top 10 rows



In [None]:
from pyspark.ml.recommendation import ALS

# Configure the ALS recommender on the (userId, movieId, rating) columns.
als = ALS(
    maxIter=10,
    regParam=0.1,
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    # "drop" removes rows whose prediction would be NaN (users/items unseen
    # at fit time) so downstream metrics are not poisoned by NaN.
    coldStartStrategy="drop",
    # ALS initialises its factors randomly; pin the seed so the fitted model
    # and every number derived from it are repeatable after a kernel restart.
    seed=42,
)

# Fit the factorisation on the full ratings DataFrame.
model = als.fit(ratings_df)

+------+---------------------------------------------------+
|userId|recommendations                                    |
+------+---------------------------------------------------+
|20    |[{22, 3.1700583}, {75, 3.0720527}, {77, 3.0376258}]|
|10    |[{92, 3.1328428}, {2, 2.9586518}, {49, 2.759566}]  |
|0     |[{2, 2.4901946}, {92, 2.362689}, {62, 2.3117182}]  |
|1     |[{22, 2.6123614}, {32, 2.5482397}, {90, 2.5390642}]|
|21    |[{29, 3.9293365}, {52, 3.860653}, {62, 3.4957798}] |
|11    |[{30, 4.499502}, {23, 4.305071}, {32, 4.2971106}]  |
|12    |[{46, 5.7581706}, {90, 4.6824193}, {55, 4.6630015}]|
|22    |[{75, 4.5056486}, {51, 4.279585}, {30, 4.2458215}] |
|2     |[{93, 4.3672304}, {8, 4.147073}, {83, 4.080389}]   |
|13    |[{93, 2.7201996}, {74, 2.5777538}, {8, 2.461563}]  |
|3     |[{30, 4.131316}, {51, 4.087223}, {69, 3.9870188}]  |
|23    |[{46, 5.764672}, {55, 4.7219243}, {90, 4.649307}]  |
|4     |[{2, 3.266941}, {29, 3.1258602}, {62, 3.10903}]    |
|24    |[{52, 4.3746667}

In [None]:
# RegressionEvaluator is not imported anywhere else in the notebook; without
# this import the cell fails with NameError on a fresh kernel.
from pyspark.ml.evaluation import RegressionEvaluator

# Score the same ratings the model was fitted on, so the value printed below
# is an in-sample (training) error rather than a held-out estimate.
predictions = model.transform(ratings_df)

# Compare the observed "rating" column against ALS's "prediction" column.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction"
)
rmse = evaluator.evaluate(predictions)
print(f"Root-mean-square error (RMSE): {rmse}")

In [None]:
# Ask the fitted model for the 5 highest-scoring movies per user.
user_recommendations = model.recommendForAllUsers(5)
# truncate=False keeps the full recommendations array visible in each row.
user_recommendations.show(truncate=False)

+------+-------+------+----------+
|userId|movieId|rating|prediction|
+------+-------+------+----------+
|28    |0      |3     |2.4182842 |
|28    |1      |1     |0.9200727 |
|28    |2      |4     |3.6259902 |
|28    |3      |1     |0.7575352 |
|28    |6      |1     |0.8285437 |
|28    |7      |1     |2.2961988 |
|28    |12     |5     |2.5802782 |
|28    |13     |2     |2.016483  |
|28    |14     |1     |1.6457486 |
|28    |15     |1     |1.1707748 |
|28    |17     |1     |1.1201245 |
|28    |19     |3     |2.3568447 |
|28    |20     |1     |0.9272383 |
|28    |23     |3     |2.7635028 |
|28    |24     |3     |2.186849  |
|28    |27     |1     |1.0844305 |
|28    |29     |1     |1.5763125 |
|28    |33     |1     |1.2678756 |
|28    |34     |1     |1.6342938 |
|28    |36     |1     |1.4977014 |
+------+-------+------+----------+
only showing top 20 rows

0.5841209737685337
