# <center> <img src="../img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> **Procesamiento de Datos Masivos** </center>
---
### <center> **Primavera 2025** </center>
---
### <center> **Code Lab 12: Recommendation System with ALS** </center>

---
**Equipo**:
- Luis Raúl Acosta Mendoza
- Samantha Abigail Quintero Valadez 
- Arturo Benjamin Vergara Romo

**Profesor**: Dr. Pablo Camarillo Ramirez

In [117]:
import findspark
findspark.init()

In [118]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Lab12-Recommendation") \
    .master("spark://e0047bbfd1d0:7077") \
    .config("spark.ui.port","4040") \
    .getOrCreate()
sc = spark.sparkContext

### Data processing

In [119]:
from gatubelxs.spark_utils import SparkUtils

schema = SparkUtils.generate_schema([("userId", "integer"), ("movieId", "integer"), ("rating", "integer"), ("timestamp", "string")])

movies_df = spark \
            .read \
            .schema(schema) \
            .format("csv") \
            .option("delimiter", "::") \
            .load("/home/jovyan/notebooks/data/sample_movielens_ratings.txt")

In [120]:
movies_df.show(5)

[Stage 0:>                                                          (0 + 1) / 1]

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     0|      2|     3|1424380312|
|     0|      3|     1|1424380312|
|     0|      5|     2|1424380312|
|     0|      9|     4|1424380312|
|     0|     11|     1|1424380312|
+------+-------+------+----------+
only showing top 5 rows



                                                                                

### Configure ALS Model

In [121]:
from pyspark.ml.recommendation import ALS

als = ALS(
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    maxIter=10,
    regParam=0.05,
    rank=5,
    coldStartStrategy="drop"
)

### Training

In [122]:
model = als.fit(movies_df)

                                                                                

### Predictions

In [123]:
user_recommendations = model.recommendForAllUsers(numItems=5)
user_recommendations.show(5, truncate=False)



+------+-------------------------------------------------------------------------------------+
|userId|recommendations                                                                      |
+------+-------------------------------------------------------------------------------------+
|20    |[{22, 3.8580954}, {68, 3.4262178}, {77, 3.3351588}, {51, 3.3038273}, {75, 3.2498941}]|
|10    |[{93, 3.0467818}, {92, 3.0068407}, {2, 2.9190967}, {46, 2.9074593}, {12, 2.8037267}] |
|0     |[{92, 2.7708678}, {2, 2.4770694}, {93, 2.4511714}, {62, 2.2976608}, {25, 2.2362418}] |
|1     |[{22, 3.1412663}, {68, 2.8987775}, {77, 2.7669766}, {62, 2.6009793}, {90, 2.5715427}]|
|21    |[{29, 4.6182933}, {52, 4.4885855}, {76, 4.272509}, {70, 3.8382668}, {63, 3.7778318}] |
+------+-------------------------------------------------------------------------------------+
only showing top 5 rows



                                                                                

In [124]:
from pyspark.sql.functions import explode
recommendations = user_recommendations.select("userId", explode("recommendations").alias("rec"))
recommendations = recommendations.join(movies_df.alias("movies"), recommendations.rec.movieId == movies_df.movieId).select("movies.userId", "movieId", "rec.rating")

recommendations.show(10, truncate=False)

                                                                                

+------+-------+---------+
|userId|movieId|rating   |
+------+-------+---------+
|29    |22     |3.8580954|
|27    |22     |3.8580954|
|26    |22     |3.8580954|
|25    |22     |3.8580954|
|23    |22     |3.8580954|
|22    |22     |3.8580954|
|21    |22     |3.8580954|
|20    |22     |3.8580954|
|18    |22     |3.8580954|
|17    |22     |3.8580954|
+------+-------+---------+
only showing top 10 rows



In [125]:
predictions = model.transform(movies_df)
predictions.show(20, truncate=False)

                                                                                

+------+-------+------+----------+----------+
|userId|movieId|rating|timestamp |prediction|
+------+-------+------+----------+----------+
|28    |0      |3     |1424380312|2.7663298 |
|28    |1      |1     |1424380312|1.7222251 |
|28    |2      |4     |1424380312|3.6112382 |
|28    |3      |1     |1424380312|0.75339115|
|28    |6      |1     |1424380312|0.6788493 |
|28    |7      |1     |1424380312|0.97086036|
|28    |12     |5     |1424380312|3.7963262 |
|28    |13     |2     |1424380312|1.4655552 |
|28    |14     |1     |1424380312|1.0429771 |
|28    |15     |1     |1424380312|1.3543348 |
|28    |17     |1     |1424380312|1.0654027 |
|28    |19     |3     |1424380312|2.7879913 |
|28    |20     |1     |1424380312|1.7845364 |
|28    |23     |3     |1424380312|2.2558138 |
|28    |24     |3     |1424380312|2.542177  |
|28    |27     |1     |1424380312|0.60613364|
|28    |29     |1     |1424380312|0.9979563 |
|28    |33     |1     |1424380312|1.6769428 |
|28    |34     |1     |1424380312|

### Evaluate Model

In [126]:
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(
    metricName = "rmse",
    labelCol = "rating",
    predictionCol = "prediction"
)

rmse = evaluator.evaluate(predictions)

print(f"Root-mean-square error = {rmse}")

                                                                                

Root-mean-square error = 0.5246847912282775


In [116]:
sc.stop()