# <center> <img src="img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> **Carrera: Ingeniería en Sistemas Computacionales** </center>
---
### <center> **Primavera 2025** </center>
---

**Lab 12**: Recommendation System with ALS

**Fecha**: 02/05/25

**Nombre del Estudiante**: Angel Ramirez, Roberto Osorno, Yochabel Cazares, Samuel Romero

**Profesor**: Pablo Camarillo Ramirez

In [31]:
import findspark
findspark.init()

In [32]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("MLSpark-Recommender-Systems") \
    .master("spark://f04d2745dc57:7077") \
    .config("spark.ui.port","4040") \
    .getOrCreate()
sc = spark.sparkContext
spark.conf.set("spark.sql.shuffle.partitions", "5")

### Creación del dataframe

In [33]:
from team_name.spark_utils import SparkUtils

movie_ratings_schema = SparkUtils.generate_schema([
    ("userId", "integer"),
    ("movieId", "integer"),
    ("rating", "float"),
    ("timestamp", "integer")
])

movie_ratings_df = spark.read \
                  .schema(movie_ratings_schema) \
                  .option("header", "false") \
                  .option("sep", "::") \
                  .csv("/home/jovyan/notebooks/data/sample_movielens_ratings.txt")

movie_ratings_df.printSchema()
movie_ratings_df.show(5, truncate=False)


root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: float (nullable = true)
 |-- timestamp: integer (nullable = true)



[Stage 0:>                                                          (0 + 1) / 1]

+------+-------+------+----------+
|userId|movieId|rating|timestamp |
+------+-------+------+----------+
|0     |2      |3.0   |1424380312|
|0     |3      |1.0   |1424380312|
|0     |5      |2.0   |1424380312|
|0     |9      |4.0   |1424380312|
|0     |11     |1.0   |1424380312|
+------+-------+------+----------+
only showing top 5 rows



                                                                                

### Configurar el modelo ALS

In [34]:
from pyspark.ml.recommendation import ALS

als = ALS(
    userCol="userId",
    itemCol="movieId",
    ratingCol = "rating",
    maxIter=10, 
    regParam=0.1, 
    rank=5, # Controls the dimensionality of the latent vector space for 
            # users and items.
    coldStartStrategy="drop"  # Avoids NaN predictions
)

### Training

In [35]:
model = als.fit(movie_ratings_df)

                                                                                

### Predicciones

In [36]:
# Generate recommendations for each user
user_recommendations = model.recommendForAllUsers(numItems=5)

# Show recommendations
user_recommendations.show(truncate=False)



+------+-------------------------------------------------------------------------------------+
|userId|recommendations                                                                      |
+------+-------------------------------------------------------------------------------------+
|0     |[{92, 2.5840385}, {2, 2.316802}, {62, 2.2325232}, {25, 2.157748}, {93, 2.1528697}]   |
|10    |[{92, 2.768342}, {2, 2.6728113}, {93, 2.6242015}, {25, 2.5927775}, {49, 2.5867324}]  |
|20    |[{22, 3.5597918}, {68, 3.1278815}, {94, 3.084497}, {51, 3.0827737}, {77, 3.0246763}] |
|1     |[{22, 2.9029422}, {68, 2.630123}, {77, 2.5238972}, {62, 2.501064}, {90, 2.4797387}]  |
|11    |[{32, 5.082464}, {18, 4.705235}, {30, 4.6826043}, {27, 4.5120797}, {8, 4.229401}]    |
|21    |[{29, 4.320379}, {52, 4.2401457}, {76, 3.716108}, {63, 3.5063725}, {53, 3.4859684}]  |
|22    |[{51, 4.458179}, {75, 4.418395}, {22, 4.118836}, {74, 4.1007586}, {88, 4.0829244}]   |
|2     |[{93, 4.2531066}, {83, 4.1469526}, {8, 4.0

                                                                                

### Movies metadata

In [40]:
from pyspark.sql.functions import explode

movies = [
    (2, "Challengers"),
    (4, "Cars 2"),
    (6, "Eternal Sunshine"),
    (8, "Whiplash"),
    (10, "Interstellar")
]

movies_schema = SparkUtils.generate_schema([("movieId", "integer"), ("title", "string")])
movies_df = spark.createDataFrame(movies, movies_schema)


# Explode recommendations for easier reading
recommendations = user_recommendations.select("userId", explode("recommendations").alias("rec"))
recommendations = recommendations.join(movies_df, recommendations.rec.movieId == movies_df.movieId).select("movieId", "title", "rec.rating")

# Show user-song recommendations with titles
recommendations.show(truncate=False)




+-------+-----------+---------+
|movieId|title      |rating   |
+-------+-----------+---------+
|2      |Challengers|2.316802 |
|2      |Challengers|2.6728113|
|8      |Whiplash   |4.229401 |
|8      |Whiplash   |4.0344357|
|2      |Challengers|3.6790943|
|2      |Challengers|2.7636726|
|2      |Challengers|2.95915  |
|8      |Whiplash   |3.2529428|
|8      |Whiplash   |4.144908 |
|2      |Challengers|3.2185159|
+-------+-----------+---------+



                                                                                

### Predicciones de todos los datos

In [41]:
predictions = model.transform(movie_ratings_df)
predictions.show(truncate=False)

+------+-------+------+----------+----------+
|userId|movieId|rating|timestamp |prediction|
+------+-------+------+----------+----------+
|22    |0      |1.0   |1424380312|0.96697557|
|22    |3      |2.0   |1424380312|1.6326257 |
|22    |5      |2.0   |1424380312|2.0366673 |
|22    |6      |2.0   |1424380312|2.2972772 |
|22    |9      |1.0   |1424380312|1.5513803 |
|22    |10     |1.0   |1424380312|1.4349127 |
|22    |11     |1.0   |1424380312|1.2901659 |
|22    |13     |1.0   |1424380312|1.617328  |
|22    |14     |1.0   |1424380312|1.389045  |
|22    |16     |1.0   |1424380312|0.7093756 |
|22    |18     |3.0   |1424380312|3.0116072 |
|22    |19     |1.0   |1424380312|1.4644071 |
|22    |22     |5.0   |1424380312|4.118836  |
|22    |25     |1.0   |1424380312|0.97840077|
|22    |26     |1.0   |1424380312|1.1323681 |
|22    |29     |3.0   |1424380312|3.2431226 |
|22    |30     |5.0   |1424380312|3.9942718 |
|22    |32     |4.0   |1424380312|3.217519  |
|22    |33     |1.0   |1424380312|

### Evaluar el modelo

In [42]:
from pyspark.ml.evaluation import RegressionEvaluator
# Set up evaluator to compute RMSE
evaluator = RegressionEvaluator(
    metricName="rmse", 
    labelCol="rating", 
    predictionCol="prediction"
)

# Calculate RMSE
rmse = evaluator.evaluate(predictions)
print(f"Root-mean-square error (RMSE) = {rmse}")

[Stage 339:>                                                        (0 + 1) / 1]

Root-mean-square error (RMSE) = 0.5691166521341573


                                                                                

In [43]:
sc.stop()