# <center> <img src="../labs/img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> **Procesamiento de Datos Masivos** </center>
---
### <center> **Primavera 2025** </center>
---
### <center> **Lab 12** </center>

---
**Profesor**: Dr. Pablo Camarillo Ramirez

**Integrantes**:
- Lorena Ruelas Gaytán
- Yael Alejandro Rodríguez Barreto
- Ximena Isaac Horta
- Alberto Renteria Camacho

In [7]:
# Make the Spark installation importable from this kernel. This has to run
# before the `pyspark` imports in the cells below.
import findspark
findspark.init()

#### Spark Connection


In [8]:
import os

from pyspark.sql import SparkSession

# The host part of the default master URL looks like a generated container ID,
# which would change whenever the Spark master container is recreated. Allow
# overriding it through the SPARK_MASTER_URL environment variable so the
# notebook does not have to be edited; the fallback is the original value.
SPARK_MASTER_URL = os.environ.get("SPARK_MASTER_URL", "spark://9c456350d25c:7077")

# Build (or reuse) the session for this lab and keep a handle on its context.
spark = (
    SparkSession.builder
    .appName("MLSpark-Recommender-Systems")
    .master(SPARK_MASTER_URL)
    .config("spark.ui.port", "4040")
    .getOrCreate()
)
sc = spark.sparkContext

# Override the shuffle partition count for this session (small dataset).
spark.conf.set("spark.sql.shuffle.partitions", "5")

#### Data preparation

In [9]:
from team_name.spark_utils import SparkUtils

# Ratings file: fields are separated by "::" and there is no header row.
data_path = "/home/jovyan/notebooks/data/movie_scores/sample_movielens_ratings.txt"

# Explicit schema for the four fields of each record, built with the
# project's SparkUtils helper from (column name, type name) pairs.
schema = SparkUtils.generate_schema(
    [
        ("userId", "integer"),
        ("movieId", "integer"),
        ("rating", "integer"),
        ("date", "timestamp"),
    ]
)

# Read the file with that schema and discard `date` in the same chain: only
# userId, movieId and rating are fed to the recommender afterwards.
rating_df = (
    spark.read
    .schema(schema)
    .option("header", "false")
    .option("delimiter", "::")
    .csv(data_path)
    .drop("date")
)

# Quick sanity check of the loaded data and its column types.
rating_df.show(10)
rating_df.printSchema()

[Stage 0:>                                                          (0 + 1) / 1]

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|     0|      2|     3|
|     0|      3|     1|
|     0|      5|     2|
|     0|      9|     4|
|     0|     11|     1|
|     0|     12|     2|
|     0|     15|     1|
|     0|     17|     1|
|     0|     19|     1|
|     0|     21|     1|
+------+-------+------+
only showing top 10 rows

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: integer (nullable = true)



                                                                                

#### Configure ALS model

In [10]:
from pyspark.ml.recommendation import ALS

# ALS estimator for explicit ratings read from rating_df's columns.
als = ALS(
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    maxIter=25,
    regParam=0.1,
    rank=5,  # Controls the dimensionality of the latent vector space for
             # users and items.
    coldStartStrategy="drop",  # Avoids NaN predictions
    seed=42,  # Fixed seed so the random factor initialization, and therefore
              # the recommendations and RMSE, are repeatable between runs.
)

#### Training

In [11]:
# Fit the ALS estimator on the complete ratings DataFrame. No train/test
# split is made, so every rating is used for training.
model_als = als.fit(rating_df)

                                                                                

#### Predictions

In [12]:
# Top 10 recommended movies for every user. Each element of the
# `recommendations` array pairs a movie id with its predicted rating.
user_recommendations = model_als.recommendForAllUsers(numItems=10)
# truncate=False prints the full recommendation arrays instead of cutting them.
user_recommendations.show(truncate=False)



+------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|userId|recommendations                                                                                                                                                          |
+------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|0     |[{92, 2.5959032}, {2, 2.3083057}, {62, 2.2538722}, {25, 2.1969426}, {93, 2.1477091}, {89, 2.0750296}, {41, 2.0114007}, {12, 1.9659108}, {4, 1.8798398}, {40, 1.8344107}] |
|10    |[{92, 2.7782347}, {2, 2.6499631}, {25, 2.6480808}, {93, 2.6063313}, {49, 2.5840454}, {46, 2.5223231}, {12, 2.5001009}, {89, 2.485821}, {91, 2.3178747}, {81, 2.299457}]  |
|20    |[{22, 3.561147}, {68, 3.113898}, {94, 3.0776765}, {51, 3.074668}, {77, 3.0353088}, {75, 3.0239546

                                                                                

#### Predictions for all data

In [13]:
# Score every (userId, movieId) pair in rating_df, adding a `prediction`
# column. Note this is the same DataFrame the model was fitted on.
predictions = model_als.transform(rating_df)
predictions.show(truncate=False)

                                                                                

+------+-------+------+----------+
|userId|movieId|rating|prediction|
+------+-------+------+----------+
|22    |0      |1     |0.95811313|
|22    |3      |2     |1.6246264 |
|22    |5      |2     |2.0358346 |
|22    |6      |2     |2.2903247 |
|22    |9      |1     |1.5327815 |
|22    |10     |1     |1.4324794 |
|22    |11     |1     |1.2992125 |
|22    |13     |1     |1.5952772 |
|22    |14     |1     |1.3866265 |
|22    |16     |1     |0.70083123|
|22    |18     |3     |3.0301585 |
|22    |19     |1     |1.454614  |
|22    |22     |5     |4.1084185 |
|22    |25     |1     |0.9898241 |
|22    |26     |1     |1.1388094 |
|22    |29     |3     |3.2434077 |
|22    |30     |5     |3.9994707 |
|22    |32     |4     |3.1795506 |
|22    |33     |1     |0.91275346|
|22    |35     |1     |0.7472613 |
+------+-------+------+----------+
only showing top 20 rows



                                                                                

#### Evaluate model

In [14]:
from pyspark.ml.evaluation import RegressionEvaluator
# RMSE between the observed `rating` and the model's `prediction` column.
# NOTE: `predictions` was produced from the same rating_df the model was
# fitted on, so this figure is training error, not a held-out estimate.
evaluator = RegressionEvaluator(
    labelCol="rating",
    predictionCol="prediction",
    metricName="rmse",
)

# Compute the metric over all scored rows and report it.
rmse = evaluator.evaluate(predictions)
print(f"Root-mean-square error (RMSE) = {rmse}")

[Stage 401:>                                                        (0 + 1) / 1]

Root-mean-square error (RMSE) = 0.5694858196672336


                                                                                

In [15]:
# Shut down through the SparkSession rather than the bare SparkContext: this
# stops the underlying context as well and clears the cached session, so
# re-running the notebook in the same kernel starts from a fresh session.
spark.stop()