# <center> <img src="../labs/img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> **Procesamiento de Datos Masivos** </center>
---
### <center> **Primavera 2025** </center>
---
### <center> **Lab12 Grupo Foraneos - Ejemplo de Sistema de Recomendacion con ALS** </center>

---
**Profesor**: Dr. Pablo Camarillo Ramirez

In [84]:
import findspark
findspark.init()

#### Creacion de la conexión con el cluster de spark


In [85]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Movie-Recommender-Systems") \
    .master("spark://0638c7435d1d:7077") \
    .config("spark.ui.port","4040") \
    .getOrCreate()
sc = spark.sparkContext
spark.conf.set("spark.sql.shuffle.partitions", "5")

# Preparación de Datos

In [86]:
from team_name.spark_utils import SparkUtils

 
# Define schema for the DataFrame
schema = SparkUtils.generate_schema([("user_id", "integer"), ("movie_id", "integer"), ("rating", "integer"), ("timestamp", "integer")])


In [87]:
# Read DataFrame
df = spark \
                .read \
                .schema(schema) \
                .option("delimiter", "::") \
                .option("header", "true") \
                .option("mode", "dropMalformed")\
                .csv("/home/jovyan/notebooks/data/movie_recommendations/sample_movielens_ratings.csv")
                

In [88]:
#remove timestamp column
movie_rating_ds = df.select("user_id", "movie_id", "rating")

# Configure ALS model

In [89]:
from pyspark.ml.recommendation import ALS

als = ALS(
    userCol="user_id", 
    itemCol="movie_id", 
    ratingCol="rating", 
    maxIter=10, 
    regParam=0.01,       #
    rank=5, # Controls the dimensionality of the latent vector space for 
            # users and items. (but here we have only 3)
    coldStartStrategy="drop"  # Avoids NaN predictions
)

# TRAINNING

#### all data is used for ttraining since its no trainng in common sense but we just learn patterns

In [90]:
model = als.fit(movie_rating_ds)

                                                                                

# PREDICTIONS  - 5 Recommendations for all users

In [91]:
# Generate recommendations for each user
user_recommendations = model.recommendForAllUsers(numItems=5)

# Show recommendations
#user_recommendations.show(n=10, truncate=False)

# shows for every user the song id and the respective predicted rating

In [92]:
from pyspark.sql.functions import explode, col

# Explode recommendations for easier reading
recommendations = user_recommendations.select("user_id", explode("recommendations").alias("rec")) \
    .select("user_id", "rec.movie_id", "rec.rating") \
    .orderBy(col("user_id").asc())

# Show user-song recommendations with titles
recommendations.show(n=10,truncate=False)



+-------+--------+---------+
|user_id|movie_id|rating   |
+-------+--------+---------+
|0      |92      |2.756482 |
|0      |93      |2.4707947|
|0      |2       |2.4706147|
|0      |41      |2.1942089|
|0      |62      |2.1325884|
|1      |77      |2.9480083|
|1      |28      |2.804208 |
|1      |22      |3.2507768|
|1      |98      |2.701512 |
|1      |68      |3.087799 |
+-------+--------+---------+
only showing top 10 rows



                                                                                

## Predictions for all data

In [93]:
predictions = model.transform(movie_rating_ds)
predictions.show(n=10,truncate=False)

[Stage 103:>                                                        (0 + 1) / 1]

+-------+--------+------+----------+
|user_id|movie_id|rating|prediction|
+-------+--------+------+----------+
|22     |0       |1     |0.95919204|
|22     |3       |2     |1.8303176 |
|22     |5       |2     |1.9907775 |
|22     |6       |2     |2.4566476 |
|22     |9       |1     |1.7386383 |
|22     |10      |1     |1.5379798 |
|22     |11      |1     |1.0877988 |
|22     |13      |1     |1.6719738 |
|22     |14      |1     |1.398817  |
|22     |16      |1     |0.66279703|
+-------+--------+------+----------+
only showing top 10 rows



                                                                                

# EVALUATE MODEL

In [94]:
## compares the actual rratings for listened songs of each user
## with the predicted rating that gave the model for this user and this song

from pyspark.ml.evaluation import RegressionEvaluator
# Set up evaluator to compute RMSE
evaluator = RegressionEvaluator(
    metricName="rmse", 
    labelCol="rating", 
    predictionCol="prediction"
)

# Calculate RMSE - should be low
rmse = evaluator.evaluate(predictions)
print(f"\n Root-mean-square error (RMSE) = {rmse}")


 Root-mean-square error (RMSE) = 0.5173242623560674


                                                                                

In [95]:
sc.stop()