# <center> <img src="../labs/img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> **Procesamiento de Datos Masivos** </center>
---
### <center> **Primavera 2025** </center>
---
### <center> **Ejemplos de Aprendizaje Automático (Machine Learning): Recommendation Systems** </center>

---
**Profesor**: Dr. Pablo Camarillo Ramirez

In [None]:
# Must run before any `pyspark` import: findspark locates the local Spark
# installation and makes the `pyspark` package importable in this kernel.
import findspark
findspark.init()

#### Creación de la conexión con el clúster de Spark


In [None]:
import os

from pyspark.sql import SparkSession

# Address of the Spark standalone master. The default is the host this notebook
# was originally written against (it looks like a Docker container ID, which
# changes whenever the container is recreated), so it can be overridden through
# the SPARK_MASTER_URL environment variable without editing the notebook.
spark_master_url = os.environ.get("SPARK_MASTER_URL", "spark://078b2e28e517:7077")

# Build (or reuse) the session for this notebook and expose its SparkContext.
spark = (
    SparkSession.builder
    .appName("MLSpark-Recommender-Systems")
    .master(spark_master_url)
    .config("spark.ui.port", "4040")
    .getOrCreate()
)
sc = spark.sparkContext

# Lower the number of shuffle partitions to 5 (Spark SQL defaults to 200),
# which is plenty for the small in-memory data used in this notebook.
spark.conf.set("spark.sql.shuffle.partitions", "5")

# Preparación de Datos

In [None]:
from team_name.spark_utils import SparkUtils

# Toy user-song interactions, one (user_id, song_id, rating) triple per row,
# grouped here by user.
data = [
    (1, 1, 4), (1, 2, 5), (1, 5, 5),
    (2, 2, 3), (2, 3, 4), (2, 4, 3),
    (3, 1, 2), (3, 3, 5), (3, 5, 1),
]

# Column names and type names handed to the project helper that builds the
# schema (presumably a Spark StructType -- confirm in SparkUtils).
interaction_columns = [
    ("user_id", "integer"),
    ("song_id", "integer"),
    ("rating", "integer"),
]
schema = SparkUtils.generate_schema(interaction_columns)

# Materialise the interactions as a Spark DataFrame with that schema.
interactions_df = spark.createDataFrame(data, schema=schema)

# Configure ALS model

In [None]:
from pyspark.ml.recommendation import ALS

# Alternating Least Squares estimator over (user_id, song_id, rating) rows.
als = ALS(
    userCol="user_id",
    itemCol="song_id",
    ratingCol="rating",
    maxIter=10,
    regParam=0.1,
    rank=5,  # Dimensionality of the latent vector space for users and items.
    coldStartStrategy="drop",  # Drop rows whose prediction would be NaN.
    # Pin the seed used to initialise the factors. Without it PySpark derives a
    # default from a Python hash that varies per process, so recommendations
    # and RMSE would change on every kernel restart.
    seed=42,
)

# TRAINING

In [None]:
# Fit the ALS estimator on the full interactions DataFrame (no train/test
# split); `model` is the fitted model used by the recommendation and
# prediction cells that follow.
model = als.fit(interactions_df)

# PREDICTIONS

In [None]:
# Ask the fitted model for the 5 highest-scoring songs per user. Each output
# row carries a user_id and a `recommendations` array of (song_id, rating)
# structs.
user_recommendations = model.recommendForAllUsers(5)

# Print the result without truncating the recommendations column.
user_recommendations.show(truncate=False)

## Song metadata

In [None]:
# Song catalogue as (song_id, title) pairs; the ids match the song_id values
# used in the interaction data.
songs = [
    (1, "NOKIA"),
    (2, "luther"),
    (3, "Ordinary"),
    (4, "Die With A Smile"),
    (5, "APT"),
]

# Build the schema with the project helper, then the lookup DataFrame.
song_columns = [("song_id", "integer"), ("title", "string")]
songs_schema = SparkUtils.generate_schema(song_columns)
songs_df = spark.createDataFrame(songs, schema=songs_schema)

In [None]:
from pyspark.sql.functions import explode

# Turn each user's recommendations array into one row per recommended song;
# the struct for that song is kept in the `rec` column.
exploded_recs = user_recommendations.select(
    "user_id",
    explode("recommendations").alias("rec"),
)

# Attach the song title by matching the recommended song_id against the
# catalogue, and keep only the user, the title and the predicted rating.
same_song = exploded_recs["rec"]["song_id"] == songs_df["song_id"]
recommendations = (
    exploded_recs
    .join(songs_df, same_song)
    .select("user_id", "title", "rec.rating")
)

# Display the titled recommendations in full.
recommendations.show(truncate=False)

## Predictions for all data

In [None]:
# Score every known (user, song) pair: transform() appends a `prediction`
# column to the interactions. Note these are the same rows the model was
# fitted on, so the predictions reflect in-sample fit, not unseen data.
predictions = model.transform(interactions_df)
predictions.show(truncate=False)

# EVALUATE MODEL

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator

# RMSE evaluator comparing the observed `rating` with the model's `prediction`.
evaluator = (
    RegressionEvaluator()
    .setMetricName("rmse")
    .setLabelCol("rating")
    .setPredictionCol("prediction")
)

# `predictions` was produced from the training interactions, so this value is
# the training error rather than an estimate of generalisation error.
rmse = evaluator.evaluate(predictions)
print(f"Root-mean-square error (RMSE) = {rmse}")

In [None]:
# Stop the SparkSession rather than only its SparkContext: this shuts down the
# underlying context and also clears the cached default/active session, so a
# later getOrCreate() in the same kernel starts from a clean state.
spark.stop()