# <center> <img src="../img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> **Big Data** </center>
---
### <center> **Autumn 2025** </center>
---
### <center> **Examples on Machine Learning: Alternating Least Squares (ALS)** </center>
---
**Profesor**: Pablo Camarillo Ramirez

# Create SparkSession

In [None]:
import findspark
findspark.init()

from pyspark.sql import SparkSession

# Build (or reuse) the session for this notebook. The builder calls are
# chained inside parentheses, so no line-continuation backslashes are needed.
spark = (
    SparkSession.builder
    .appName("ML: ALS")
    .master("spark://spark-master:7077")
    .config("spark.ui.port", "4040")
    .getOrCreate()
)

# Keep a handle on the underlying SparkContext and set its log level.
sc = spark.sparkContext
sc.setLogLevel("INFO")

# Optimization: reduce the number of shuffle partitions used by Spark SQL.
spark.conf.set("spark.sql.shuffle.partitions", "5")

# Example 1: Songs recommendation

In [None]:
from pcamarillor.spark_utils import SparkUtils

# Sample user-song interactions. Each tuple is (user_id, song_id, rating),
# in the same order as the columns declared for the schema below.
data = [
    (1, 1, 4), (1, 2, 5), (1, 5, 5),
    (2, 2, 3), (2, 3, 4), (2, 4, 3),
    (3, 1, 2), (3, 3, 5), (3, 5, 1),
]

# Column names and type tags handed to the schema helper.
interaction_columns = [
    ("user_id", "int"),
    ("song_id", "int"),
    ("rating", "int"),
]
schema = SparkUtils.generate_schema(interaction_columns)

# Build the interactions DataFrame from the tuples and the schema.
interactions_df = spark.createDataFrame(data=data, schema=schema)

In [None]:
from pyspark.ml.recommendation import ALS

# Estimator settings gathered in one mapping and unpacked into ALS below.
als_settings = {
    "userCol": "user_id",
    "itemCol": "song_id",
    "ratingCol": "rating",
    "maxIter": 10,
    "regParam": 0.1,
    # Dimensionality of the latent vector space for users and items.
    "rank": 5,
    # Drop rows that would otherwise receive NaN predictions.
    "coldStartStrategy": "drop",
}

als = ALS(**als_settings)

In [None]:
# Fit the configured ALS estimator on the user-song interactions DataFrame.
model = als.fit(interactions_df)
# Confirmation message, printed once fit() has returned.
print("Recommendation system generated successfully")

In [None]:
# Ask the fitted model for recommendations, requesting this many songs per user.
songs_per_user = 5
user_recommendations = model.recommendForAllUsers(songs_per_user)

# Display the recommendations without truncating the column contents.
user_recommendations.show(truncate=False)

In [None]:
# Song catalogue as (song_id, title) pairs, used to attach titles to song ids.
songs = [(1, "song a"), (2, "song b"), (3, "song c"), (4, "song d"), (5, "song e")]

# Column names and type tags for the catalogue.
song_columns = [("song_id", "int"), ("title", "string")]
songs_schema = SparkUtils.generate_schema(song_columns)

songs_df = spark.createDataFrame(data=songs, schema=songs_schema)

In [None]:
from pyspark.sql.functions import explode

# One row per (user, recommended item): the "recommendations" array is
# exploded into a single struct column named "rec".
exploded = user_recommendations.select(
    "user_id",
    explode("recommendations").alias("rec"),
)

# Match the song id carried inside each "rec" struct with the catalogue's id.
same_song = exploded["rec"]["song_id"] == songs_df["song_id"]

# Attach titles and keep only the columns that are meant to be displayed.
recommendations = (
    exploded
    .join(songs_df, same_song)
    .select("user_id", "title", "rec.rating")
)

# Show user-song recommendations with titles
recommendations.show(truncate=False)

In [None]:
# Run the fitted model over interactions_df (the same DataFrame it was fitted on)
# and display every resulting row without truncating column values.
predictions = model.transform(interactions_df)
predictions.show(truncate=False)

In [None]:
# Evaluate the Recommendation System
from pyspark.ml.evaluation import RegressionEvaluator

# RMSE evaluator comparing the "rating" label column with the "prediction" column.
evaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="rating",
    metricName="rmse",
)

# NOTE: `predictions` was produced from interactions_df, the same data the
# model was fitted on, so this RMSE describes the fit to the training data.
rmse = evaluator.evaluate(predictions)
print(f"Root-mean-square error (RMSE) = {rmse}")

# Example 2: Movies

In [None]:
movies_ratings_path = "/opt/spark/work-dir/data/ml/als"

# Column names and type tags for the ratings data, in file column order.
movies_ratings_schema = SparkUtils.generate_schema([
    ("user_id", "int"),
    ("movie_id", "int"),
    ("rating", "int"),
    ("timestamp", "int"),
])

# Source https://github.com/databricks/Spark-The-Definitive-Guide/blob/master/data/sample_movielens_ratings.txt
# The data has no header row and separates fields with "::".
movies_ratings_df = (
    spark.read
    .options(header="false", delimiter="::")
    .schema(movies_ratings_schema)
    .csv(movies_ratings_path)
)

movies_ratings_df.printSchema()

## Create & Train the ML Model

In [None]:
# Configure the ALS estimator for the movie ratings.
als = ALS(
    # Columns holding the user id, the item id and the observed rating.
    userCol="user_id", itemCol="movie_id", ratingCol="rating",
    # Dimensionality of the latent vector space for users and items.
    rank=5,
    maxIter=10,
    regParam=0.1,
    # Drop rows that would otherwise receive NaN predictions.
    coldStartStrategy="drop",
)

# Train the model on the full ratings DataFrame (this step may take some time).
als_model = als.fit(movies_ratings_df)

## Persist the model

In [None]:
# Destination path for the trained movie ALS model.
als_model_path = "/opt/spark/work-dir/data/mlmodels/als/als_movies"
# Save the model to that path; overwrite() allows replacing a previously saved model.
als_model.write().overwrite().save(als_model_path)

## Predictions

In [None]:
from pyspark.ml.recommendation import ALSModel

# Reload the model from the path it was saved to.
als_model = ALSModel.load(path=als_model_path)

# Generate the top 5 recommendations for each user.
movies_per_user = 5
user_recommendations = als_model.recommendForAllUsers(movies_per_user)

# Display the recommendations without truncating the column contents.
user_recommendations.show(truncate=False)

## Test ML Model

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator

# Score every row of the ratings DataFrame with the loaded model.
# NOTE: movies_ratings_df is also the data the model was fitted on, so the
# RMSE below describes the fit to the training data.
predictions = als_model.transform(movies_ratings_df)

# RMSE evaluator comparing the "rating" label column with the "prediction" column.
evaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="rating",
    metricName="rmse",
)

# Compute and report the RMSE.
rmse = evaluator.evaluate(predictions)
print(f"Root-mean-square error (RMSE) = {rmse}")

In [None]:
# Stop the SparkContext now that the notebook's work is finished.
sc.stop()