# <center> <img src="../img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> **Big Data** </center>
---
### <center> **Autumn 2025** </center>
---
### <center> **Examples on Machine Learning: Alternating Least Squares (ALS)** </center>
---
**Profesor**: Pablo Camarillo Ramirez

# Create SparkSession

In [1]:
import findspark
findspark.init()

from pyspark.sql import SparkSession

# Build (or reuse) the session connected to the master at spark-master:7077.
spark = (
    SparkSession.builder
    .appName("ML: ALS")
    .master("spark://spark-master:7077")
    .config("spark.ui.port", "4040")
    .getOrCreate()
)

# Keep a handle on the session's SparkContext and set its log level to INFO.
sc = spark.sparkContext
sc.setLogLevel("INFO")

# Optimization (reduce the number of shuffle partitions)
spark.conf.set("spark.sql.shuffle.partitions", "5")

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/10/31 13:09:24 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Example 1: Song recommendation

In [2]:
from pcamarillor.spark_utils import SparkUtils

# Sample user-song interactions as (user_id, song_id, rating) tuples,
# one row of the literal per user.
data = [
    (1, 1, 4), (1, 2, 5), (1, 5, 5),
    (2, 2, 3), (2, 3, 4), (2, 4, 3),
    (3, 1, 2), (3, 3, 5), (3, 5, 1),
]

# Column names and type names handed to the project's schema helper
schema = SparkUtils.generate_schema(
    [
        ("user_id", "int"),
        ("song_id", "int"),
        ("rating", "int"),
    ]
)

# Materialise the interactions as a DataFrame and display them
interactions_df = spark.createDataFrame(data, schema)
interactions_df.show()

[Stage 0:>                                                          (0 + 1) / 1]

+-------+-------+------+
|user_id|song_id|rating|
+-------+-------+------+
|      1|      1|     4|
|      1|      2|     5|
|      1|      5|     5|
|      2|      2|     3|
|      2|      3|     4|
|      2|      4|     3|
|      3|      1|     2|
|      3|      3|     5|
|      3|      5|     1|
+-------+-------+------+



                                                                                

In [4]:
# Count the distinct songs (n) and distinct users (m) present in the interactions
n_songs = interactions_df.select("song_id").distinct().count()
m_users = interactions_df.select("user_id").distinct().count()
print(f"Number of items o canciones (n):{n_songs}")
print(f"Number of users (m):{m_users}")

Number of items o canciones (n):5
Number of users (m):3


In [5]:
from pyspark.ml.recommendation import ALS

# Estimator configuration for the song recommender.
als = ALS(
    userCol="user_id",
    itemCol="song_id",
    ratingCol="rating",
    maxIter=10,
    regParam=0.1,
    rank=5,  # Controls the dimensionality of the latent vector space for
             # users and items.
    coldStartStrategy="drop",  # Avoids NaN predictions
    seed=42,  # Pin the seed so that re-running the notebook on a fresh
              # kernel reproduces the same model and predictions.
)

In [6]:
# Fit the configured ALS estimator on the user-song ratings
model = als.fit(dataset=interactions_df)
print("Recommendation system generated successfully")

Recommendation system generated successfully


In [10]:
# Ask the fitted model for the 3 highest-scored songs of every user,
# then print the result without truncating the recommendation arrays.
user_recommendations = model.recommendForAllUsers(3)
user_recommendations.show(truncate=False)



+-------+-----------------------------------------------+
|user_id|recommendations                                |
+-------+-----------------------------------------------+
|1      |[{2, 4.947236}, {5, 4.8592587}, {1, 3.9400382}]|
|2      |[{3, 3.9432032}, {2, 2.96633}, {4, 2.9080243}] |
|3      |[{3, 4.831301}, {4, 3.1713889}, {2, 2.3089547}]|
+-------+-----------------------------------------------+



                                                                                

In [8]:
# Song catalogue: ids 1..5 mapped to the titles "song a" .. "song e"
songs = [(song_id, f"song {letter}") for song_id, letter in enumerate("abcde", start=1)]

songs_schema = SparkUtils.generate_schema(
    [
        ("song_id", "int"),
        ("title", "string"),
    ]
)
songs_df = spark.createDataFrame(songs, songs_schema)

In [9]:
from pyspark.sql.functions import explode

# One row per (user, recommended song): unpack the recommendations array
# into a struct column named "rec".
exploded_recs = user_recommendations.select(
    "user_id",
    explode("recommendations").alias("rec"),
)

# Attach the song title by matching the recommended song_id to the catalogue
title_match = exploded_recs["rec"]["song_id"] == songs_df["song_id"]
recommendations = (
    exploded_recs
    .join(songs_df, on=title_match)
    .select("user_id", "title", "rec.rating")
)

# Show user-song recommendations with titles
recommendations.show(truncate=False)



+-------+------+---------+
|user_id|title |rating   |
+-------+------+---------+
|1      |song b|4.947236 |
|1      |song e|4.8592587|
|1      |song a|3.9400382|
|1      |song d|2.9627154|
|1      |song c|2.913545 |
|2      |song c|3.9432032|
|2      |song b|2.96633  |
|2      |song d|2.9080243|
|2      |song a|2.4423301|
|2      |song e|2.1428497|
|3      |song c|4.831301 |
|3      |song d|3.1713889|
|3      |song b|2.3089547|
|3      |song a|1.9651916|
|3      |song e|1.0496502|
+-------+------+---------+



                                                                                

In [11]:
# Score the known (user, song) pairs with the fitted model.
# NOTE(review): interactions_df is also the data the model was fitted on,
# so these are in-sample predictions.
predictions = model.transform(dataset=interactions_df)
predictions.show(truncate=False)

+-------+-------+------+----------+
|user_id|song_id|rating|prediction|
+-------+-------+------+----------+
|1      |1      |4     |3.9400382 |
|1      |2      |5     |4.947236  |
|1      |5      |5     |4.8592587 |
|2      |2      |3     |2.96633   |
|3      |1      |2     |1.9651916 |
|3      |3      |5     |4.831301  |
|3      |5      |1     |1.0496502 |
|2      |3      |4     |3.9432032 |
|2      |4      |3     |2.9080243 |
+-------+-------+------+----------+



In [12]:
# Evaluate the Recommendation System
from pyspark.ml.evaluation import RegressionEvaluator

# RMSE between the observed "rating" column and the model's "prediction" column
evaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="rating",
    metricName="rmse",
)

rmse = evaluator.evaluate(predictions)
print(f"Root-mean-square error (RMSE) = {rmse}")

Root-mean-square error (RMSE) = 0.08890862503498724


# Example 2: Movies

In [None]:
movies_ratings_path = "/opt/spark/work-dir/data/ml/als"

# Columns of the ratings file, in file order, with their type names
movies_ratings_schema = SparkUtils.generate_schema(
    [
        ("user_id", "int"),
        ("movie_id", "int"),
        ("rating", "int"),
        ("timestamp", "int"),
    ]
)

# Source https://github.com/databricks/Spark-The-Definitive-Guide/blob/master/data/sample_movielens_ratings.txt
# The file has no header row and separates fields with "::".
movies_ratings_df = (
    spark.read
    .options(header="false", delimiter="::")
    .schema(movies_ratings_schema)
    .csv(movies_ratings_path)
)

movies_ratings_df.printSchema()
movies_ratings_df.show(n=3)

root
 |-- user_id: integer (nullable = true)
 |-- movie_id: integer (nullable = true)
 |-- rating: integer (nullable = true)
 |-- timestamp: integer (nullable = true)

+-------+--------+------+----------+
|user_id|movie_id|rating| timestamp|
+-------+--------+------+----------+
|      0|       2|     3|1424380312|
|      0|       3|     1|1424380312|
|      0|       5|     2|1424380312|
+-------+--------+------+----------+
only showing top 3 rows


In [17]:
# Count the distinct movies (n) and distinct users (m) present in the ratings
n_movies = movies_ratings_df.select("movie_id").distinct().count()
m_users = movies_ratings_df.select("user_id").distinct().count()
print(f"Number of items o movies (n):{n_movies}")
print(f"Number of users (m):{m_users}")

Number of items o movies (n):100
Number of users (m):30


## Create & Train the ML Model

In [27]:
# Configure ALS model
als = ALS(
    userCol="user_id",
    itemCol="movie_id",
    ratingCol="rating",
    maxIter=25,
    regParam=0.05,
    rank=5,  # Controls the dimensionality of the latent vector space for
             # users and items.
    coldStartStrategy="drop",  # Avoids NaN predictions
    seed=42,  # Pin the seed so that re-running the notebook on a fresh
              # kernel reproduces the same model and predictions.
)
# Train the model (THIS STEP MAY TAKE SOME TIME)
als_model = als.fit(movies_ratings_df)

## Persist the model

In [None]:
als_model_path = "/opt/spark/work-dir/data/mlmodels/als/als_movies"

# overwrite() lets the save replace a model already stored at this path
model_writer = als_model.write().overwrite()
model_writer.save(als_model_path)

## Predictions

In [19]:
from pyspark.ml.recommendation import ALSModel

# Optional: uncomment to reload the persisted model instead of reusing the
# in-memory one.
#als_model = ALSModel.load(als_model_path)

# Top 5 movie recommendations for every user, printed without truncation
user_recommendations = als_model.recommendForAllUsers(5)
user_recommendations.show(truncate=False)



+-------+-------------------------------------------------------------------------------------+
|user_id|recommendations                                                                      |
+-------+-------------------------------------------------------------------------------------+
|0      |[{92, 2.567516}, {2, 2.2980113}, {62, 2.2533116}, {93, 2.18572}, {25, 2.1733978}]    |
|10     |[{92, 2.7359204}, {93, 2.6829367}, {2, 2.6401443}, {25, 2.5964687}, {49, 2.5660012}] |
|20     |[{22, 3.577638}, {68, 3.1335194}, {51, 3.096676}, {94, 3.0710056}, {77, 3.0283942}]  |
|1      |[{22, 2.9241138}, {68, 2.6358395}, {77, 2.5409534}, {90, 2.515977}, {62, 2.4985847}] |
|11     |[{32, 5.098444}, {18, 4.7203503}, {30, 4.640781}, {27, 4.482382}, {79, 4.144085}]    |
|21     |[{29, 4.340982}, {52, 4.2697444}, {76, 3.7713296}, {63, 3.5152457}, {53, 3.477858}]  |
|22     |[{51, 4.4850426}, {75, 4.4336934}, {22, 4.1221323}, {74, 4.1156454}, {88, 4.0854187}]|
|2      |[{93, 4.2608924}, {83, 4.136333

                                                                                

## Test ML Model

In [28]:
from pyspark.ml.evaluation import RegressionEvaluator

# Score every (user, movie) pair of the ratings DataFrame.
# NOTE(review): movies_ratings_df is also what the model was fitted on, so
# the RMSE below is measured in-sample.
predictions = als_model.transform(dataset=movies_ratings_df)

# RMSE between the observed "rating" column and the model's "prediction" column
evaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="rating",
    metricName="rmse",
)

rmse = evaluator.evaluate(predictions)
print(f"Root-mean-square error (RMSE) = {rmse}")

Root-mean-square error (RMSE) = 0.5242583719702174


In [29]:
# Shut down through the session API: SparkSession.stop() also stops the
# underlying SparkContext (`sc`).
spark.stop()