# ALS (Alternating Least Squares) book recommender

## Data preparation

In [7]:
from pyspark.sql import SparkSession, Row
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql.functions import col
from pyspark.ml.feature import StringIndexer
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.tuning import CrossValidator

In [5]:
# Notebook-wide settings; edit these rather than the cells below.
seed = 12345  # seed handed to randomSplit for the train/test split
filename = 'Book reviews/BX-Book-Ratings.csv'  # ratings CSV, path relative to the working directory

In [8]:
# Initialize (or reuse) the Spark session for this notebook
spark = SparkSession.builder.appName("ALSRecommender").getOrCreate()

# Load data from a CSV file, considering semicolon delimiter and quotes
data = spark.read.csv(filename, header=True, inferSchema=True, sep=';', quote='"')

# Select and rename the columns according to the CSV file's format.
# The explicit casts make the user id and rating integers regardless of
# what inferSchema guessed; values that cannot be cast become null.
# NOTE(review): if this file is the Book-Crossing dump, a rating of 0
# presumably marks implicit feedback rather than a real score of zero --
# confirm, and consider filtering those rows before fitting explicit ALS.
ratings = data.select(
    col('User-ID').cast('int').alias('userId'),
    col('ISBN').alias('bookId'),
    col('Book-Rating').cast('int').alias('rating')
)


# Transform the ISBN string to a numeric index using StringIndexer, since
# ALS needs numeric item ids. The indexer is fitted on the full ratings
# set (before the split), so every ISBN in `test` has an index.
# The fitted indexer gets its own name instead of the generic `model`:
# the modeling cell rebinds `model` to the cross-validation result, which
# would otherwise discard the only ISBN <-> index mapping (its `labels`).
stringIndexer = StringIndexer(inputCol="bookId", outputCol="bookIdIndexed")
isbn_indexer_model = stringIndexer.fit(ratings)
ratingsIndexed = isbn_indexer_model.transform(ratings)

# Split data into training and test sets (80/20, seeded for repeatability)
(training, test) = ratingsIndexed.randomSplit([0.8, 0.2], seed=seed)

                                                                                

## Modeling

In [10]:
# Explicit-feedback ALS on (userId, bookIdIndexed, rating).
# coldStartStrategy='drop' removes rows whose prediction would be NaN;
# nonnegative=True constrains the learned factors to be non-negative.
# The notebook-level `seed` is passed in so the factor initialisation is
# pinned and a re-run of this cell trains the same models.
als = ALS(userCol='userId', itemCol='bookIdIndexed', ratingCol='rating',
          coldStartStrategy='drop', nonnegative=True, seed=seed)

# Hyper-parameter grid: 3 ranks x 1 maxIter x 2 regParams = 6 candidates.
param_grid = (
    ParamGridBuilder()
    .addGrid(als.rank, [1, 20, 30])
    .addGrid(als.maxIter, [20])
    .addGrid(als.regParam, [.05, .15])
    .build()
)
evaluator = RegressionEvaluator(metricName='rmse', labelCol='rating', predictionCol='prediction')

# 3-fold cross-validation over the grid; the seed pins the fold assignment
# so the per-candidate metrics are comparable between runs.
cv = CrossValidator(
        estimator=als,
        estimatorParamMaps=param_grid,
        evaluator=evaluator,
        numFolds=3,
        seed=seed)

model = cv.fit(training)

best_model = model.bestModel

# Recover the winning hyper-parameters through the public API instead of
# the private `_java_obj` handle: `avgMetrics` is ordered like `param_grid`,
# and the best entry is the smallest metric unless the evaluator says
# larger is better (for rmse, smaller is better).
avg_metrics = model.avgMetrics
pick_best = max if evaluator.isLargerBetter() else min
best_index = pick_best(range(len(avg_metrics)), key=avg_metrics.__getitem__)
best_params = param_grid[best_index]

# NOTE(review): the recorded run selected rank=30 and regParam=0.15, both at
# the upper edge of the grid -- a wider grid may find a better setting.
print('rank: ', best_model.rank)
print('MaxIter: ', best_params[als.maxIter])
print('RegParam: ', best_params[als.regParam])

24/03/26 16:06:39 WARN CacheManager: Asked to cache already cached data.
24/03/26 16:06:39 WARN CacheManager: Asked to cache already cached data.
24/03/26 16:06:40 WARN DAGScheduler: Broadcasting large task binary with size 10.1 MiB
24/03/26 16:06:45 WARN DAGScheduler: Broadcasting large task binary with size 10.1 MiB
24/03/26 16:06:46 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:06:48 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:06:50 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:06:54 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:06:56 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:07:01 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:07:04 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:07:09 WARN DAGScheduler: Broadcasting large task binary wit

24/03/26 16:12:20 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:12:24 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:12:28 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:12:32 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:12:35 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:12:39 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:12:43 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:12:47 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:12:50 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:12:53 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:12:58 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:13:02 WARN DAGScheduler: Broadc

24/03/26 16:19:02 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:19:06 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:19:11 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:19:16 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:19:20 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:19:25 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:19:30 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:19:35 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:19:39 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:19:44 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:19:48 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:19:53 WARN DAGScheduler: Broadc

24/03/26 16:26:42 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:26:48 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:26:53 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:27:00 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:27:05 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:27:11 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:27:16 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:27:23 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:27:27 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:27:34 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:27:39 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:27:45 WARN DAGScheduler: Broadc

24/03/26 16:34:01 WARN DAGScheduler: Broadcasting large task binary with size 10.3 MiB
24/03/26 16:34:04 WARN DAGScheduler: Broadcasting large task binary with size 10.1 MiB
24/03/26 16:34:05 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:34:07 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:34:10 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:34:13 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:34:15 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:34:20 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:34:23 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:34:27 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:34:31 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:34:35 WARN DAGScheduler: Broadc

24/03/26 16:40:39 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:40:44 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:40:49 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:40:54 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:41:00 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:41:04 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:41:10 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:41:15 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:41:21 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:41:22 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:41:24 WARN DAGScheduler: Broadcasting large task binary with size 10.1 MiB
24/03/26 16:41:33 WARN DAGScheduler: Broadc

24/03/26 16:48:16 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:48:21 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:48:27 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:48:32 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:48:39 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:48:44 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:48:51 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:48:56 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:49:02 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:49:08 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:49:15 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:49:20 WARN DAGScheduler: Broadc

24/03/26 16:56:15 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:56:19 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:56:23 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:56:27 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:56:32 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:56:36 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:56:40 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:56:45 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:56:49 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:56:53 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:56:57 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 16:57:01 WARN DAGScheduler: Broadc

24/03/26 17:02:34 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 17:02:36 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 17:02:40 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 17:02:42 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 17:02:47 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 17:02:52 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 17:02:57 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 17:03:02 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 17:03:07 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 17:03:13 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 17:03:18 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 17:03:24 WARN DAGScheduler: Broadc

24/03/26 17:10:19 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 17:10:25 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 17:10:30 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 17:10:36 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 17:10:41 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 17:10:47 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 17:10:50 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 17:10:51 WARN DAGScheduler: Broadcasting large task binary with size 10.1 MiB
24/03/26 17:11:00 WARN DAGScheduler: Broadcasting large task binary with size 4.3 MiB
24/03/26 17:11:03 WARN DAGScheduler: Broadcasting large task binary with size 10.3 MiB
24/03/26 17:11:06 WARN DAGScheduler: Broadcasting large task binary with size 10.1 MiB
24/03/26 17:11:07 WARN DAGScheduler: Broadca

24/03/26 17:18:20 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 17:18:27 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 17:18:31 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 17:18:37 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 17:18:41 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 17:18:48 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 17:18:53 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 17:18:58 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 17:19:03 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 17:19:10 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 17:19:14 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 17:19:20 WARN DAGScheduler: Broadc

rank:  30
MaxIter:  20
RegParam:  0.15



                                                                                

rank:  30
MaxIter:  20
RegParam:  0.15

In [11]:
# Score the tuned model on the held-out split and report RMSE between the
# 'rating' label column and the model's 'prediction' column.
evaluator = RegressionEvaluator(
    predictionCol='prediction',
    labelCol='rating',
    metricName='rmse',
)
predictions = best_model.transform(test)
rmse = evaluator.evaluate(predictions)

print(f'Root Mean Squared Error (RMSE): {rmse}')

24/03/26 18:15:42 WARN DAGScheduler: Broadcasting large task binary with size 10.1 MiB
24/03/26 18:15:44 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 18:15:46 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 18:16:00 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/03/26 18:16:04 WARN DAGScheduler: Broadcasting large task binary with size 10.3 MiB
[Stage 4580:>                                                       (0 + 4) / 4]

Root Mean Squared Error (RMSE): 3.86603614502443




                                                                                

Root Mean Squared Error (RMSE): 3.86603614502443