In [1]:
from pyspark.sql import SparkSession, Row
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql.functions import col
from pyspark.ml.feature import StringIndexer
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.tuning import CrossValidator

In [2]:
filename = 'Book reviews/BX-Book-Ratings.csv'
seed = 12345

In [3]:
# Initialize Spark session
spark = SparkSession.builder.appName("ALSRecommender").getOrCreate()

# Load data from a CSV file, considering semicolon delimiter and quotes
data = spark.read.csv(filename, header=True, inferSchema=True, sep=';', quote='"')

# Select and rename the columns according to the CSV file's format
ratings = data.select(
    col('User-ID').cast('int').alias('userId'), 
    col('ISBN').alias('bookId'), 
    col('Book-Rating').cast('int').alias('rating')
)


# Transform the ISBN string to an index using StringIndexer
stringIndexer = StringIndexer(inputCol="bookId", outputCol="bookIdIndexed")
model = stringIndexer.fit(ratings)
ratingsIndexed = model.transform(ratings)

# Split data into training and test sets
(training, test) = ratingsIndexed.randomSplit([0.8, 0.2], seed=seed)

24/04/09 15:11:01 WARN Utils: Your hostname, Nazlis-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 10.0.0.221 instead (on interface en0)
24/04/09 15:11:01 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/04/09 15:11:02 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

In [4]:
als = ALS(userCol='userId', itemCol='bookIdIndexed', ratingCol='rating',
          coldStartStrategy='drop', nonnegative=True)


param_grid = ParamGridBuilder()\
             .addGrid(als.rank, [30, 40])\
             .addGrid(als.maxIter, [10, 20])\
             .addGrid(als.regParam, [.15, .2])\
             .build()
evaluator = RegressionEvaluator(metricName='rmse', labelCol='rating', predictionCol='prediction')

cv = CrossValidator(
        estimator=als,
        estimatorParamMaps=param_grid,
        evaluator=evaluator,
        numFolds=3)

model = cv.fit(training)

best_model = model.bestModel
print('rank: ', best_model.rank)
print('MaxIter: ', best_model._java_obj.parent().getMaxIter())
print('RegParam: ', best_model._java_obj.parent().getRegParam())

24/04/09 15:11:22 WARN DAGScheduler: Broadcasting large task binary with size 10.1 MiB
24/04/09 15:11:28 WARN DAGScheduler: Broadcasting large task binary with size 10.1 MiB
24/04/09 15:11:29 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:11:31 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:11:33 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:11:37 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:11:38 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:11:42 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:11:42 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/04/09 15:11:44 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:11:49 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:11:53 WARN 

24/04/09 15:16:29 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:16:33 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:16:36 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:16:40 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:16:44 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:16:48 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:16:53 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:16:57 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:17:01 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:17:05 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:17:08 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:17:12 WARN DAGScheduler: Broadc

24/04/09 15:22:45 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:22:50 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:22:55 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:22:59 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:23:03 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:23:08 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:23:12 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:23:17 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:23:22 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:23:27 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:23:32 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:23:37 WARN DAGScheduler: Broadc

24/04/09 15:30:08 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:30:13 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:30:17 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:30:22 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:30:27 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:30:33 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:30:37 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:30:42 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:30:43 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:30:44 WARN DAGScheduler: Broadcasting large task binary with size 10.1 MiB
24/04/09 15:30:52 WARN DAGScheduler: Broadcasting large task binary with size 10.3 MiB
24/04/09 15:30:54 WARN DAGScheduler: Broadc

24/04/09 15:37:29 WARN DAGScheduler: Broadcasting large task binary with size 10.1 MiB
24/04/09 15:37:31 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:37:32 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:37:35 WARN DAGScheduler: Broadcasting large task binary with size 10.1 MiB
24/04/09 15:37:42 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:37:44 WARN DAGScheduler: Broadcasting large task binary with size 10.3 MiB
24/04/09 15:37:47 WARN DAGScheduler: Broadcasting large task binary with size 10.1 MiB
24/04/09 15:37:48 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:37:50 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:37:51 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:37:55 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:37:56 WARN DAGScheduler: Broadc

24/04/09 15:43:31 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:43:32 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:43:34 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:43:35 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:43:38 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:43:41 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:43:44 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:43:48 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:43:52 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:43:56 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:44:00 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:44:03 WARN DAGScheduler: Broadc

24/04/09 15:49:21 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:49:25 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:49:30 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:49:33 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:49:40 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:49:44 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:49:49 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:49:53 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:49:58 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:50:02 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:50:06 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:50:11 WARN DAGScheduler: Broadc

24/04/09 15:55:47 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:55:51 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:55:57 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:56:02 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:56:07 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:56:10 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:56:15 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:56:19 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:56:25 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:56:29 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:56:34 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 15:56:38 WARN DAGScheduler: Broadc

24/04/09 16:01:10 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 16:01:13 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 16:01:17 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 16:01:21 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 16:01:25 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 16:01:26 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 16:01:27 WARN DAGScheduler: Broadcasting large task binary with size 10.1 MiB
24/04/09 16:01:33 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 16:01:35 WARN DAGScheduler: Broadcasting large task binary with size 10.3 MiB
24/04/09 16:01:37 WARN DAGScheduler: Broadcasting large task binary with size 10.1 MiB
24/04/09 16:01:38 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 16:01:39 WARN DAGScheduler: Broadc

24/04/09 16:06:10 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 16:06:13 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 16:06:17 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 16:06:20 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 16:06:24 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 16:06:27 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 16:06:31 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 16:06:34 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 16:06:38 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 16:06:41 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 16:06:45 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 16:06:48 WARN DAGScheduler: Broadc

24/04/09 16:11:13 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 16:11:18 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 16:11:22 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 16:11:26 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 16:11:30 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 16:11:35 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 16:11:38 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 16:11:43 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 16:11:46 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 16:11:51 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 16:11:54 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 16:11:59 WARN DAGScheduler: Broadc

24/04/09 16:17:16 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 16:17:17 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 16:17:19 WARN DAGScheduler: Broadcasting large task binary with size 10.1 MiB
24/04/09 16:17:24 WARN DAGScheduler: Broadcasting large task binary with size 10.3 MiB
24/04/09 16:17:26 WARN DAGScheduler: Broadcasting large task binary with size 10.3 MiB
24/04/09 16:17:29 WARN DAGScheduler: Broadcasting large task binary with size 10.1 MiB
24/04/09 16:17:31 WARN DAGScheduler: Broadcasting large task binary with size 10.1 MiB
24/04/09 16:17:34 WARN DAGScheduler: Broadcasting large task binary with size 10.1 MiB
24/04/09 16:17:35 WARN DAGScheduler: Broadcasting large task binary with size 10.1 MiB
24/04/09 16:17:37 WARN DAGScheduler: Broadcasting large task binary with size 10.1 MiB
24/04/09 16:17:39 WARN DAGScheduler: Broadcasting large task binary with size 10.1 MiB
24/04/09 16:17:41 WARN DAGScheduler: Broadc

rank:  40
MaxIter:  10
RegParam:  0.2


                                                                                

old values:
rank:  30
MaxIter:  20
RegParam:  0.15

latest values:
rank:  40
MaxIter:  10
RegParam:  0.2

In [5]:
predictions = best_model.transform(test)

evaluator = RegressionEvaluator(metricName='rmse', labelCol='rating', predictionCol='prediction')
rmse = evaluator.evaluate(predictions)
print(f'Root Mean Squared Error (RMSE): {rmse}')

24/04/09 17:22:02 WARN DAGScheduler: Broadcasting large task binary with size 10.1 MiB
24/04/09 17:22:03 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 17:22:04 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 17:22:12 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
24/04/09 17:22:14 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
[Stage 4560:>                                                       (0 + 4) / 4]

Root Mean Squared Error (RMSE): 3.7983094933611565




Root Mean Squared Error (RMSE): 3.86603614502443