In [1]:
from pyspark.sql import SparkSession, Row
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql.functions import col
from pyspark.ml.feature import StringIndexer

In [9]:
def als_recommender(filename, seed, rank=70, max_iter=5, reg_param=0.01):
    '''
    Train an ALS collaborative-filtering recommender on a ratings CSV
    and report its RMSE on a held-out test split.

    The file is read as a semicolon-delimited CSV with a header row and
    double-quote quoting (the layout of 'BX-Book-Ratings.csv'); the
    columns 'User-ID', 'ISBN' and 'Book-Rating' are used. The data is
    split 80% / 20% into training and test sets.

    Parameters
    ----------
    filename : str
        Path of the ratings CSV file.
    seed : int
        Random seed used for both the train/test sampling and the ALS
        optimizer initialization.
    rank : int, optional
        Value passed to ALS as `rank` (default 70).
    max_iter : int, optional
        Value passed to ALS as `maxIter` (default 5).
    reg_param : float, optional
        Value passed to ALS as `regParam` (default 0.01).

    Returns
    -------
    The RMSE computed by RegressionEvaluator on the test predictions.
    The same value is also printed.

    Notes
    -----
    The Spark session obtained via getOrCreate() is stopped before the
    function exits, whether it succeeds or raises.
    '''

    # Initialize Spark session
    spark = SparkSession.builder.appName("ALSRecommender").getOrCreate()

    # Everything that uses the session lives inside try/finally so a failure
    # while reading, fitting or evaluating cannot leave the session running.
    try:
        # Load data from a CSV file, considering semicolon delimiter and quotes
        data = spark.read.csv(filename, header=True, inferSchema=True, sep=';', quote='"')

        # Select and rename the columns according to the CSV file's format
        ratings = data.select(
            col('User-ID').cast('int').alias('userId'),
            col('ISBN').alias('bookId'),
            col('Book-Rating').cast('int').alias('rating')
        )

        # ALS is given a numeric item column, so the ISBN strings are mapped
        # to indices with StringIndexer before splitting.
        string_indexer = StringIndexer(inputCol="bookId", outputCol="bookIdIndexed")
        indexer_model = string_indexer.fit(ratings)
        ratings_indexed = indexer_model.transform(ratings)

        # Split data into training and test sets
        training, test = ratings_indexed.randomSplit([0.8, 0.2], seed=seed)

        # Build the recommendation model using ALS on the training data.
        # coldStartStrategy="drop" is kept from the original configuration.
        als = ALS(maxIter=max_iter, rank=rank, regParam=reg_param, userCol="userId",
                  itemCol="bookIdIndexed", ratingCol="rating", coldStartStrategy="drop",
                  seed=seed)
        als_model = als.fit(training)

        # Make predictions on the test data
        predictions = als_model.transform(test)

        # Evaluate the model by computing the RMSE on the test data
        evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                        predictionCol="prediction")
        rmse = evaluator.evaluate(predictions)
        print("Root-mean-square error = " + str(rmse))

        return rmse
    finally:
        # Stop the Spark session, on success and on failure alike
        spark.stop()



In [10]:
# Evaluate the ALS recommender on the book-ratings file with a fixed seed
# so the train/test split and the ALS initialization are repeatable.
filename, seed = 'Book reviews/BX-Book-Ratings.csv', 12345
rmse = als_recommender(filename=filename, seed=seed)

24/03/21 16:00:51 WARN DAGScheduler: Broadcasting large task binary with size 10.1 MiB
24/03/21 16:00:53 WARN DAGScheduler: Broadcasting large task binary with size 10.1 MiB
24/03/21 16:00:57 WARN DAGScheduler: Broadcasting large task binary with size 10.1 MiB
24/03/21 16:01:00 WARN DAGScheduler: Broadcasting large task binary with size 10.1 MiB
24/03/21 16:01:03 WARN DAGScheduler: Broadcasting large task binary with size 10.1 MiB
24/03/21 16:01:05 WARN DAGScheduler: Broadcasting large task binary with size 10.1 MiB
24/03/21 16:01:09 WARN DAGScheduler: Broadcasting large task binary with size 10.1 MiB
24/03/21 16:01:09 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/03/21 16:01:12 WARN DAGScheduler: Broadcasting large task binary with size 10.1 MiB
24/03/21 16:01:13 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK
24/03/21 16:01:28 WARN DAGScheduler: Broadcasting large task binary with size 10.1 MiB

Root-mean-square error = 4.5155765942672295
