In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import when
from pyspark.sql.functions import col

# I had to define the executor and driver memory because otherwise my kernel would get terminated

spark = SparkSession.builder \
    .appName("ALSExample") \
    .config("spark.executor.memory", "4g") \
    .config("spark.driver.memory", "4g") \
    .getOrCreate()


24/06/01 12:24:26 WARN Utils: Your hostname, MacBook-Pro-van-D.local resolves to a loopback address: 127.0.0.1; using 192.168.178.171 instead (on interface en0)
24/06/01 12:24:26 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/06/01 12:24:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Reading the file
df = spark.read.format("csv")\
    .option("header", True)\
    .option("inferSchema", True)\
    .load("/Users/d.c.deh./Documents/Visual Studio/Data science 2/Csv files/ratings_Beauty.csv")

In [3]:
df.show(5)

+--------------+----------+------+----------+
|        UserId| ProductId|Rating| Timestamp|
+--------------+----------+------+----------+
|A39HTATAQ9V7YF|0205616461|   5.0|1369699200|
|A3JM6GV9MNOF9X|0558925278|   3.0|1355443200|
|A1Z513UWSAAO0F|0558925278|   5.0|1404691200|
|A1WMRR494NWEWV|0733001998|   4.0|1382572800|
|A3IAAVS479H7M7|0737104473|   1.0|1274227200|
+--------------+----------+------+----------+
only showing top 5 rows



In [4]:
df.printSchema()

root
 |-- UserId: string (nullable = true)
 |-- ProductId: string (nullable = true)
 |-- Rating: double (nullable = true)
 |-- Timestamp: integer (nullable = true)



In [5]:
from pyspark.ml.feature import StringIndexer

# Starting with converting the strins to numbers
indexer = StringIndexer(inputCol = "UserId", outputCol = "UserId_numeric")

# Fitting and transforming the DataFrame
df = indexer.fit(df).transform(df)

                                                                                

In [6]:
indexer_2 = StringIndexer(inputCol = "ProductId", outputCol = "ProductId_numeric")

# Fitting and transforming the DataFrame
df = indexer_2.fit(df).transform(df)

In [7]:
df.show(5)

24/06/01 12:24:35 WARN DAGScheduler: Broadcasting large task binary with size 50.8 MiB
[Stage 9:>                                                          (0 + 1) / 1]

+--------------+----------+------+----------+--------------+-----------------+
|        UserId| ProductId|Rating| Timestamp|UserId_numeric|ProductId_numeric|
+--------------+----------+------+----------+--------------+-----------------+
|A39HTATAQ9V7YF|0205616461|   5.0|1369699200|       70392.0|         145790.0|
|A3JM6GV9MNOF9X|0558925278|   3.0|1355443200|      265306.0|         103581.0|
|A1Z513UWSAAO0F|0558925278|   5.0|1404691200|      552933.0|         103581.0|
|A1WMRR494NWEWV|0733001998|   4.0|1382572800|      536779.0|         145791.0|
|A3IAAVS479H7M7|0737104473|   1.0|1274227200|       14679.0|         145792.0|
+--------------+----------+------+----------+--------------+-----------------+
only showing top 5 rows



                                                                                

In [8]:
df.select("ProductId_numeric").count()

2023070

# Creating the model

In [9]:
from pyspark.ml.recommendation import ALS

# Defining the model using the coldStartStrategy because otherwise I would get a lot of null values
als = ALS(maxIter = 5, \
          regParam = 0.01, \
          userCol = "UserId_numeric", \
          itemCol = "ProductId_numeric", 
          ratingCol = "Rating", \
          coldStartStrategy="drop"\
          )

In [10]:
train, test = df.randomSplit([0.7, 0.3], seed = 42)

In [11]:
# Here it shows quite a few warnings, this is also the point where my kernel got terminated multiple times
model = als.fit(train)

24/06/01 12:24:38 WARN DAGScheduler: Broadcasting large task binary with size 50.9 MiB
24/06/01 12:24:39 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
24/06/01 12:24:40 WARN DAGScheduler: Broadcasting large task binary with size 50.9 MiB
24/06/01 12:24:42 WARN DAGScheduler: Broadcasting large task binary with size 50.9 MiB
24/06/01 12:24:45 WARN DAGScheduler: Broadcasting large task binary with size 50.9 MiB
24/06/01 12:24:48 WARN DAGScheduler: Broadcasting large task binary with size 50.9 MiB
24/06/01 12:24:50 WARN DAGScheduler: Broadcasting large task binary with size 50.9 MiB
24/06/01 12:24:52 WARN DAGScheduler: Broadcasting large task binary with size 50.9 MiB
24/06/01 12:24:54 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/06/01 12:24:

In [12]:
predictions = model.transform(test)
predictions.show(5)

24/06/01 12:25:17 WARN DAGScheduler: Broadcasting large task binary with size 50.8 MiB
24/06/01 12:25:18 WARN DAGScheduler: Broadcasting large task binary with size 50.9 MiB
24/06/01 12:25:20 WARN DAGScheduler: Broadcasting large task binary with size 50.9 MiB
24/06/01 12:25:24 WARN DAGScheduler: Broadcasting large task binary with size 50.9 MiB
24/06/01 12:25:28 WARN DAGScheduler: Broadcasting large task binary with size 50.9 MiB


+--------------+----------+------+----------+--------------+-----------------+----------+
|        UserId| ProductId|Rating| Timestamp|UserId_numeric|ProductId_numeric|prediction|
+--------------+----------+------+----------+--------------+-----------------+----------+
|A15FXTZ9PODUGO|B000UVZU1S|   5.0|1346544000|        1561.0|             28.0|  4.837264|
|A2DVCR7ITSBDUR|B000UVZU1S|   2.0|1375315200|        1752.0|             28.0| 1.5882462|
|A2WUEH29HQWJUP|B000UVZU1S|   5.0|1381190400|        1772.0|             28.0|  2.237219|
| A481GH230UZTR|B000UVZU1S|   4.0|1368316800|        2235.0|             28.0|0.47542918|
|A2SIWGIOBLG6YV|B000UVZU1S|   5.0|1374019200|        3346.0|             28.0| 3.7504473|
+--------------+----------+------+----------+--------------+-----------------+----------+
only showing top 5 rows



                                                                                

In [13]:
# Some predictions go beyond the range of 1.0 and 5.0, so here I defined a function that gets every value within this range
within_range_predictions = predictions.withColumn("prediction",\
    when(col("prediction") < 0, 0.0)\
    .when(col("prediction") > 5, 5.0)\
    .otherwise(col("prediction")))

In [14]:
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(metricName = "rmse",\
                                labelCol = "Rating", \
                                predictionCol = "prediction"\
                                )

In [15]:
rmse = evaluator.evaluate(within_range_predictions)
print("RMSE:", rmse)

24/06/01 12:25:29 WARN DAGScheduler: Broadcasting large task binary with size 50.8 MiB
24/06/01 12:25:31 WARN DAGScheduler: Broadcasting large task binary with size 50.9 MiB
24/06/01 12:25:33 WARN DAGScheduler: Broadcasting large task binary with size 50.9 MiB
24/06/01 12:25:37 WARN DAGScheduler: Broadcasting large task binary with size 50.9 MiB
24/06/01 12:25:41 WARN DAGScheduler: Broadcasting large task binary with size 50.9 MiB
24/06/01 12:25:43 WARN DAGScheduler: Broadcasting large task binary with size 51.0 MiB
[Stage 147:>                                                        (0 + 2) / 2]

RMSE: 3.684660644167319


                                                                                

This is quite bad tbh for a model within the 0.0 and 5.0 range, but I cannot get it better than this. I also think that this dataset is not amazing to work with, simply because of the amount of preprocessing steps you have to take.

# Cross validation

In [16]:
from pyspark.ml.tuning import ParamGridBuilder

# Making a parameter grid for the cross validation, more variables and values are impossible to run for my computer
param_grid = ParamGridBuilder()\
    .addGrid(als.rank, [10])\
    .addGrid(als.maxIter, [5])\
    .addGrid(als.regParam, [0.01, 0.05])\
    .build()

In [17]:
from pyspark.ml.tuning import CrossValidator

# Defining the evaluator again, together with the cross validation
evaluator = RegressionEvaluator(metricName = "rmse",\
                               labelCol = "Rating",\
                               predictionCol = "prediction")

cv = CrossValidator(estimator = als,
                   estimatorParamMaps = param_grid,
                   evaluator = evaluator,
                   numFolds = 5)

print ("Num models to be tested: ", len(param_grid))

Num models to be tested:  2


In [18]:
# Running the cross validation; I don't advice running it yourself, the length of the output list speaks for itself.
modelcv = cv.fit(train)

24/06/01 12:25:44 WARN DAGScheduler: Broadcasting large task binary with size 50.8 MiB
24/06/01 12:25:48 WARN DAGScheduler: Broadcasting large task binary with size 50.9 MiB
24/06/01 12:25:50 WARN DAGScheduler: Broadcasting large task binary with size 50.9 MiB
24/06/01 12:25:52 WARN DAGScheduler: Broadcasting large task binary with size 50.9 MiB
24/06/01 12:25:55 WARN DAGScheduler: Broadcasting large task binary with size 50.9 MiB
24/06/01 12:25:57 WARN DAGScheduler: Broadcasting large task binary with size 50.9 MiB
24/06/01 12:25:59 WARN DAGScheduler: Broadcasting large task binary with size 50.9 MiB
24/06/01 12:26:01 WARN DAGScheduler: Broadcasting large task binary with size 50.9 MiB
24/06/01 12:26:03 WARN DAGScheduler: Broadcasting large task binary with size 50.9 MiB
24/06/01 12:26:05 WARN DAGScheduler: Broadcasting large task binary with size 50.9 MiB
24/06/01 12:26:06 WARN DAGScheduler: Broadcasting large task binary with size 50.9 MiB
24/06/01 12:26:09 WARN DAGScheduler: Broadc

In [19]:
best_model = modelcv.bestModel
test_predictions = best_model.transform(test)

In [20]:
within_range_test_predictions = test_predictions.withColumn("prediction",\
    when(col("prediction") < 0, 0.0)\
    .when(col("prediction") > 5, 5.0)\
    .otherwise(col("prediction")))

In [21]:
rmse = evaluator.evaluate(within_range_test_predictions)
print(rmse)

24/06/01 12:35:03 WARN DAGScheduler: Broadcasting large task binary with size 50.8 MiB
24/06/01 12:35:05 WARN DAGScheduler: Broadcasting large task binary with size 50.9 MiB
24/06/01 12:35:06 WARN DAGScheduler: Broadcasting large task binary with size 50.9 MiB
24/06/01 12:35:11 WARN DAGScheduler: Broadcasting large task binary with size 50.9 MiB
24/06/01 12:35:16 WARN DAGScheduler: Broadcasting large task binary with size 50.9 MiB
24/06/01 12:35:18 WARN DAGScheduler: Broadcasting large task binary with size 51.0 MiB
[Stage 1092:>                                                       (0 + 2) / 2]

3.656743924099279


                                                                                