<a href="https://colab.research.google.com/github/lukaszlewickii/spark-labs/blob/main/recommendation_system/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425345 sha256=9bec2d711276a2baccd48ecf2dea0b702479b7b0dcfb69fd5d9dff8317a88b28
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [18]:
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql.functions import col, isnan, when, count

In [3]:
# Start (or reuse) the Spark session that backs every DataFrame below.
spark = (
    SparkSession.builder
    .appName("RecommenderSystem")
    .getOrCreate()
)

In [5]:
# Read the books CSV, taking column names from its first row.
# No schema inference is requested here; numeric columns are cast later.
data = spark.read.options(header=True).csv('./books_with_header.csv')

In [6]:
# List the column names of the loaded frame.
data.schema.names

['book_id',
 'goodreads_book_id',
 'best_book_id',
 'work_id',
 'books_count',
 'isbn',
 'isbn13',
 'authors',
 'original_publication_year',
 'original_title',
 'title',
 'language_code',
 'average_rating',
 'ratings_count',
 'work_ratings_count',
 'work_text_reviews_count',
 'ratings_1',
 'ratings_2',
 'ratings_3',
 'ratings_4',
 'ratings_5',
 'image_url',
 'small_image_url']

In [19]:
# For every column, count the entries that are either NaN or NULL.
missing_exprs = [
    count(
        when(isnan(c) | col(c).isNull(), c)
    ).alias(c)
    for c in data.columns
]
null_counts = data.select(missing_exprs)
null_counts.show()

+-------+-----------------+------------+-------+-----------+----+------+-------+-------------------------+--------------+-----+-------------+--------------+-------------+------------------+-----------------------+---------+---------+---------+---------+---------+---------+---------------+
|book_id|goodreads_book_id|best_book_id|work_id|books_count|isbn|isbn13|authors|original_publication_year|original_title|title|language_code|average_rating|ratings_count|work_ratings_count|work_text_reviews_count|ratings_1|ratings_2|ratings_3|ratings_4|ratings_5|image_url|small_image_url|
+-------+-----------------+------------+-------+-----------+----+------+-------+-------------------------+--------------+-----+-------------+--------------+-------------+------------------+-----------------------+---------+---------+---------+---------+---------+---------+---------------+
|      0|                0|           0|      0|          0| 700|   585|      0|                       21|           585|    0|   

In [22]:
# Discard rows that have no average_rating value.
data = data.dropna(subset=['average_rating'])

In [24]:
# (rows, columns) of the frame, computed on the cluster instead of
# collecting every row to the driver via toPandas() just to read its shape.
(data.count(), len(data.columns))

(10000, 23)

In [25]:
# Keep only the two columns the model consumes.
ratings = data.select(["book_id", "average_rating"])

In [26]:
# Convert the id to an integer and the rating to a float.
ratings = (
    ratings
    .withColumn("book_id", col("book_id").cast("int"))
    .withColumn("average_rating", col("average_rating").cast("float"))
)

In [27]:
# Preview the first 20 rows of the typed ratings frame.
ratings.show(n=20)

+-------+--------------+
|book_id|average_rating|
+-------+--------------+
|      1|          4.34|
|      2|          4.44|
|      3|          3.57|
|      4|          4.25|
|      5|          3.89|
|      6|          4.26|
|      7|          4.25|
|      8|          3.79|
|      9|          3.85|
|     10|          4.24|
|     11|          4.26|
|     12|          4.24|
|     13|          4.14|
|     14|          3.87|
|     15|           4.1|
|     16|          4.11|
|     17|           4.3|
|     18|          4.53|
|     19|          4.34|
|     20|          4.03|
+-------+--------------+
only showing top 20 rows



In [28]:
# 80/20 train/test split. The seed is fixed so that re-running the notebook
# yields the same partition (randomSplit is otherwise non-deterministic).
train, test = ratings.randomSplit([0.8, 0.2], seed=42)

In [29]:
# Configure and fit the ALS matrix-factorisation model on the training split.
# The seed is fixed so the factor initialisation is reproducible across runs.
#
# NOTE(review): userCol and itemCol both point at "book_id", so no separate
# user dimension is being modelled. With ALS's default coldStartStrategy
# ("nan"), ids absent from the training split get NaN predictions.
# Presumably the input holds one aggregate rating per book rather than
# user-book interactions - confirm, and train on a per-user ratings table
# (distinct user id column) to obtain a meaningful recommender.
als = ALS(
    maxIter=5,
    regParam=0.01,
    userCol="book_id",
    itemCol="book_id",
    ratingCol="average_rating",
    seed=42,
)
model = als.fit(train)

In [34]:
# Score the held-out test split with the fitted model and preview
# the first 20 predictions.
predictions = model.transform(test)
predictions.show(n=20)



+-------+--------------+----------+
|book_id|average_rating|prediction|
+-------+--------------+----------+
|      2|          4.44|       NaN|
|      5|          3.89|       NaN|
|      9|          3.85|       NaN|
|     14|          3.87|       NaN|
|     24|          4.53|       NaN|
|     33|          4.08|       NaN|
|     43|           4.1|       NaN|
|     48|          3.97|       NaN|
|     59|          4.15|       NaN|
|     62|          3.94|       NaN|
|     64|          4.06|       NaN|
|     66|          4.28|       NaN|
|     70|           4.3|       NaN|
|     81|          4.24|       NaN|
|     82|          3.94|       NaN|
|     94|          4.04|       NaN|
|    104|          3.95|       NaN|
|    106|          3.94|       NaN|
|    107|          4.15|       NaN|
|    111|          3.64|       NaN|
+-------+--------------+----------+
only showing top 20 rows



In [None]:
# Compute the prediction error (RMSE) with RegressionEvaluator.
evaluator = RegressionEvaluator(metricName="rmse", labelCol="average_rating", predictionCol="prediction")

# A single NaN prediction turns the whole RMSE into NaN, so score only the
# rows that actually received a numeric prediction.
scored_predictions = predictions.filter(~isnan(col("prediction")))
n_scored = scored_predictions.count()

if n_scored == 0:
    # Nothing to evaluate: report it explicitly instead of printing a NaN
    # metric (or failing inside the evaluator on an empty frame).
    rmse = float("nan")
    print("No non-NaN predictions to evaluate; RMSE is undefined.")
else:
    rmse = evaluator.evaluate(scored_predictions)
    print("Root Mean Squared Error (RMSE):", rmse)