In [26]:
import pyspark as ps
from pyspark.sql import SQLContext
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.sql.functions import col

In [2]:
import numpy as np

In [3]:
from IPython.display import display, Image

spark = ps.sql.SparkSession.builder \
            .master("local") \
            .appName("book_recommendation") \
            .getOrCreate()

sc = spark.sparkContext
sqlContext = SQLContext(sc)

In [4]:
ratings_df = spark.read.csv('goodbooks-10k/ratings.csv', header=True, inferSchema=True)
ratings_df.printSchema()

root
 |-- book_id: integer (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- rating: integer (nullable = true)



In [5]:
ratings_df.show(5)

+-------+-------+------+
|book_id|user_id|rating|
+-------+-------+------+
|      1|    314|     5|
|      1|    439|     3|
|      1|    588|     5|
|      1|   1169|     4|
|      1|   1185|     4|
+-------+-------+------+
only showing top 5 rows



In [6]:
books_df = spark.read.csv('goodbooks-10k/books.csv', header=True, inferSchema=True)
books_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- book_id: integer (nullable = true)
 |-- best_book_id: integer (nullable = true)
 |-- work_id: integer (nullable = true)
 |-- books_count: integer (nullable = true)
 |-- isbn: string (nullable = true)
 |-- isbn13: double (nullable = true)
 |-- authors: string (nullable = true)
 |-- original_publication_year: double (nullable = true)
 |-- original_title: string (nullable = true)
 |-- title: string (nullable = true)
 |-- language_code: string (nullable = true)
 |-- average_rating: string (nullable = true)
 |-- ratings_count: string (nullable = true)
 |-- work_ratings_count: string (nullable = true)
 |-- work_text_reviews_count: string (nullable = true)
 |-- ratings_1: double (nullable = true)
 |-- ratings_2: integer (nullable = true)
 |-- ratings_3: integer (nullable = true)
 |-- ratings_4: integer (nullable = true)
 |-- ratings_5: integer (nullable = true)
 |-- image_url: string (nullable = true)
 |-- small_image_url: string (nullable = true)


In [7]:
books_df.show(5)

+---+-------+------------+-------+-----------+---------+----------------+--------------------+-------------------------+--------------------+--------------------+-------------+--------------+-------------+------------------+-----------------------+---------+---------+---------+---------+---------+--------------------+--------------------+
| id|book_id|best_book_id|work_id|books_count|     isbn|          isbn13|             authors|original_publication_year|      original_title|               title|language_code|average_rating|ratings_count|work_ratings_count|work_text_reviews_count|ratings_1|ratings_2|ratings_3|ratings_4|ratings_5|           image_url|     small_image_url|
+---+-------+------------+-------+-----------+---------+----------------+--------------------+-------------------------+--------------------+--------------------+-------------+--------------+-------------+------------------+-----------------------+---------+---------+---------+---------+---------+--------------------

In [8]:
traning_df, test_df = ratings_df.randomSplit([0.8, 0.2])

In [9]:
iterations = 10
regularization_parameter = 0.1
ranks = 5
errors = []
err = 0

In [10]:
als = ALS(maxIter=iterations, regParam=regularization_parameter, rank=ranks, userCol="user_id", itemCol="book_id", ratingCol="rating")
model = als.fit(traning_df)
predictions = model.transform(test_df)
new_predictions = predictions.filter(col('prediction') != np.nan)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse = evaluator.evaluate(new_predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 0.9030160199185534


In [11]:
predictions = model.transform(test_df)
predictions.show(5)

+-------+-------+------+----------+
|book_id|user_id|rating|prediction|
+-------+-------+------+----------+
|    148|  26629|     4|  3.852192|
|    148|  28767|     3| 3.1634703|
|    148|  41282|     3|  4.050268|
|    148|  13034|     4| 3.4521158|
|    148|  32055|     3|  3.137745|
+-------+-------+------+----------+
only showing top 5 rows



In [12]:
predictions.join(books_df, "book_id").select("user_id", "title", "prediction").show(5)

+-------+----------+----------+
|user_id|     title|prediction|
+-------+----------+----------+
|  12014|Lysistrata| 3.9371243|
|   6213|Lysistrata| 2.8817964|
|   6460|Lysistrata|  4.122798|
|  40418|Lysistrata|  3.372207|
|  37449|Lysistrata| 4.8577294|
+-------+----------+----------+
only showing top 5 rows



In [13]:
for_one_user = predictions.filter(col('user_id') == 35982).join(books_df, "book_id").select("user_id", "title", "image_url", "prediction")
for_one_user.count()

3

In [14]:
for_one_user.show()

+-------+--------------------+--------------------+----------+
|user_id|               title|           image_url|prediction|
+-------+--------------------+--------------------+----------+
|  35982|The Innocent Man:...|https://s.gr-asse...|  3.324088|
|  35982|Another Bullshit ...|https://s.gr-asse...| 3.5430372|
|  35982|The Portrait of a...|https://images.gr...|  3.643324|
+-------+--------------------+--------------------+----------+



In [15]:
for book in for_one_user.collect():
    print(book.title)
    display(Image(url=book.image_url))
    print(book.prediction)

The Innocent Man: Murder and Injustice in a Small Town


3.3240880966186523
Another Bullshit Night in Suck City


3.543037176132202
The Portrait of a Lady


3.6433238983154297


In [16]:
userRecs = model.recommendForAllUsers(5)
bookRecs = model.recommendForAllItems(5)

In [17]:
userRecs.select("user_id", "recommendations.book_id").show(10, False)

+-------+------------------------------+
|user_id|book_id                       |
+-------+------------------------------+
|148    |[862, 6920, 1788, 7401, 6590] |
|463    |[3124, 192, 5344, 3746, 9110] |
|471    |[5084, 3660, 3086, 4344, 8973]|
|496    |[4874, 6591, 9531, 5294, 3628]|
|833    |[9569, 3615, 8976, 4778, 7762]|
|1088   |[5084, 9537, 5400, 8973, 2209]|
|1238   |[8013, 7947, 1338, 2236, 4868]|
|1342   |[6920, 5207, 9566, 6590, 7254]|
|1580   |[7254, 5207, 6590, 9531, 5580]|
|1591   |[3885, 9842, 6928, 8926, 4594]|
+-------+------------------------------+
only showing top 10 rows



In [18]:
bookRecs.select("book_id", "recommendations.user_id").show(10, False)

+-------+-----------------------------------+
|book_id|user_id                            |
+-------+-----------------------------------+
|1580   |[26131, 49594, 50333, 46973, 27969]|
|4900   |[11963, 504, 17623, 23475, 33032]  |
|5300   |[46594, 39323, 37327, 3054, 3673]  |
|6620   |[49001, 49594, 26131, 30441, 38202]|
|7240   |[26131, 504, 49360, 52461, 26305]  |
|7340   |[26131, 26305, 49594, 27807, 20992]|
|7880   |[34547, 36647, 26131, 30757, 17623]|
|9900   |[48949, 32700, 17623, 13145, 47997]|
|471    |[50333, 34886, 26131, 23717, 37507]|
|1591   |[49594, 50333, 37327, 49897, 50307]|
+-------+-----------------------------------+
only showing top 10 rows



In [23]:
model.save("book_recommendation.parquet")

In [27]:
model = ALSModel.load("book_recommendation_model")