In [1]:
import pandas as pd
import numpy as np

# ! pip install pyspark
from pyspark.ml.evaluation import RegressionEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
from pyspark.sql.functions import explode, col, round, abs
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator, TrainValidationSplit
from pyspark.sql import SparkSession


In [2]:
spark = SparkSession \
    .builder \
    .appName("Book Rec System") \
    .getOrCreate()

21/12/10 12:48:51 WARN Utils: Your hostname, cliodhna-Lenovo-ideapad-530S-14IKB resolves to a loopback address: 127.0.1.1; using 192.168.0.37 instead (on interface wlp1s0)
21/12/10 12:48:51 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/12/10 12:48:51 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Reading Datasets

In [3]:
interactions = spark.read.format("csv").option("header","true").load("data/interactions.csv")

                                                                                

In [4]:
interactions = interactions.withColumn("user_id",interactions["user_id"].cast("int"))
interactions = interactions.withColumn("rating",interactions["rating"].cast("int"))
interactions = interactions.withColumn("book_id",interactions["book_id"].cast("int"))
# interactions = interactions.dropDuplicates(["book_id"])

In [5]:
interactions.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- book_id: integer (nullable = true)
 |-- is_read: string (nullable = true)
 |-- rating: integer (nullable = true)
 |-- is_reviewed: string (nullable = true)
 |-- user_count: string (nullable = true)



In [6]:
interactions.show(n=5)

+-------+-------+-------+------+-----------+----------+
|user_id|book_id|is_read|rating|is_reviewed|user_count|
+-------+-------+-------+------+-----------+----------+
|      0|    915|      1|     5|        1.0|        15|
|      0|    873|      1|     4|        0.0|        15|
|      0|    871|      1|     2|        0.0|        15|
|      0|    870|      1|     3|        0.0|        15|
|      0|    824|      1|     5|        1.0|        15|
+-------+-------+-------+------+-----------+----------+
only showing top 5 rows



In [35]:
books = spark.read.csv("data/books.csv", sep='|', header=True)

In [37]:
cols = ("isbn","is_ebook","kindle_asin", "country_code", "language_code", "asin","description", "format", "link", "publication_day", "isbn13",
        "publication_month", "edition_information", "url", "image_url", "work_id", "text_reviews_count", "title_without_series")

books = books.drop(*cols)
books.show(5)
books = books.select("book_id","title","genre","authors","publisher", "average_rating", "publication_year", "popular_shelves", "similar_books", "ratings_count")
books = books.dropDuplicates(["book_id"])

+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|isbn,text_reviews_count,series,country_code,language_code,popular_shelves,asin,is_ebook,average_rating,kindle_asin,similar_books,description,format,link,authors,publisher,num_pages,publication_day,isbn13,publication_month,edition_information,publication_year,url,image_url,book_id,ratings_count,work_id,title,title_without_series,genre|
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

AnalysisException: cannot resolve 'book_id' given input columns: [isbn,text_reviews_count,series,country_code,language_code,popular_shelves,asin,is_ebook,average_rating,kindle_asin,similar_books,description,format,link,authors,publisher,num_pages,publication_day,isbn13,publication_month,edition_information,publication_year,url,image_url,book_id,ratings_count,work_id,title,title_without_series,genre];
'Project ['book_id, 'title, 'genre, 'authors, 'publisher, 'average_rating, 'publication_year, 'popular_shelves, 'similar_books, 'ratings_count]
+- Relation [isbn,text_reviews_count,series,country_code,language_code,popular_shelves,asin,is_ebook,average_rating,kindle_asin,similar_books,description,format,link,authors,publisher,num_pages,publication_day,isbn13,publication_month,edition_information,publication_year,url,image_url,book_id,ratings_count,work_id,title,title_without_series,genre#2826] csv


In [32]:
books.show(4)

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------+--------------------+--------------------+--------------------+--------------------+
|             book_id|               title|               genre|             authors|           publisher|average_rating|    publication_year|     popular_shelves|       similar_books|       ratings_count|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------+--------------------+--------------------+--------------------+--------------------+
| "" but his image...|[{'author_id': '1...|               784.0|   wild anachronisms| hallucinatory ha...|          4.12|              photos|[{'count': '522',...|['855422', '84351...|           Paperback|
|        Edwin Torres|                null|                null|          Bob Holman| Christian X. Hunter|    Kathy Ebel|        Hal Sirowitz|          Todd Colby|      Janice 

# Training ALS with Cross Validation

In [10]:
(training, test) = interactions.randomSplit([0.8, 0.2])

# Build the recommendation model using ALS on the training data
# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
als = ALS(maxIter=5, regParam=0.01, userCol="user_id", itemCol="book_id", ratingCol="rating",
          coldStartStrategy="drop", nonnegative = True, implicitPrefs = False)
model = als.fit(training)

                                                                                

In [11]:
param_grid = ParamGridBuilder() \
 .addGrid(als.rank, [10, 50, 75, 100]) \
 .addGrid(als.maxIter, [5, 50, 75, 100]) \
 .addGrid(als.regParam, [.01, .05, .1, .15]) \
 .build()

evaluator = RegressionEvaluator(metricName = "rmse", 
 labelCol = "rating", 
 predictionCol = "prediction")

print ("Num models to be tested using param_grid: ", len(param_grid))

Num models to be tested using param_grid:  64


In [12]:
cv = CrossValidator(estimator = als, 
 estimatorParamMaps = param_grid, 
 evaluator = evaluator, 
 numFolds = 5)

model = als.fit(training)
predictions = model.transform(test)

predictions.show(n = 10)

[Stage 74:>                                                         (0 + 1) / 1]                                                                                

+-------+-------+-------+------+-----------+----------+----------+
|user_id|book_id|is_read|rating|is_reviewed|user_count|prediction|
+-------+-------+-------+------+-----------+----------+----------+
|    471|  14707|      1|     4|        0.0|        19| 7.1284957|
|    471|  24052|      1|     4|        0.0|        19| 5.3965516|
|    471| 104078|      1|     3|        0.0|        19| 0.8928766|
|    471| 104090|      1|     3|        1.0|        19| 1.0758405|
|    496|   1372|      1|     5|        1.0|        11| 3.6432853|
|    496|   7992|      1|     2|        0.0|        11| 2.0183487|
|    496|  60220|      1|     4|        0.0|        11|  3.104753|
|    833|   1434|      1|     1|        0.0|         7| 3.8748412|
|    833|  60834|      1|     4|        0.0|         7| 4.3166313|
|   1238|   6572|      1|     5|        1.0|        26|  5.245753|
+-------+-------+-------+------+-----------+----------+----------+
only showing top 10 rows



In [13]:
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

[Stage 143:>                                                        (0 + 1) / 1]

Root-mean-square error = 1.605278890678653


                                                                                

In [14]:
predictions = predictions.withColumn("prediction",predictions["prediction"].cast("double"))

eval_accuracy = MulticlassClassificationEvaluator(labelCol="rating", predictionCol="prediction", metricName="accuracy")
eval_precision = MulticlassClassificationEvaluator(labelCol="rating", predictionCol="prediction", metricName="precisionByLabel")
eval_recall = MulticlassClassificationEvaluator(labelCol="rating", predictionCol="prediction", metricName="recallByLabel")
eval_f1 = MulticlassClassificationEvaluator(labelCol="rating", predictionCol="prediction", metricName="f1")

accuracy = eval_accuracy.evaluate(predictions)
# precision = eval_precision.evaluate(predictions)
# recall = eval_recall.evaluate(predictions)
# f1score = eval_f1.evaluate(predictions)
accuracy

                                                                                

0.0

# Rounding Predictions

In [15]:
predictions = predictions.withColumn("rounded_prediction", round(col("prediction"), 0))
predictions = predictions.withColumn("rounded_prediction",predictions["rounded_prediction"].cast("int"))


In [16]:
new_pred = predictions.drop(col("prediction"))
new_pred = new_pred.withColumnRenamed("rounded_prediction", "prediction")
new_pred.show(n=10)

+-------+-------+-------+------+-----------+----------+----------+
|user_id|book_id|is_read|rating|is_reviewed|user_count|prediction|
+-------+-------+-------+------+-----------+----------+----------+
|    471|  14707|      1|     4|        0.0|        19|         7|
|    471|  24052|      1|     4|        0.0|        19|         5|
|    471| 104078|      1|     3|        0.0|        19|         1|
|    471| 104090|      1|     3|        1.0|        19|         1|
|    496|   1372|      1|     5|        1.0|        11|         4|
|    496|   7992|      1|     2|        0.0|        11|         2|
|    496|  60220|      1|     4|        0.0|        11|         3|
|    833|   1434|      1|     1|        0.0|         7|         4|
|    833|  60834|      1|     4|        0.0|         7|         4|
|   1238|   6572|      1|     5|        1.0|        26|         5|
+-------+-------+-------+------+-----------+----------+----------+
only showing top 10 rows



In [17]:
new_pred = new_pred.withColumn("prediction",new_pred["prediction"].cast("double"))

eval_accuracy = MulticlassClassificationEvaluator(labelCol="rating", predictionCol="prediction", metricName="accuracy")
eval_precision = MulticlassClassificationEvaluator(labelCol="rating", predictionCol="prediction", metricName="precisionByLabel")
eval_recall = MulticlassClassificationEvaluator(labelCol="rating", predictionCol="prediction", metricName="recallByLabel")
eval_f1 = MulticlassClassificationEvaluator(labelCol="rating", predictionCol="prediction", metricName="f1")

accuracy = eval_accuracy.evaluate(new_pred)
# precision = eval_precision.evaluate(new_pred)
# recall = eval_recall.evaluate(new_pred)
# f1score = eval_f1.evaluate(new_pred)
accuracy

0.3016881163296005

# Recommendations

In [18]:
userRecs = model.recommendForAllUsers(10)
bookRecs = model.recommendForAllItems(5)



In [19]:
bookRecs.show(5, False)



+-------+--------------------------------------------------------------------------------------------------+
|book_id|recommendations                                                                                   |
+-------+--------------------------------------------------------------------------------------------------+
|236    |[{7115, 7.9971437}, {137, 7.3445115}, {16429, 6.667948}, {341, 6.522622}, {128, 6.5151815}]       |
|1068   |[{10344, 7.718821}, {552, 7.561115}, {1813, 7.0963063}, {9178, 7.01012}, {971, 6.8828287}]        |
|1226   |[{3132, 7.3775134}, {6814, 6.982759}, {4796, 6.948283}, {1269, 6.5885906}, {15352, 6.291789}]     |
|1265   |[{3132, 9.6307}, {4796, 8.80101}, {16863, 8.458717}, {15703, 8.347777}, {10462, 7.99389}]         |
|1363   |[{552, 12.575844}, {16863, 11.4340725}, {10683, 11.297866}, {1292, 11.236174}, {16720, 11.196016}]|
+-------+--------------------------------------------------------------------------------------------------+
only showing top 5 

                                                                                

In [20]:
userRecs.show(5, False)



+-------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|user_id|recommendations                                                                                                                                                                                                |
+-------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|26     |[{553387, 14.715152}, {816582, 14.715152}, {816581, 14.715152}, {116511, 14.715152}, {864757, 13.87485}, {864758, 13.87485}, {258334, 13.384239}, {144350, 12.89467}, {377891, 12.761853}, {43488, 12.75015}]  |
|28     |[{15245, 7.530599}, {397183, 7.4533806}, {229575, 7.4522176}, {43536, 7.3541775}, {43747, 7.241123}, {116511, 7.131547}

                                                                                

In [21]:
top_users = interactions.groupBy(interactions['user_id']).agg({'rating':"count"}).sort("count(rating)", ascending=False).dropna().limit(10)

In [22]:
top_users.show()

+-------+-------------+
|user_id|count(rating)|
+-------+-------------+
|   9880|          195|
|   6678|          150|
|  15833|          148|
|  18539|          143|
|  19445|          132|
|   7261|          131|
|  14864|          129|
|  16279|          118|
|  17077|          115|
|   9217|          115|
+-------+-------------+



In [23]:
top_user_list = [row.user_id for row in top_users.select('user_id').collect()]

In [24]:
d = {}
for user in top_user_list:
    rec = userRecs.where(userRecs.user_id == user).select("recommendations").collect()
    d[user] = [i.book_id for i in rec[0]["recommendations"]]
d

                                                                                

{9880: [397183,
  15245,
  229575,
  816582,
  553387,
  116511,
  816581,
  879090,
  12625,
  43536],
 6678: [879090,
  45684,
  45685,
  276459,
  347571,
  43666,
  231962,
  483265,
  681751,
  394660],
 15833: [397182,
  397183,
  43488,
  320845,
  95232,
  864757,
  864758,
  116511,
  553387,
  816582],
 18539: [397183,
  397182,
  49566,
  15245,
  879090,
  125232,
  232035,
  44579,
  43666,
  28566],
 19445: [397183,
  397182,
  126284,
  229575,
  43536,
  879090,
  49566,
  88495,
  88494,
  426786],
 7261: [864757,
  864758,
  397183,
  816582,
  553387,
  816581,
  116511,
  397182,
  43488,
  126284],
 14864: [397183,
  397182,
  816582,
  816581,
  116511,
  553387,
  232035,
  43738,
  258334,
  377891],
 16279: [397183,
  377891,
  397182,
  229575,
  144350,
  879090,
  43536,
  816581,
  816582,
  116511],
 17077: [397183,
  397182,
  864757,
  864758,
  816582,
  116511,
  553387,
  816581,
  258334,
  232035],
 9217: [397183,
  397182,
  187792,
  879090,
  229

In [25]:
rec_1 = books.filter(books["book_id"].isin(d[9880]))

In [26]:
rec_1.show()

[Stage 593:>                                                        (0 + 8) / 8]                                                                                

+-------+--------------------+-----------------+--------------------+--------------------+--------------+----------------+--------------------+--------------------+-------------+
|book_id|               title|            genre|             authors|           publisher|average_rating|publication_year|     popular_shelves|       similar_books|ratings_count|
+-------+--------------------+-----------------+--------------------+--------------------+--------------+----------------+--------------------+--------------------+-------------+
| 816582|If I Had You (de ...|History/Biography|[{'author_id': '1...|  Berkley Publishing|           4.2|          2000.0|[{'count': '649',...|['50742', '121591...|         1914|
| 229575|Hesiod: The Works...|           Poetry|[{'author_id': '1...|University of Mic...|        ironic|          1959.0|             and all|              Hesiod|         1827|
|  12625|Propaganda and th...|History/Biography|[{'author_id': '2...|         Pluto Press|          4.06|

In [27]:
rec_1.select('title').show(truncate=False)

+-----------------------------------------------------------------------+
|title                                                                  |
+-----------------------------------------------------------------------+
|Propaganda and the Public Mind                                         |
|The Return of the King (The Lord of the Rings, #3)                     |
|Hesiod: The Works and Days/Theogony/The Shield of Herakles             |
|Decisions (Sweet Valley High, #46)                                     |
|Wooden: A Lifetime of Observations and Reflections On and Off the Court|
|If I Had You (de Piaget, #2; de Piaget/MacLeod, #7)                    |
|Planetes, Volume 1 (Planetes, #1)                                      |
+-----------------------------------------------------------------------+



The Postmodern Condition: A Report on Knowledge = Philosophy(French), The Nimrod Flipout: Stories = Short Stories(Jewish/Hebrew), La Tempête = French, The Marx-Engels Reader = Philosophy(Russia), A Pawn for a Queen (Ursula Blanchard, #6) = Historical, My Discovery of America = Nonfiction(Russia)

In [28]:
rec_2 = books.filter(books["book_id"].isin(d[15833]))
rec_2.show()

+-------+--------------------+-----------------+--------------------+--------------------+--------------+----------------+--------------------+--------------------+-------------+
|book_id|               title|            genre|             authors|           publisher|average_rating|publication_year|     popular_shelves|       similar_books|ratings_count|
+-------+--------------------+-----------------+--------------------+--------------------+--------------+----------------+--------------------+--------------------+-------------+
| 320845|The Practice of P...|           Poetry|[{'author_id': '1...|William Morrow Pa...|          4.01|          1992.0|[{'count': '920',...|['327469', '26612...|          990|
| 816582|If I Had You (de ...|History/Biography|[{'author_id': '1...|  Berkley Publishing|           4.2|          2000.0|[{'count': '649',...|['50742', '121591...|         1914|
| 864758|Knocked Out by My...|      Young Adult|[{'author_id': '6...|      Harper Collins|          3.97|

In [29]:
rec_2.select('title').show(truncate=False)

+------------------------------------------------------------------------------+
|title                                                                         |
+------------------------------------------------------------------------------+
|The Practice of Poetry: Writing Exercises From Poets Who Teach                |
|Decisions (Sweet Valley High, #46)                                            |
|If I Had You (de Piaget, #2; de Piaget/MacLeod, #7)                           |
|'... then he ate my boy entrancers.' (Confessions of Georgia Nicolson, Book 6)|
|Knocked Out by My Nunga-Nungas (Confessions of Georgia Nicolson, #3)          |
+------------------------------------------------------------------------------+



In [30]:
df2 = rec_1.dropDuplicates(["book_id"])

In [31]:
df2.show()

+-------+--------------------+-----------------+--------------------+--------------------+--------------+----------------+--------------------+--------------------+-------------+
|book_id|               title|            genre|             authors|           publisher|average_rating|publication_year|     popular_shelves|       similar_books|ratings_count|
+-------+--------------------+-----------------+--------------------+--------------------+--------------+----------------+--------------------+--------------------+-------------+
| 816582|If I Had You (de ...|History/Biography|[{'author_id': '1...|  Berkley Publishing|           4.2|          2000.0|[{'count': '649',...|['50742', '121591...|         1914|
| 229575|Hesiod: The Works...|           Poetry|[{'author_id': '1...|University of Mic...|        ironic|          1959.0|             and all|              Hesiod|         1827|
|  12625|Propaganda and th...|History/Biography|[{'author_id': '2...|         Pluto Press|          4.06|