In [290]:
import pandas as pd
import numpy as np

# ! pip install pyspark
from pyspark.ml.evaluation import RegressionEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
from pyspark.sql.functions import explode, col, round, abs, when
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator, TrainValidationSplit
from pyspark.sql import SparkSession


In [291]:
spark = SparkSession \
    .builder \
    .appName("Book Rec System") \
    .getOrCreate()

# Reading Datasets

In [292]:
interactions = spark.read.csv("data/interactions.csv", sep=',', header=True)

In [293]:
interactions = interactions.withColumn("user_id",interactions["user_id"].cast("int"))
interactions = interactions.withColumn("rating",interactions["rating"].cast("int"))
interactions = interactions.withColumn("book_id",interactions["book_id"].cast("int"))

In [294]:
interactions.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- book_id: integer (nullable = true)
 |-- is_read: string (nullable = true)
 |-- rating: integer (nullable = true)
 |-- is_reviewed: string (nullable = true)
 |-- user_count: string (nullable = true)



In [295]:
interactions.show(n=5)

+-------+-------+-------+------+-----------+----------+
|user_id|book_id|is_read|rating|is_reviewed|user_count|
+-------+-------+-------+------+-----------+----------+
|      0|    915|      1|     5|        1.0|        15|
|      0|    873|      1|     4|        0.0|        15|
|      0|    871|      1|     2|        0.0|        15|
|      0|    870|      1|     3|        0.0|        15|
|      0|    824|      1|     5|        1.0|        15|
+-------+-------+-------+------+-----------+----------+
only showing top 5 rows



In [296]:
books = spark.read.csv("data/books.csv", sep=',', header=True)

In [297]:
cols = ("isbn","is_ebook","kindle_asin", "country_code", "language_code", "asin","description", "format", "link", "publication_day", "isbn13",
        "publication_month", "edition_information", "url", "image_url", "work_id", "text_reviews_count", "title_without_series")

books = books.drop(*cols)
books.show(1)
books = books.select("book_id","title","genre","authors","publisher", "average_rating", "publication_year", "popular_shelves", "similar_books", "ratings_count")
books = books.dropDuplicates(["book_id"])

+------+--------------------+--------------+--------------------+--------------------+--------------+---------+----------------+-------+-------------+----------+--------+
|series|     popular_shelves|average_rating|       similar_books|             authors|     publisher|num_pages|publication_year|book_id|ratings_count|     title|   genre|
+------+--------------------+--------------+--------------------+--------------------+--------------+---------+----------------+-------+-------------+----------+--------+
|    []|[{'count': '450',...|          4.43|['834493', '45218...|[{'author_id': '5...|Blue Sky Press|     40.0|          1995.0|  89378|         1331|Dog Heaven|Children|
+------+--------------------+--------------+--------------------+--------------------+--------------+---------+----------------+-------+-------------+----------+--------+
only showing top 1 row



In [298]:
books.show(1)

+--------------------+--------------------+-----+------------------+--------------------+--------------+----------------+--------------------+--------------------+-------------+
|             book_id|               title|genre|           authors|           publisher|average_rating|publication_year|     popular_shelves|       similar_books|ratings_count|
+--------------------+--------------------+-----+------------------+--------------------+--------------+----------------+--------------------+--------------------+-------------+
| "" but his image...|[{'author_id': '1...|784.0| wild anachronisms| hallucinatory ha...|          4.12|          photos|[{'count': '522',...|['855422', '84351...|    Paperback|
+--------------------+--------------------+-----+------------------+--------------------+--------------+----------------+--------------------+--------------------+-------------+
only showing top 1 row





# Training ALS with Cross Validation

In [299]:
(training, test) = interactions.randomSplit([0.8, 0.2])

als = ALS(maxIter=5, regParam=0.01, userCol="user_id", itemCol="book_id", ratingCol="rating",
          coldStartStrategy="drop", nonnegative = True, implicitPrefs = False)
model = als.fit(training)

In [300]:
param_grid = ParamGridBuilder() \
 .addGrid(als.rank, [10, 50, 75, 100]) \
 .addGrid(als.maxIter, [5, 50, 75, 100]) \
 .addGrid(als.regParam, [.01, .05, .1, .15]) \
 .build()

evaluator = RegressionEvaluator(metricName = "rmse", 
 labelCol = "rating", 
 predictionCol = "prediction")

print ("Num models to be tested using param_grid: ", len(param_grid))

Num models to be tested using param_grid:  64


In [301]:
cv = CrossValidator(estimator = als, 
 estimatorParamMaps = param_grid, 
 evaluator = evaluator, 
 numFolds = 5)

model = als.fit(training)
predictions = model.transform(test)

predictions.show(n = 10)

+-------+-------+-------+------+-----------+----------+----------+
|user_id|book_id|is_read|rating|is_reviewed|user_count|prediction|
+-------+-------+-------+------+-----------+----------+----------+
|    471|  35870|      1|     4|        0.0|        19| 3.5998526|
|    471| 104069|      1|     5|        0.0|        19|   3.59567|
|   1238|   6571|      1|     4|        0.0|        26| 4.3061757|
|   1238|   6687|      1|     5|        0.0|        26|  9.313152|
|   1238|  59854|      1|     5|        0.0|        26| 4.5000806|
|   1238| 176728|      1|     5|        1.0|        26|  2.163161|
|   1645|  27905|      1|     3|        0.0|         7| 3.1309993|
|   1829|    586|      1|     4|        0.0|        11| 4.1695795|
|   2122| 129549|      1|     5|        0.0|        19|  9.140117|
|   2142|  16429|      1|     4|        1.0|        68| 1.6583396|
+-------+-------+-------+------+-----------+----------+----------+
only showing top 10 rows



In [302]:
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 1.6019383532927205


# Rounding Predictions

In [303]:
predictions = predictions.withColumn("rounded_prediction", round(col("prediction"), 0))
predictions = predictions.withColumn("rounded_prediction",predictions["rounded_prediction"].cast("int"))


In [304]:
new_pred = predictions.drop(col("prediction"))

new_pred = new_pred.withColumnRenamed("rounded_prediction", "prediction")
new_pred = new_pred.withColumn("prediction", when(new_pred["prediction"] > 5, 5).otherwise(new_pred['prediction']).alias("prediction"))

new_pred.show(n=10)

+-------+-------+-------+------+-----------+----------+----------+
|user_id|book_id|is_read|rating|is_reviewed|user_count|prediction|
+-------+-------+-------+------+-----------+----------+----------+
|    471|  35870|      1|     4|        0.0|        19|         4|
|    471| 104069|      1|     5|        0.0|        19|         4|
|   1238|   6571|      1|     4|        0.0|        26|         4|
|   1238|   6687|      1|     5|        0.0|        26|         5|
|   1238|  59854|      1|     5|        0.0|        26|         5|
|   1238| 176728|      1|     5|        1.0|        26|         2|
|   1645|  27905|      1|     3|        0.0|         7|         3|
|   1829|    586|      1|     4|        0.0|        11|         4|
|   2122| 129549|      1|     5|        0.0|        19|         5|
|   2142|  16429|      1|     4|        1.0|        68|         2|
+-------+-------+-------+------+-----------+----------+----------+
only showing top 10 rows



In [308]:
rmse = evaluator.evaluate(new_pred)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 1.2777342385777766


In [307]:
new_pred = new_pred.withColumn("prediction",new_pred["prediction"].cast("double"))

accuracy = eval_accuracy.evaluate(new_pred)
accuracy



0.3503031918438062

# Recommendations

In [309]:
userRecs = model.recommendForAllUsers(10)
bookRecs = model.recommendForAllItems(5)



In [310]:
bookRecs.show(5, False)



+-------+-----------------------------------------------------------------------------------------------------+
|book_id|recommendations                                                                                      |
+-------+-----------------------------------------------------------------------------------------------------+
|236    |[{16981, 6.745122}, {19506, 6.353185}, {22797, 6.112206}, {22148, 6.076087}, {23119, 6.0540376}]     |
|1068   |[{22148, 6.8745823}, {16429, 6.5434446}, {16176, 6.534654}, {1157, 6.5090585}, {15436, 6.4165945}]   |
|1226   |[{10670, 6.841731}, {19055, 6.7031393}, {16981, 6.440398}, {22304, 6.359483}, {19506, 6.251373}]     |
|1265   |[{104, 9.535774}, {16981, 9.030468}, {15964, 8.220503}, {13353, 8.110318}, {8880, 8.084004}]         |
|1363   |[{17497, 11.920067}, {16957, 11.0461645}, {11396, 10.957737}, {16981, 10.707592}, {18494, 10.670895}]|
+-------+-----------------------------------------------------------------------------------------------

                                                                                

In [311]:
userRecs.show(5)



+-------+--------------------+
|user_id|     recommendations|
+-------+--------------------+
|     26|[{864758, 18.5692...|
|     28|[{425380, 9.58905...|
|     31|[{64706, 11.14594...|
|     34|[{425380, 15.4735...|
|     44|[{864758, 15.3104...|
+-------+--------------------+
only showing top 5 rows



                                                                                

# Making score column in pandas

In [210]:
user_recs = model.recommendForAllUsers(100)



In [211]:
df = user_recs.toPandas()
df.head(1)

                                                                                

Unnamed: 0,user_id,recommendations
0,26,"[(864386, 17.755340576171875), (864758, 17.755..."
1,28,"[(397183, 10.6409330368042), (125232, 9.860329..."
2,31,"[(816581, 13.870176315307617), (816582, 13.870..."
3,34,"[(492565, 17.5319766998291), (432016, 16.87717..."
4,44,"[(688081, 11.493675231933594), (125232, 10.683..."
...,...,...
9917,22982,"[(43747, 21.323205947875977), (124740, 19.1596..."
9918,23098,"[(212936, 20.873018264770508), (397183, 20.474..."
9919,23161,"[(43747, 12.351412773132324), (183220, 11.7844..."
9920,23181,"[(397183, 9.964000701904297), (43747, 9.156210..."


In [212]:
df = df.assign(recommendations=list(df.recommendations)).explode('recommendations')

lst = [(x[0], x[1]) for x in df.recommendations]
df[['book_id', 'value']] = lst

In [213]:
max_values = df.groupby(['user_id'], sort=False)['value'].max()
df = df.merge(max_values, how='left', left_on='user_id', right_index=True)
df.columns = ['user_id', 'recommendations', 'book_id', 'value', 'max']
df['score'] = df['value'] / df['max']

In [214]:
df.head(1)

Unnamed: 0,user_id,recommendations,book_id,value,max,score
0,26,"(864386, 17.755340576171875)",864386.0,17.755341,17.755341,1.000000
0,26,"(864758, 17.755340576171875)",864758.0,17.755341,17.755341,1.000000
0,26,"(864757, 17.755340576171875)",864757.0,17.755341,17.755341,1.000000
0,26,"(432016, 15.268040657043457)",432016.0,15.268041,17.755341,0.859913
0,26,"(492565, 14.56356143951416)",492565.0,14.563561,17.755341,0.820236
...,...,...,...,...,...,...
9921,23187,"(45684, 12.045822143554688)",45684.0,12.045822,19.840441,0.607135
9921,23187,"(38969, 12.024578094482422)",38969.0,12.024578,19.840441,0.606064
9921,23187,"(127722, 12.022064208984375)",127722.0,12.022064,19.840441,0.605937
9921,23187,"(13083, 12.01167106628418)",13083.0,12.011671,19.840441,0.605414


# Exploring recommendations

In [312]:
top_users = interactions.groupBy(interactions['user_id']).agg({'rating':"count"}).sort("count(rating)", ascending=False).dropna().limit(10)

In [313]:
top_users.show()

+-------+-------------+
|user_id|count(rating)|
+-------+-------------+
|   9880|          195|
|   6678|          150|
|  15833|          148|
|  18539|          143|
|  19445|          132|
|   7261|          131|
|  14864|          129|
|  16279|          118|
|  17077|          115|
|   9217|          115|
+-------+-------------+



In [314]:
top_user_list = [row.user_id for row in top_users.select('user_id').collect()]

In [315]:
d = {}
for user in top_user_list:
    rec = userRecs.where(userRecs.user_id == user).select("recommendations").collect()
    d[user] = [i.book_id for i in rec[0]["recommendations"]]
d.keys()

                                                                                

dict_keys([9880, 6678, 15833, 18539, 19445, 7261, 14864, 16279, 17077, 9217])

In [316]:
rec_1 = books.filter(books["book_id"].isin(d[7261]))

In [317]:
rec_1.show()

[Stage 3885:>                                                       (0 + 8) / 8]

+-------+--------------------+-----------+--------------------+-----------------+--------------+----------------+--------------------+--------------------+-------------+
|book_id|               title|      genre|             authors|        publisher|average_rating|publication_year|     popular_shelves|       similar_books|ratings_count|
+-------+--------------------+-----------+--------------------+-----------------+--------------+----------------+--------------------+--------------------+-------------+
|  47733|          Love Poems|     Poetry|[{'author_id': '2...|    Mariner Books|          4.18|          1999.0|[{'count': '1631'...|['162904', '40014...|         1214|
| 864758|Knocked Out by My...|Young Adult|[{'author_id': '6...|   Harper Collins|          3.97|          2006.0|[{'count': '6704'...|['405838', '51618...|          957|
| 147924|            Alphabet|     Poetry|[{'author_id': '8...|   New Directions|          4.42|          2001.0|[{'count': '796',...|['6923043', '139

                                                                                

Young adult and history/biography. The user has read a lot of history/biography but not much young adult so thats a bit of a strange recommendation.

In [318]:
user1_books = [x[0] for x in interactions.filter(interactions.user_id == 7261).select('book_id').collect()]
user1 = books.filter(books["book_id"].isin(user1_books))
user1.show()

+-------+--------------------+-----------------+--------------------+--------------------+--------------+----------------+--------------------+--------------------+-------------+
|book_id|               title|            genre|             authors|           publisher|average_rating|publication_year|     popular_shelves|       similar_books|ratings_count|
+-------+--------------------+-----------------+--------------------+--------------------+--------------+----------------+--------------------+--------------------+-------------+
| 118222|The Corpse in Ooz...|          Mystery|[{'author_id': '6...|The Mysterious Press|          3.76|          1988.0|[{'count': '149',...|['673410', '86058...|          453|
| 128605|Chester Cricket's...|         Children|[{'author_id': '1...|            Yearling|           3.7|          1984.0|[{'count': '149',...|['460620', '10460...|          266|
|  12937|See You Around, S...|         Children|[{'author_id': '2...|            Yearling|          3.66|

In [319]:
rec_2 = books.filter(books["book_id"].isin(d[17077]))
rec_2.show()

+-------+--------------------+-----------------+--------------------+--------------------+--------------+----------------+--------------------+--------------------+-------------+
|book_id|               title|            genre|             authors|           publisher|average_rating|publication_year|     popular_shelves|       similar_books|ratings_count|
+-------+--------------------+-----------------+--------------------+--------------------+--------------+----------------+--------------------+--------------------+-------------+
| 864758|Knocked Out by My...|      Young Adult|[{'author_id': '6...|      Harper Collins|          3.97|          2006.0|[{'count': '6704'...|['405838', '51618...|          957|
| 147924|            Alphabet|           Poetry|[{'author_id': '8...|      New Directions|          4.42|          2001.0|[{'count': '796',...|['6923043', '1397...|          596|
| 177820|       Ester's Child|History/Biography|[{'author_id': '5...|                null|          4.01|

In [320]:
user2_books = [x[0] for x in interactions.filter(interactions.user_id == 17077).select('book_id').collect()]
user2 = books.filter(books["book_id"].isin(user2_books))
user2.show()

+-------+--------------------+-----------------+--------------------+--------------------+--------------+----------------+--------------------+--------------------+-------------+
|book_id|               title|            genre|             authors|           publisher|average_rating|publication_year|     popular_shelves|       similar_books|ratings_count|
+-------+--------------------+-----------------+--------------------+--------------------+--------------+----------------+--------------------+--------------------+-------------+
|  14654|Encyclopedia Brow...|         Children|[{'author_id': '9...|            Yearling|          4.04|          2000.0|[{'count': '303',...|['363271', '51071...|         3043|
|   1496|  Iphigenia in Aulis|           Poetry|[{'author_id': '9...|Ivan R. Dee Publi...|          3.99|          1997.0|[{'count': '104',...|['237794', '76170...|         2211|
|  16320|Witness for the P...|          Mystery|[{'author_id': '1...|HarperCollins Pub...|          4.05|

Has read some of every genre so any recommendation would make sense really.