In [67]:
import pandas as pd
import numpy as np

# ! pip install pyspark
from pyspark.ml.evaluation import RegressionEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
from pyspark.sql.functions import explode, col, round
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.sql import SparkSession


In [12]:
spark = SparkSession \
    .builder \
    .appName("Book Rec System") \
    .getOrCreate()

In [214]:
books = pd.read_csv("data/books.csv")
books.head()

Unnamed: 0,isbn,text_reviews_count,series,country_code,language_code,popular_shelves,asin,is_ebook,average_rating,kindle_asin,...,publication_month,edition_information,publication_year,url,image_url,book_id,ratings_count,work_id,title,title_without_series
0,0850308712,5,[],US,,"[{'count': '32', 'name': 'to-read'}, {'count':...",,False,3.4,,...,,,,https://www.goodreads.com/book/show/287140.Run...,https://images.gr-assets.com/books/1413219371m...,0,15,278577,Runic Astrology: Starcraft and Timekeeping in ...,Runic Astrology: Starcraft and Timekeeping in ...
1,0842379428,566,[],US,eng,"[{'count': '6393', 'name': 'to-read'}, {'count...",,False,4.26,B000FCKCJC,...,,,,https://www.goodreads.com/book/show/89376.Heaven,https://images.gr-assets.com/books/1406508230m...,1,7345,86257,Heaven,Heaven
2,037583687X,615,[],US,,"[{'count': '4248', 'name': 'to-read'}, {'count...",,False,3.98,B0010SEMV4,...,7.0,,2006.0,https://www.goodreads.com/book/show/89377.Penn...,https://images.gr-assets.com/books/1320470906m...,2,6949,86258,Penny from Heaven,Penny from Heaven
3,0590417010,193,[],US,eng,"[{'count': '450', 'name': 'to-read'}, {'count'...",,False,4.43,B017RORXNI,...,9.0,,1995.0,https://www.goodreads.com/book/show/89378.Dog_...,https://images.gr-assets.com/books/1360057676m...,3,1331,86259,Dog Heaven,Dog Heaven
4,0553576348,41,[],US,,"[{'count': '487', 'name': 'to-read'}, {'count'...",,False,4.09,B008WOUJOI,...,,,,https://www.goodreads.com/book/show/89379.Hell...,https://images.gr-assets.com/books/1320506798m...,4,627,86260,Hello from Heaven: A New Field of Research-Aft...,Hello from Heaven: A New Field of Research-Aft...


In [222]:
interactions[interactions['user_id'] == 840]
(interactions == 0).astype(int).sum(axis=0)

user_id          182
book_id           11
is_read        72235
rating         73940
is_reviewed    96504
dtype: int64

In [221]:
interactions = pd.read_csv("data/interactions.csv")
interactions.describe()

Unnamed: 0,user_id,book_id,is_read,rating,is_reviewed
count,102692.0,102692.0,102692.0,102692.0,102692.0
mean,499.956774,2033.030976,0.296586,1.094944,0.060258
std,286.278383,1166.001171,0.456755,1.827248,0.237965
min,0.0,0.0,0.0,0.0,0.0
25%,253.0,1021.0,0.0,0.0,0.0
50%,502.0,2022.0,0.0,0.0,0.0
75%,751.0,3028.0,1.0,3.0,0.0
max,985.0,4120.0,1.0,5.0,1.0


In [7]:
books.columns

Index(['isbn', 'text_reviews_count', 'series', 'country_code', 'language_code',
       'popular_shelves', 'asin', 'is_ebook', 'average_rating', 'kindle_asin',
       'similar_books', 'description', 'format', 'link', 'authors',
       'publisher', 'num_pages', 'publication_day', 'isbn13',
       'publication_month', 'edition_information', 'publication_year', 'url',
       'image_url', 'book_id', 'ratings_count', 'work_id', 'title',
       'title_without_series'],
      dtype='object')

In [9]:
books = books[['text_reviews_count', 'popular_shelves', 'average_rating', 'similar_books', 'description', 'authors',
       'publisher', 'num_pages', 'publication_year','book_id', 'ratings_count', 'title']]

In [118]:
ratings = spark.read.format("csv").option("header","true").load("data/interactions.csv")

In [119]:
ratings = ratings.withColumn("user_id",ratings["user_id"].cast("int"))
ratings = ratings.withColumn("rating",ratings["rating"].cast("int"))
ratings = ratings.withColumn("book_id",ratings["book_id"].cast("int"))

In [120]:
ratings.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- book_id: integer (nullable = true)
 |-- is_read: string (nullable = true)
 |-- rating: integer (nullable = true)
 |-- is_reviewed: string (nullable = true)



In [121]:
ratings.show(n=5)

+-------+-------+-------+------+-----------+
|user_id|book_id|is_read|rating|is_reviewed|
+-------+-------+-------+------+-----------+
|      0|   1152|      0|     0|          0|
|      0|   2140|      1|     4|          0|
|      0|   2139|      1|     3|          0|
|      0|   2138|      0|     0|          0|
|      0|   2137|      1|     4|          0|
+-------+-------+-------+------+-----------+
only showing top 5 rows



In [105]:
ratings.describe().show()

+-------+-----------------+------------------+-------------------+------------------+-------------------+
|summary|          user_id|           book_id|            is_read|            rating|        is_reviewed|
+-------+-----------------+------------------+-------------------+------------------+-------------------+
|  count|           102692|            102692|             102692|            102692|             102692|
|   mean|499.9567736532544| 2033.030976122775|0.29658590737350526|1.0949441047014372|0.06025785845051221|
| stddev|  286.27838343667|1166.0011713282875|0.45675457138247505| 1.827247903970249| 0.2379651242829317|
|    min|                0|                 0|                  0|                 0|                  0|
|    max|              985|              4120|                  1|                 5|                  1|
+-------+-----------------+------------------+-------------------+------------------+-------------------+



In [20]:
(training, test) = ratings.randomSplit([0.8, 0.2])

# Build the recommendation model using ALS on the training data
# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
als = ALS(maxIter=5, regParam=0.01, userCol="user_id", itemCol="book_id", ratingCol="rating",
          coldStartStrategy="drop", nonnegative = True, implicitPrefs = False)
model = als.fit(training)

[Stage 3:>                                                          (0 + 1) / 1]                                                                                

In [21]:
param_grid = ParamGridBuilder() \
 .addGrid(als.rank, [10, 50, 75, 100]) \
 .addGrid(als.maxIter, [5, 50, 75, 100]) \
 .addGrid(als.regParam, [.01, .05, .1, .15]) \
 .build()
# Define evaluator as RMSE
evaluator = RegressionEvaluator(metricName = "rmse", 
 labelCol = "rating", 
 predictionCol = "prediction")
# Print length of evaluator
print ("Num models to be tested using param_grid: ", len(param_grid))

Num models to be tested using param_grid:  64


In [85]:
cv = CrossValidator(estimator = als, 
 estimatorParamMaps = param_grid, 
 evaluator = evaluator, 
 numFolds = 5)

model = als.fit(training)
predictions = model.transform(test)

predictions.show(n = 10)

+-------+-------+-------+------+-----------+-----------+
|user_id|book_id|is_read|rating|is_reviewed| prediction|
+-------+-------+-------+------+-----------+-----------+
|    148|    252|      1|     3|          0|  2.6050725|
|    148|    586|      0|     0|          0|  0.3192272|
|    148|    919|      0|     0|          0| 0.28434813|
|    148|    984|      1|     3|          0|   1.370568|
|    148|   1176|      0|     0|          0|0.066068426|
|    148|   1200|      0|     0|          0|  0.7834935|
|    148|   1600|      0|     0|          0|   0.600194|
|    148|   2438|      1|     0|          0|   1.750623|
|    148|   2533|      1|     4|          0| 0.35663316|
|    148|   3103|      1|     4|          0|   1.405108|
+-------+-------+-------+------+-----------+-----------+
only showing top 10 rows



In [97]:
predictions = predictions.withColumn("rounded_prediction", round(col("prediction"), 0))
predictions = predictions.withColumn("rounded_prediction",predictions["rounded_prediction"].cast("int"))


In [87]:
new_pred = predictions.drop(col("prediction"))
new_pred = new_pred.withColumnRenamed("rounded_prediction", "prediction")
new_pred.show(n=10)

+-------+-------+-------+------+-----------+----------+
|user_id|book_id|is_read|rating|is_reviewed|prediction|
+-------+-------+-------+------+-----------+----------+
|    148|    252|      1|     3|          0|       3.0|
|    148|    586|      0|     0|          0|       0.0|
|    148|    919|      0|     0|          0|       0.0|
|    148|    984|      1|     3|          0|       1.0|
|    148|   1176|      0|     0|          0|       0.0|
|    148|   1200|      0|     0|          0|       1.0|
|    148|   1600|      0|     0|          0|       1.0|
|    148|   2438|      1|     0|          0|       2.0|
|    148|   2533|      1|     4|          0|       0.0|
|    148|   3103|      1|     4|          0|       1.0|
+-------+-------+-------+------+-----------+----------+
only showing top 10 rows



In [79]:
df = predictions.select("*").toPandas()
# df['pred_round'] = df['prediction'].round().astype('int')
df

Unnamed: 0,user_id,book_id,is_read,rating,is_reviewed,prediction,rounded_prediction
0,148,252,1,3,0,2.605072,3
1,148,586,0,0,0,0.319227,0
2,148,919,0,0,0,0.284348,0
3,148,984,1,3,0,1.370568,1
4,148,1176,0,0,0,0.066068,0
...,...,...,...,...,...,...,...
20658,517,3437,0,0,0,0.042132,0
20659,517,3578,1,4,0,1.645570,2
20660,517,3757,0,0,0,0.031650,0
20661,517,3940,0,0,0,0.235914,0


In [88]:
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 1.8117908143016066


In [93]:
predictions = predictions.withColumn("prediction",predictions["prediction"].cast("double"))

eval_accuracy = MulticlassClassificationEvaluator(labelCol="rating", predictionCol="prediction", metricName="accuracy")
eval_precision = MulticlassClassificationEvaluator(labelCol="rating", predictionCol="prediction", metricName="precisionByLabel")
eval_recall = MulticlassClassificationEvaluator(labelCol="rating", predictionCol="prediction", metricName="recallByLabel")
eval_f1 = MulticlassClassificationEvaluator(labelCol="rating", predictionCol="prediction", metricName="f1")

accuracy = eval_accuracy.evaluate(predictions)
precision = eval_precision.evaluate(predictions)
recall = eval_recall.evaluate(predictions)
f1score = eval_f1.evaluate(predictions)

In [None]:
print(accuracy, precision, recall, f1score)
predictions.show(n=10)

In [101]:
new_pred = new_pred.withColumn("prediction",new_pred["prediction"].cast("double"))

eval_accuracy = MulticlassClassificationEvaluator(labelCol="rating", predictionCol="prediction", metricName="accuracy")
eval_precision = MulticlassClassificationEvaluator(labelCol="rating", predictionCol="prediction", metricName="precisionByLabel")
eval_recall = MulticlassClassificationEvaluator(labelCol="rating", predictionCol="prediction", metricName="recallByLabel")
eval_f1 = MulticlassClassificationEvaluator(labelCol="rating", predictionCol="prediction", metricName="f1")

accuracy = eval_accuracy.evaluate(new_pred)
precision = eval_precision.evaluate(new_pred)
recall = eval_recall.evaluate(new_pred)
f1score = eval_f1.evaluate(new_pred)

In [102]:
print(accuracy, precision, recall, f1score)
new_pred.show(n=10)

0.4493055219474423 0.890937338166753 0.5779360386992743 0.5410137083997811
+-------+-------+-------+------+-----------+----------+
|user_id|book_id|is_read|rating|is_reviewed|prediction|
+-------+-------+-------+------+-----------+----------+
|    148|    252|      1|   3.0|          0|       3.0|
|    148|    586|      0|   0.0|          0|       0.0|
|    148|    919|      0|   0.0|          0|       0.0|
|    148|    984|      1|   3.0|          0|       1.0|
|    148|   1176|      0|   0.0|          0|       0.0|
|    148|   1200|      0|   0.0|          0|       1.0|
|    148|   1600|      0|   0.0|          0|       1.0|
|    148|   2438|      1|   0.0|          0|       2.0|
|    148|   2533|      1|   4.0|          0|       0.0|
|    148|   3103|      1|   4.0|          0|       1.0|
+-------+-------+-------+------+-----------+----------+
only showing top 10 rows



In [25]:
# Generate n recommendations for all users
ALS_recommendations = model.recommendForAllUsers(numItems = 10) # n — 10
ALS_recommendations.show(n = 10)
# Temporary table
ALS_recommendations.registerTempTable("ALS_recs_temp")
clean_recs = spark.sql("""SELECT user_id,
                            movieIds_and_ratings.book_id AS book_id,
                            movieIds_and_ratings.rating AS prediction
                        FROM ALS_recs_temp
                        LATERAL VIEW explode(recommendations) exploded_table
                            AS movieIds_and_ratings""")
clean_recs.show()
# Recommendations for unread books
(clean_recs.join(data, ["user_id", "book_id"], "left")
    .filter(data.rating.isNull()).show())
new_books = (clean_recs.join(data, ["user_id", "book_id"], "left")
    .filter(data.rating.isNull()))

                                                                                

+-------+--------------------+
|user_id|     recommendations|
+-------+--------------------+
|      1|[{2819, 21.728657...|
|      3|[{1925, 4.797371}...|
|      6|[{1470, 7.576744}...|
|     12|[{1572, 13.686593...|
|     13|[{3973, 11.968611...|
|     16|[{2363, 10.698648...|
|     20|[{139, 9.438195},...|
|     22|[{2819, 17.328636...|
|     26|[{55, 3.8167355},...|
|     27|[{3351, 3.5918908...|
+-------+--------------------+
only showing top 10 rows



                                                                                

+-------+-------+----------+
|user_id|book_id|prediction|
+-------+-------+----------+
|      1|   2819| 21.728657|
|      1|   3665| 21.721382|
|      1|   2864| 20.891443|
|      1|   2817| 18.995947|
|      1|    611| 16.814068|
|      1|   3556| 16.517403|
|      1|   2427| 16.447178|
|      1|    931| 15.123379|
|      1|     21| 14.564141|
|      1|   1308| 14.135394|
|      3|   1925|  4.797371|
|      3|   2549| 4.7153983|
|      3|   4094| 4.5428457|
|      3|   3695| 4.4199724|
|      3|   1508| 4.2795377|
|      3|   2760|  4.231721|
|      3|   1071| 4.2214055|
|      3|   1614|  4.210667|
|      3|   1172| 4.1264715|
|      3|   2044| 4.0814023|
+-------+-------+----------+
only showing top 20 rows



                                                                                

+-------+-------+----------+-------+------+-----------+
|user_id|book_id|prediction|is_read|rating|is_reviewed|
+-------+-------+----------+-------+------+-----------+
|      1|   2819| 21.728657|   null|  null|       null|
|      1|   3665| 21.721382|   null|  null|       null|
|      1|   2864| 20.891443|   null|  null|       null|
|      1|   2817| 18.995947|   null|  null|       null|
|      1|    611| 16.814068|   null|  null|       null|
|      1|   3556| 16.517403|   null|  null|       null|
|      1|   2427| 16.447178|   null|  null|       null|
|      1|    931| 15.123379|   null|  null|       null|
|      1|     21| 14.564141|   null|  null|       null|
|      1|   1308| 14.135394|   null|  null|       null|
|      3|   1925|  4.797371|   null|  null|       null|
|      3|   2549| 4.7153983|   null|  null|       null|
|      3|   4094| 4.5428457|   null|  null|       null|
|      3|   3695| 4.4199724|   null|  null|       null|
|      3|   1508| 4.2795377|   null|  null|     

In [48]:
userRecs = model.recommendForAllUsers(10)
# Generate top 10 user recommendations for each movie
bookRecs = model.recommendForAllItems(10)



In [50]:
bookRecs.show()



+-------+--------------------+
|book_id|     recommendations|
+-------+--------------------+
|     26|[{0, 0.0}, {10, 0...|
|     27|[{0, 0.0}, {10, 0...|
|     28|[{0, 0.0}, {10, 0...|
|     31|[{0, 0.0}, {10, 0...|
|     34|[{9, 5.4554815}, ...|
|     44|[{460, 10.205811}...|
|     53|[{0, 0.0}, {10, 0...|
|     65|[{602, 9.247291},...|
|     76|[{205, 8.622194},...|
|     78|[{0, 0.0}, {10, 0...|
|     81|[{884, 6.0798645}...|
|     85|[{0, 0.0}, {10, 0...|
|    101|[{541, 10.089077}...|
|    103|[{541, 5.491219},...|
|    108|[{539, 7.6309114}...|
|    115|[{460, 6.7112637}...|
|    126|[{55, 8.063753}, ...|
|    137|[{355, 2.8915582}...|
|    148|[{0, 0.0}, {10, 0...|
|    155|[{205, 9.592584},...|
+-------+--------------------+
only showing top 20 rows



                                                                                

In [53]:
book_recs = bookRecs.select("*").toPandas()

                                                                                

In [54]:
book_recs

Unnamed: 0,book_id,recommendations
0,26,"[(0, 0.0), (10, 0.0), (20, 0.0), (30, 0.0), (4..."
1,27,"[(0, 0.0), (10, 0.0), (20, 0.0), (30, 0.0), (4..."
2,28,"[(0, 0.0), (10, 0.0), (20, 0.0), (30, 0.0), (4..."
3,31,"[(0, 0.0), (10, 0.0), (20, 0.0), (30, 0.0), (4..."
4,34,"[(9, 5.45548152923584), (541, 4.90243196487426..."
...,...,...
4040,4095,"[(0, 0.0), (10, 0.0), (20, 0.0), (30, 0.0), (4..."
4041,4106,"[(205, 11.784050941467285), (9, 10.53423690795..."
4042,4108,"[(884, 11.928763389587402), (700, 10.219456672..."
4043,4112,"[(103, 11.286194801330566), (55, 10.4209899902..."


In [63]:
movie_recs.loc[0,:].recommendations

[Row(user_id=0, rating=0.0),
 Row(user_id=10, rating=0.0),
 Row(user_id=20, rating=0.0),
 Row(user_id=30, rating=0.0),
 Row(user_id=40, rating=0.0),
 Row(user_id=50, rating=0.0),
 Row(user_id=60, rating=0.0),
 Row(user_id=70, rating=0.0),
 Row(user_id=80, rating=0.0),
 Row(user_id=90, rating=0.0)]

In [140]:
books = spark.read.format("csv").option("header","true").load("data/books.csv")

In [164]:
cols = ("isbn","is_ebook","kindle_asin", "country_code", "language_code", "asin","description", "format", "link", "publication_day", "isbn13",
        "publication_month", "edition_information", "url", "image_url", "work_id", "text_reviews_count", "title_without_series")

books = books.drop(*cols)
books = books.select("book_id","title","series","authors","publisher", "average_rating", "publication_year", "popular_shelves", "similar_books", "ratings_count")

In [165]:
books.show(n=5)

+-------+--------------------+--------------------+--------------------+---------+--------------------+----------------+--------------------+--------------------+-------------+
|book_id|               title|              series|             authors|publisher|      average_rating|publication_year|     popular_shelves|       similar_books|ratings_count|
+-------+--------------------+--------------------+--------------------+---------+--------------------+----------------+--------------------+--------------------+-------------+
|      0|Runic Astrology: ...|                  []|[{'author_id': '1...|     null|                 3.4|            null|[{'count': '32', ...|                  []|           15|
|   null|                null|                  []|                null|     null|                4.26|            null|[{'count': '6393'...|['45366', '246849...|         null|
|   null|                null| Dr. Randy Alcorn...|                null|     null|                null|            

In [166]:
userRecs.show(n=5)



+-------+--------------------+
|user_id|     recommendations|
+-------+--------------------+
|      1|[{2819, 21.728657...|
|      3|[{1925, 4.797371}...|
|      6|[{1470, 7.576744}...|
|     12|[{1572, 13.686593...|
|     13|[{3973, 11.968611...|
+-------+--------------------+
only showing top 5 rows



                                                                                

In [187]:
top_users = ratings.groupBy(ratings['user_id']).agg({'rating':"count"}).sort("count(rating)", ascending=False).dropna().limit(10)

In [188]:
top_users.show()

+-------+-------------+
|user_id|count(rating)|
+-------+-------------+
|    840|          874|
|    706|          684|
|    156|          604|
|    253|          574|
|    671|          553|
|    894|          550|
|    581|          540|
|    755|          518|
|    259|          508|
|    474|          503|
+-------+-------------+



In [189]:
top_user_list = [row.user_id for row in top_users.select('user_id').collect()]

In [190]:
d = {}

for user in top_user_list:
    rec = userRecs.where(userRecs.user_id == user).select("recommendations").collect()
    d[user] = [i.book_id for i in rec[0]["recommendations"]]

d

{840: [2752, 296, 1870, 1670, 2651, 3876, 2579, 3471, 895, 4051],
 706: [3957, 827, 0, 1392, 569, 1336, 3374, 1470, 1356, 2],
 156: [2819, 3665, 2864, 2817, 2427, 611, 3556, 931, 21, 1133],
 253: [3973, 1717, 3471, 3127, 3425, 3657, 3351, 255, 3113, 2596],
 671: [1925, 4094, 2549, 1614, 3695, 2488, 1508, 1071, 2044, 2760],
 894: [3973, 1717, 3127, 3471, 3351, 3425, 255, 3657, 3113, 2300],
 581: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90],
 755: [692, 691, 538, 1695, 1791, 4098, 3322, 2385, 2948, 1606],
 259: [1831, 1925, 643, 1572, 251, 1172, 2907, 3937, 2549, 2945],
 474: [3665, 2819, 2864, 2817, 611, 3556, 2427, 931, 1308, 21]}

In [209]:
rec_1 = books.filter(books["book_id"].isin(d[581]))

In [210]:
rec = books.filter(books["book_id"].isin(296))
rec.show()


+-------+-----+------+-------+---------+--------------+----------------+---------------+-------------+-------------+
|book_id|title|series|authors|publisher|average_rating|publication_year|popular_shelves|similar_books|ratings_count|
+-------+-----+------+-------+---------+--------------+----------------+---------------+-------------+-------------+
+-------+-----+------+-------+---------+--------------+----------------+---------------+-------------+-------------+



In [211]:
rec_1.show()

+-------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------+
|book_id|               title|              series|             authors|           publisher|      average_rating|    publication_year|     popular_shelves|       similar_books|ratings_count|
+-------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------+
|      0|Runic Astrology: ...|                  []|[{'author_id': '1...|                null|                 3.4|                null|[{'count': '32', ...|                  []|           15|
|     20|Constructing the ...|                  []|[{'author_id': '1...|University Of Chi...|                3.82|              1988.0|[{'count': '51', ...|                  []|           65|
|     30|One Pound Gospel,...|          

In [212]:
rec_1.select('authors').show(truncate=False)

+-------------------------------------+
|authors                              |
+-------------------------------------+
|[{'author_id': '149918', 'role': ''}]|
|[{'author_id': '199983', 'role': ''}]|
|[{'author_id': '12948', 'role': ''}] |
|Peter Owen Publishers                |
|[{'author_id': '10229', 'role': ''}] |
+-------------------------------------+



In [213]:
rec_1.select('title').show(truncate=False)

+--------------------------------------------------------------------+
|title                                                               |
+--------------------------------------------------------------------+
|Runic Astrology: Starcraft and Timekeeping in the Northern Tradition|
|Constructing the Political Spectacle                                |
|One Pound Gospel, Volume 3. Knuckle Sandwich.                       |
|Weights and Measures                                                |
|The Berlin Stories: The Last of Mr Norris/Goodbye to Berlin         |
+--------------------------------------------------------------------+



The Postmodern Condition: A Report on Knowledge = Philosophy(French), The Nimrod Flipout: Stories = Short Stories(Jewish/Hebrew), La Tempête = French, The Marx-Engels Reader = Philosophy(Russia), A Pawn for a Queen (Ursula Blanchard, #6) = Historical, My Discovery of America = Nonfiction(Russia)

In [186]:
user_1 = ratings.filter(ratings["user_id"].isin(481))
l = [row.book_id for row in user_1.select('book_id').collect()]
l

[2652,
 3009,
 568,
 565,
 567,
 566,
 1961,
 4032,
 4031,
 1898,
 3168,
 3169,
 1057,
 3262,
 3413,
 3044,
 1899,
 1483,
 3905,
 1056,
 1559,
 1464,
 3279,
 610,
 2027,
 3292,
 1465,
 3434,
 3432,
 3433,
 783,
 393,
 3919,
 2542,
 2616,
 809,
 1921,
 2868,
 3511,
 1895,
 638,
 2168,
 3183,
 2134,
 2395,
 163,
 3188,
 1293,
 1643,
 2353,
 34,
 637,
 4003,
 928,
 1362,
 126,
 1558,
 1350,
 395,
 2285,
 2284,
 2274,
 3490,
 319,
 2356,
 2495,
 4028,
 1769,
 1500,
 1501,
 4079,
 4026,
 169,
 1140,
 1138,
 1135,
 1139,
 1361,
 2676,
 2677,
 3382,
 1897,
 1896,
 427,
 3826,
 2470,
 1360,
 2258,
 3058,
 2982,
 1732,
 1329,
 3057,
 1668,
 3703,
 3323,
 1030,
 914,
 3862,
 2526,
 152,
 2995,
 2588,
 1480,
 873,
 3439,
 1657,
 1775,
 4004,
 1159,
 551,
 1830,
 2276,
 2389,
 2524,
 2523,
 3383,
 2379,
 3071,
 600,
 3384,
 2528,
 3942,
 3291,
 2587,
 1136,
 1752,
 420,
 3903,
 2825,
 3832,
 4005,
 454,
 954,
 3697,
 970,
 1163,
 646,
 2633,
 377,
 3608,
 1694,
 571,
 2823,
 3941,
 3068,
 3049,
 3