# Import Libraries & Tools

In [1]:
import pandas as pd
import numpy as np

from scipy.sparse import coo_matrix
from sklearn.metrics.pairwise import cosine_similarity

# Import Data

In [2]:
#load the books we liked; book_id is cast to str so it can be compared with
#the string ids used by the other data files
my_books = (
    pd.read_csv('liked_books.csv')
    .assign(book_id=lambda liked: liked['book_id'].astype(str))
)
my_books.head()

Unnamed: 0,user_id,book_id,rating,title
0,-1,434903,5,The Rings of Saturn
1,-1,11047557,5,The Lord of the Rings
2,-1,29983711,5,Pachinko
3,-1,12073240,5,The Remains of the Day
4,-1,11297,5,Norwegian Wood


# Find Similar Users

In [3]:
#build a lookup from the first column of book_id_map.csv (the csv id) to the
#second column (the book_id); every line must contain exactly two fields
csv_book_mapping = {}

with open("book_id_map.csv", "r") as f:
    for line in f:
        csv_id, book_id = line.strip().split(",")
        csv_book_mapping[csv_id] = book_id

#a set drops duplicate ids from our list and gives fast membership checks
book_set = set(my_books['book_id'])

In [4]:
#for every user, count the interaction rows whose book is on our list
overlap_users = {}

with open("goodreads_interactions.csv", 'r') as f:
    for line in f:
        #five comma-separated fields per row; only the user and csv book id are used here
        user_id, csv_id, _, rating, _ = line.split(",")

        #.get() yields None for an unmapped csv id, which is never in book_set
        book_id = csv_book_mapping.get(csv_id)

        if book_id in book_set:
            overlap_users[user_id] = overlap_users.get(user_id, 0) + 1

In [5]:
#filter users who have read 25% or more of the books in my_books
#(>= so that a user sitting exactly on the 25% mark is kept, matching the message printed below)
min_overlap = my_books.shape[0] / 4
filtered_overlap_users = {
    user for user, n_read in overlap_users.items() if n_read >= min_overlap
}

print(f'# of users who have read at least 1 book on our list: {len(overlap_users)}')
print(f'# of users who have read at least 25% of books on our list: {len(filtered_overlap_users)}')

# of users who have read at least 1 book on our list: 97461
# of users who have read at least 25% of books on our list: 92


# Find Similar Users Ratings

In [6]:
#collect every [user_id, book_id, rating] row that belongs to one of the filtered users
interactions_list = []

with open("goodreads_interactions.csv", 'r') as f:
    for line in f:
        user_id, csv_id, _, rating, _ = line.split(",")

        #skip rows of users that did not pass the overlap filter
        if user_id not in filtered_overlap_users:
            continue

        #direct lookup: an unmapped csv id raises KeyError here
        book_id = csv_book_mapping[csv_id]
        interactions_list.append([user_id, book_id, rating])

# Create Collaborative Filtering Matrix

In [7]:
#frame of the similar users' rows, with our own liked books stacked on top
#(row labels are kept as-is, so they restart where the second frame begins)
similar_rows = pd.DataFrame(interactions_list, columns=["user_id", "book_id", "rating"])
own_rows = my_books.loc[:, ["user_id", "book_id", "rating"]]
interactions = pd.concat([own_rows, similar_rows])
interactions

Unnamed: 0,user_id,book_id,rating
0,-1,434903,5
1,-1,11047557,5
2,-1,29983711,5
3,-1,12073240,5
4,-1,11297,5
...,...,...,...
1139179,437856,30091914,5
1139180,437856,23281518,4
1139181,437856,27430351,5
1139182,437856,33295190,0


In [8]:
#ids become strings and ratings become numbers so both data sources share dtypes
interactions = interactions.assign(
    book_id=interactions["book_id"].astype(str),
    user_id=interactions["user_id"].astype(str),
    rating=pd.to_numeric(interactions["rating"]),
)

#category codes give every distinct user / book id a sequential integer position
interactions = interactions.assign(
    user_index=interactions["user_id"].astype("category").cat.codes,
    book_index=interactions["book_id"].astype("category").cat.codes,
)

#sparse matrix: one row per user_index, one column per book_index, values are ratings
matrix_values = interactions["rating"]
matrix_coords = (interactions["user_index"], interactions["book_index"])
ratings_mat_coo = coo_matrix((matrix_values, matrix_coords))
ratings_mat_coo.shape

(93, 421622)

# Finding Users Similar To Us

In [9]:
#convert the COO matrix to CSR format
ratings_mat = ratings_mat_coo.tocsr()

#show the first rows that belong to us (user_id "-1"), including our user_index
is_me = interactions["user_id"] == "-1"
interactions.loc[is_me].head()

Unnamed: 0,user_id,book_id,rating,user_index,book_index
0,-1,434903,5,0,323091
1,-1,11047557,5,0,14591
2,-1,29983711,5,0,269638
3,-1,12073240,5,0,28964
4,-1,11297,5,0,18294


In [10]:
#look up the user_index assigned to our own rows (user_id "-1") rather than
#assuming it is 0, so the cell stays correct if the category ordering changes
my_index = int(interactions.loc[interactions["user_id"] == "-1", "user_index"].iloc[0])

#get similarities to other users (closer to 1 = more similar)
similarity = cosine_similarity(ratings_mat[my_index, :], ratings_mat).flatten()

#sanity check: our similarity with ourselves should be (numerically) 1
similarity[my_index]

1.0000000000000002

In [11]:
#finds X indices of most similar users to us in terms of book tastes
find_similar_users = 30

#never request more users than exist: np.argpartition raises ValueError when
#kth is out of bounds, which would happen if fewer than X users passed the filter
top_k = min(find_similar_users, similarity.shape[0])

#argpartition places the top_k largest similarities in the last top_k slots (unordered)
indices = np.argpartition(similarity, -top_k)[-top_k:]
indices

array([28, 35, 82, 52, 67, 87, 19, 70, 21, 43, 73, 42, 75, 41,  2, 78, 85,
       29, 27, 53, 58, 80, 79, 72, 51, 39, 38, 22, 33,  0], dtype=int64)

In [12]:
#keep the rows of the users selected by `indices`, leaving out our own rows (user_id "-1")
in_top_users = interactions["user_index"].isin(indices)
not_me = interactions["user_id"] != "-1"
similar_users = interactions[in_top_users & not_me].copy()
similar_users

Unnamed: 0,user_id,book_id,rating,user_index,book_index
13628,3199,939870,4,53,412000
13629,3199,15654,4,53,73077
13630,3199,5204,3,53,335171
13631,3199,17650,4,53,109826
13632,3199,42155,4,53,321172
...,...,...,...,...,...
1139179,437856,30091914,5,79,270741
1139180,437856,23281518,4,79,193283
1139181,437856,27430351,5,79,247153
1139182,437856,33295190,0,79,293965


# Getting Book Recommendations

In [13]:
#per book: number of rating rows from the similar users (count) and their average rating (mean)
#NOTE(review): ratings of 0 are included in the mean - confirm whether 0 means "not rated"
ratings_by_book = similar_users.groupby("book_id")["rating"]
book_recs = ratings_by_book.agg(["count", "mean"])
book_recs

Unnamed: 0_level_0,count,mean
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,16,3.937500
10000,7,0.714286
10000191,1,0.000000
1000059,1,4.000000
100006,1,0.000000
...,...,...
9998705,1,0.000000
999894,1,0.000000
999898,1,0.000000
9999,7,0.000000


In [14]:
#load the book metadata and cast book_id to str so it matches the ids in book_recs
books_titles = (
    pd.read_json("books_titles.json")
    .assign(book_id=lambda titles: titles["book_id"].astype(str))
)

#inner join on book_id: recommendations without metadata are dropped
book_recs = pd.merge(book_recs, books_titles, how="inner", on="book_id")
book_recs

Unnamed: 0,book_id,count,mean,title,ratings,url,cover_image,mod_title
0,1,16,3.937500,Harry Potter and the Half-Blood Prince (Harry ...,1713866,https://www.goodreads.com/book/show/1.Harry_Po...,https://images.gr-assets.com/books/1361039191m...,harry potter and the halfblood prince harry po...
1,10000,7,0.714286,The Face of Another,2079,https://www.goodreads.com/book/show/10000.The_...,https://images.gr-assets.com/books/1320415026m...,the face of another
2,10000191,1,0.000000,Yellow Crocus,17787,https://www.goodreads.com/book/show/10000191-y...,https://s.gr-assets.com/assets/nophoto/book/11...,yellow crocus
3,1000059,1,4.000000,"Green Arrow, Vol. 2: Sounds of Violence",1704,https://www.goodreads.com/book/show/1000059.Gr...,https://s.gr-assets.com/assets/nophoto/book/11...,green arrow vol 2 sounds of violence
4,100006,1,0.000000,Schrodinger's Kittens and the Search for Reali...,1095,https://www.goodreads.com/book/show/100006.Sch...,https://images.gr-assets.com/books/1344269008m...,schrodingers kittens and the search for realit...
...,...,...,...,...,...,...,...,...
58516,9998705,1,0.000000,"Flash and Bones (Temperance Brennan, #14)",14249,https://www.goodreads.com/book/show/9998705-fl...,https://images.gr-assets.com/books/1306253347m...,flash and bones temperance brennan 14
58517,999894,1,0.000000,"Bell, Book and Candle",133,https://www.goodreads.com/book/show/999894.Bel...,https://s.gr-assets.com/assets/nophoto/book/11...,bell book and candle
58518,999898,1,0.000000,Twelve Pillars,901,https://www.goodreads.com/book/show/999898.Twe...,https://images.gr-assets.com/books/1360435414m...,twelve pillars
58519,9999,7,0.000000,The Box Man,2791,https://www.goodreads.com/book/show/9999.The_B...,https://images.gr-assets.com/books/1320459929m...,the box man


# Ranking Our Book Recommendations

In [15]:
#normalized count of times the book appeared for similar users relative to others:
#count * (count / ratings), then weighted by the similar users' mean rating
#NOTE(review): `ratings` is presumably the book's overall number of ratings - confirm against books_titles
#(.assign returns a new frame, so re-running the cell never writes into a filtered slice)
book_recs = book_recs.assign(
    adjusted_count=book_recs["count"] * (book_recs["count"] / book_recs["ratings"])
)
book_recs = book_recs.assign(score=book_recs["mean"] * book_recs["adjusted_count"])

#remove books we've already read based on id
book_recs = book_recs[~book_recs["book_id"].isin(my_books["book_id"])]

#create mod_title (like in other notebook): drop punctuation, lowercase, collapse whitespace
#(raw strings so that "\s" reaches the regex engine instead of being an invalid string escape)
my_books["mod_title"] = (
    my_books["title"].str.replace(r"[^a-zA-Z0-9 ]", "", regex=True)
    .str.lower()
    .str.replace(r"\s+", " ", regex=True)
)

#remove books we've already read based on mod_title
book_recs = book_recs[~book_recs["mod_title"].isin(my_books["mod_title"])]

#keep only books that appeared more than 2 times and whose mean rating is at least 4
book_recs = book_recs[(book_recs["count"] > 2) & (book_recs["mean"] >= 4)]

#sort based on score
top_recs = book_recs.sort_values("score", ascending=False)

# Improving Display of Book Recommendations

In [16]:
def make_clickable(val):
    """Return an HTML anchor labelled "Goodreads" that opens `val` in a new tab.

    `val` is inserted into the href attribute as-is (no escaping).
    """
    link_template = '<a target="_blank" href="{}">Goodreads</a>'
    return link_template.format(val)

def show_image(val):
    """Return an HTML <img> tag, 50 wide, whose source is `val`.

    `val` is inserted into the src attribute as-is (no escaping).
    """
    image_template = '<img src="{}" width=50></img>'
    return image_template.format(val)

#render the url column as a clickable link and the cover_image column as a thumbnail
top_recs.style.format(formatter={'url': make_clickable, 'cover_image': show_image})

Unnamed: 0,book_id,count,mean,title,ratings,url,cover_image,mod_title,adjusted_count,score
47282,672172,3,4.0,The Altar of the Dead,237,Goodreads,,the altar of the dead,0.037975,0.151899
39919,475080,3,4.333333,The Suppliants,351,Goodreads,,the suppliants,0.025641,0.111111
19023,19107,3,4.333333,"Diary of a Madman, The Government Inspector, Selected Stories",646,Goodreads,,diary of a madman the government inspector selected stories,0.013932,0.060372
56900,9484,3,5.0,"Within a Budding Grove (In Search of Lost Time, #2)",813,Goodreads,,within a budding grove in search of lost time 2,0.01107,0.055351
42553,556967,3,4.0,Chamber Music,652,Goodreads,,chamber music,0.013804,0.055215
21947,21976060,3,4.333333,The Art of Stillness: Adventures in Going Nowhere,2902,Goodreads,,the art of stillness adventures in going nowhere,0.003101,0.013439
47544,6786692,3,4.0,Battle Royale,2705,Goodreads,,battle royale,0.003327,0.013309
7102,12951,3,4.0,The Turn of the Screw and Other Short Fiction,2929,Goodreads,,the turn of the screw and other short fiction,0.003073,0.012291
29679,28348,4,4.5,Crime and Punishment,6652,Goodreads,,crime and punishment,0.002405,0.010824
52384,80458,3,4.666667,The Future of an Illusion,3934,Goodreads,,the future of an illusion,0.002288,0.010676
