# Import Libraries & Tools

In [35]:
import pandas as pd
import numpy as np

from scipy.sparse import coo_matrix
from sklearn.metrics.pairwise import cosine_similarity

# Import Data

In [57]:
my_books = pd.read_csv('liked_books.csv')

#ensure `book_id` is a string
my_books['book_id'] = my_books['book_id'].astype(str)
my_books.head()

Unnamed: 0,user_id,book_id,rating,title
0,-1,434903,5,The Rings of Saturn
1,-1,11047557,5,The Lord of the Rings
2,-1,29983711,5,Pachinko
3,-1,12073240,5,The Remains of the Day
4,-1,11297,5,Norwegian Wood


# Find Similar Users

In [None]:
#get the correct book_id values
csv_book_mapping = {}

with open("book_id_map.csv", "r") as f:
    while True:
        line = f.readline()
        if not line:
            break
        csv_id, book_id = line.strip().split(",")
        csv_book_mapping[csv_id] = book_id

#ensure no duplicates in our book list
book_set = set(my_books['book_id'])

In [9]:
#get # of times a user read a book on our list
overlap_users = {}

with open("goodreads_interactions.csv", 'r') as f:
    while True:
        line = f.readline()
        if not line:
            break
        user_id, csv_id, _, rating, _ = line.split(",")
        
        book_id = csv_book_mapping.get(csv_id)
        
        if book_id in book_set:
            if user_id not in overlap_users:
                overlap_users[user_id] = 1
            else:
                overlap_users[user_id] += 1

In [28]:
#filter users who have read 25% or more of the books in my_books
filtered_overlap_users = set([k for k in overlap_users if overlap_users[k] > my_books.shape[0]/4])

print(f'# of users who have read at least 1 book on our list: {len(overlap_users)}')
print(f'# of users who have read at least 25% of books on our list: {len(filtered_overlap_users)}')

# Find Similar Users Ratings

In [21]:
#get users book list
interactions_list = []

with open("goodreads_interactions.csv", 'r') as f:
    while True:
        line = f.readline()
        if not line:
            break
        user_id, csv_id, _, rating, _ = line.split(",")
        
        if user_id in filtered_overlap_users:
            book_id = csv_book_mapping[csv_id]
            interactions_list.append([user_id, book_id, rating])

# Create Collaborative Filtering Matrix

In [25]:
#create df with similar users book lists and add our book list to it
interactions = pd.DataFrame(interactions_list, columns=["user_id", "book_id", "rating"])
interactions = pd.concat([my_books[["user_id", "book_id", "rating"]], interactions])
interactions

Unnamed: 0,user_id,book_id,rating
0,-1,434903,5
1,-1,11047557,5
2,-1,29983711,5
3,-1,12073240,5
4,-1,11297,5
...,...,...,...
1139179,437856,30091914,5
1139180,437856,23281518,4
1139181,437856,27430351,5
1139182,437856,33295190,0


In [27]:
#ensure columns are correct dtypes
interactions["book_id"] = interactions["book_id"].astype(str)
interactions["user_id"] = interactions["user_id"].astype(str)
interactions["rating"] = pd.to_numeric(interactions["rating"])

#gives sequential index for user and book id
interactions["user_index"] = interactions["user_id"].astype("category").cat.codes
interactions["book_index"] = interactions["book_id"].astype("category").cat.codes

In [30]:
ratings_mat_coo = coo_matrix((interactions["rating"], (interactions["user_index"], interactions["book_index"])))
ratings_mat_coo.shape

(93, 421622)

In [36]:
ratings_mat = ratings_mat_coo.tocsr()
interactions[interactions["user_id"] == "-1"].head()

Unnamed: 0,user_id,book_id,rating,user_index,book_index
0,-1,434903,5,0,323091
1,-1,11047557,5,0,14591
2,-1,29983711,5,0,269638
3,-1,12073240,5,0,28964
4,-1,11297,5,0,18294


In [32]:
#get our index based on above
my_index = 0

In [56]:
#get similarities to other users (closer to 1 = more similar)
similarity = cosine_similarity(ratings_mat[my_index,:], ratings_mat).flatten()
similarity[0]

1.0000000000000007

In [51]:
#finds X indices of most similar users to us in terms of book tastes
find_similar_users = 30
indices = np.argpartition(similarity, -find_similar_users)[-find_similar_users:]
indices

array([28, 82, 53, 19, 67, 21, 43, 70, 42, 85, 73, 87, 75, 35, 27, 78, 52,
       58, 80, 79, 72, 51, 41, 39, 38,  2, 22, 33, 29,  0], dtype=int64)

In [46]:
#gets user id based on above indices
similar_users = interactions[interactions["user_index"].isin(indices)].copy()
similar_users = similar_users[similar_users["user_id"] != "-1"]
similar_users

Unnamed: 0,user_id,book_id,rating,user_index,book_index
16277,7869,5107,5,87,333837
16278,7869,2657,5,87,239600
16279,7869,4671,5,87,327692
16280,7869,10975,5,87,13525
16281,7869,12220,5,87,30878
...,...,...,...,...,...
1139179,437856,30091914,5,79,270741
1139180,437856,23281518,4,79,193283
1139181,437856,27430351,5,79,247153
1139182,437856,33295190,0,79,293965


In [47]:
#how many time each book appeared in above recommendations
book_recs = similar_users.groupby("book_id").rating.agg(['count', 'mean'])
book_recs

Unnamed: 0_level_0,count,mean
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,6,4.166667
10000,5,1.000000
100006,1,0.000000
10000613,2,0.000000
10001693,1,0.000000
...,...,...
9997,1,0.000000
9998,7,0.714286
999816,1,0.000000
999898,1,0.000000


In [49]:
#gets book title so that the above makes more sense
books_titles = pd.read_json("books_titles.json")
books_titles["book_id"] = books_titles["book_id"].astype(str)

#merge two datasets to get the titles into our recommendations
book_recs = book_recs.merge(books_titles, how="inner", on="book_id")
book_recs

Unnamed: 0,book_id,count,mean,title_x,ratings_x,url_x,cover_image_x,mod_title_x,title_y,ratings_y,url_y,cover_image_y,mod_title_y
0,1,6,4.166667,Harry Potter and the Half-Blood Prince (Harry ...,1713866,https://www.goodreads.com/book/show/1.Harry_Po...,https://images.gr-assets.com/books/1361039191m...,harry potter and the halfblood prince harry po...,Harry Potter and the Half-Blood Prince (Harry ...,1713866,https://www.goodreads.com/book/show/1.Harry_Po...,https://images.gr-assets.com/books/1361039191m...,harry potter and the halfblood prince harry po...
1,10000,5,1.000000,The Face of Another,2079,https://www.goodreads.com/book/show/10000.The_...,https://images.gr-assets.com/books/1320415026m...,the face of another,The Face of Another,2079,https://www.goodreads.com/book/show/10000.The_...,https://images.gr-assets.com/books/1320415026m...,the face of another
2,100006,1,0.000000,Schrodinger's Kittens and the Search for Reali...,1095,https://www.goodreads.com/book/show/100006.Sch...,https://images.gr-assets.com/books/1344269008m...,schrodingers kittens and the search for realit...,Schrodinger's Kittens and the Search for Reali...,1095,https://www.goodreads.com/book/show/100006.Sch...,https://images.gr-assets.com/books/1344269008m...,schrodingers kittens and the search for realit...
3,10000613,2,0.000000,Getting Better: Why Global Development Is Succ...,122,https://www.goodreads.com/book/show/10000613-g...,https://images.gr-assets.com/books/1328841746m...,getting better why global development is succe...,Getting Better: Why Global Development Is Succ...,122,https://www.goodreads.com/book/show/10000613-g...,https://images.gr-assets.com/books/1328841746m...,getting better why global development is succe...
4,10001693,1,0.000000,"The Hard Way (Jack Reacher, #10)",286,https://www.goodreads.com/book/show/10001693-t...,https://images.gr-assets.com/books/1328909432m...,the hard way jack reacher 10,"The Hard Way (Jack Reacher, #10)",286,https://www.goodreads.com/book/show/10001693-t...,https://images.gr-assets.com/books/1328909432m...,the hard way jack reacher 10
...,...,...,...,...,...,...,...,...,...,...,...,...,...
33440,9997,1,0.000000,The Ruined Map,1158,https://www.goodreads.com/book/show/9997.The_R...,https://images.gr-assets.com/books/1320459931m...,the ruined map,The Ruined Map,1158,https://www.goodreads.com/book/show/9997.The_R...,https://images.gr-assets.com/books/1320459931m...,the ruined map
33441,9998,7,0.714286,The Woman in the Dunes,11841,https://www.goodreads.com/book/show/9998.The_W...,https://images.gr-assets.com/books/1361254930m...,the woman in the dunes,The Woman in the Dunes,11841,https://www.goodreads.com/book/show/9998.The_W...,https://images.gr-assets.com/books/1361254930m...,the woman in the dunes
33442,999816,1,0.000000,Home to Harlem,882,https://www.goodreads.com/book/show/999816.Hom...,https://s.gr-assets.com/assets/nophoto/book/11...,home to harlem,Home to Harlem,882,https://www.goodreads.com/book/show/999816.Hom...,https://s.gr-assets.com/assets/nophoto/book/11...,home to harlem
33443,999898,1,0.000000,Twelve Pillars,901,https://www.goodreads.com/book/show/999898.Twe...,https://images.gr-assets.com/books/1360435414m...,twelve pillars,Twelve Pillars,901,https://www.goodreads.com/book/show/999898.Twe...,https://images.gr-assets.com/books/1360435414m...,twelve pillars


In [42]:
#normalized count of times the book appeared for similar users relative to others
book_recs["adjusted_count"] = book_recs["count"] * (book_recs["count"] / book_recs["ratings"])
book_recs["score"] = book_recs["mean"] * book_recs["adjusted_count"]

#remove books we've already read based on id
book_recs = book_recs[~book_recs["book_id"].isin(my_books["book_id"])]

#create mod_title (like in other notebook)
my_books["mod_title"] = (
    my_books["title"].str.replace("[^a-zA-Z0-9 ]", "", regex=True)
    .str.lower()
    .str.replace("\s+", " ", regex=True)
)

#remove books we've already read based on mod_title
book_recs = book_recs[~book_recs["mod_title"].isin(my_books["mod_title"])]

#remove books that appeared fewer than 2 times, whose mean rating > 4
book_recs = book_recs[book_recs["count"]>2]
book_recs = book_recs[book_recs["mean"] >=4]

#sort based on score
top_recs = book_recs.sort_values("score", ascending=False)

In [50]:
def make_clickable(val):
    return f'<a target="_blank" href="{val}">Goodreads</a>'

def show_image(val):
    return f'<img src="{val}" width=50></img>'

top_recs.style.format({'url': make_clickable, 'cover_image': show_image})

Unnamed: 0,book_id,count,mean,title,ratings,url,cover_image,mod_title,adjusted_count,score
6196,15241,5,4.8,"The Two Towers (The Lord of the Rings, #2)",490005,Goodreads,,the two towers the lord of the rings 2,5.1e-05,0.000245
9689,18512,5,4.8,"The Return of the King (The Lord of the Rings, #3)",473101,Goodreads,,the return of the king the lord of the rings 3,5.3e-05,0.000254
25361,59960,3,4.666667,"Batman: The Dark Knight Returns (The Dark Knight Saga, #1)",136853,Goodreads,,batman the dark knight returns the dark knight saga 1,6.6e-05,0.000307
3845,12985,3,4.666667,The Tempest,127830,Goodreads,,the tempest,7e-05,0.000329
27120,6689,3,4.333333,James and the Giant Peach,276439,Goodreads,,james and the giant peach,3.3e-05,0.000141
8817,17881,3,4.333333,Notes from Underground & The Double,4200,Goodreads,,notes from underground the double,0.002143,0.009286
10134,1885,6,4.333333,Pride and Prejudice,2078406,Goodreads,,pride and prejudice,1.7e-05,7.5e-05
27091,6678,3,4.333333,Going Solo,14432,Goodreads,,going solo,0.000624,0.002702
12133,21976060,3,4.333333,The Art of Stillness: Adventures in Going Nowhere,2902,Goodreads,,the art of stillness adventures in going nowhere,0.003101,0.013439
16840,28348,3,4.333333,Crime and Punishment,6652,Goodreads,,crime and punishment,0.001353,0.005863
