# Import Data

In [1]:
import html

import pandas as pd

# Goodreads book_ids of the books we liked (taken from search.ipynb), kept as strings.
liked_books = ['434903', '11047557', '29983711', '12073240', '11297', '12058235', '7117831', '11909375']

# Maps the first column of book_id_map.csv (csv_id) to the second (book_id).
# Both sides stay strings, exactly as they are read from the file.
csv_book_mapping = {}

# Stream the file one line at a time rather than loading it in one go.
# NOTE(review): if the file begins with a header row, that row is stored as an
# ordinary entry too -- confirm against the file.
with open('book_id_map.csv', 'r') as f:
    for line in f:
        line = line.strip()
        if not line:
            # A blank line has nothing to unpack; skip it instead of raising.
            continue
        csv_id, book_id = line.split(',')
        csv_book_mapping[csv_id] = book_id

# Users who gave at least one of our liked books a rating of 4 or higher.
overlap_users = set()

# Set copy of liked_books: the membership test below runs once per interaction row.
liked_book_ids = set(liked_books)

# Stream the interactions file; each row is unpacked into five comma-separated fields,
# of which the first (user), second (csv book id) and fourth (rating) are used.
with open("goodreads_interactions.csv", 'r') as f:
    for line in f:
        if not line.strip():
            # A blank line cannot be unpacked into five fields; skip it.
            continue
        user_id, csv_id, _, rating, _ = line.split(",")

        # Already known to overlap with us: no need to inspect more of their rows.
        if user_id in overlap_users:
            continue

        try:
            rating = int(rating)
        except ValueError:
            # Non-numeric rating (for example a column-header row): ignore the line.
            continue

        # Translate the csv-local id into the book_id used in liked_books.
        book_id = csv_book_mapping[csv_id]

        if book_id in liked_book_ids and rating >= 4:
            overlap_users.add(user_id)

In [2]:
# Every interaction logged by a user in overlap_users, stored as
# [user_id, book_id, rating]. No rating filter is applied here, so low ratings are
# kept too, and rating stays the raw string read from the file.
rec_lines = []

with open("goodreads_interactions.csv", 'r') as f:
    for line in f:
        if not line.strip():
            # A blank line cannot be unpacked into five fields; skip it.
            continue
        user_id, csv_id, _, rating, _ = line.split(",")

        if user_id in overlap_users:
            # Translate the csv-local id into the book_id before storing the row.
            book_id = csv_book_mapping[csv_id]
            rec_lines.append([user_id, book_id, rating])

# Data Exploration

In [3]:
# Sizes of the three intermediate results built so far.
print(f'Books in dataset: {len(csv_book_mapping)}')
print(f'Users with similar tastes: {len(overlap_users)}')
# rec_lines is not filtered by rating, so the label says "interactions" rather
# than claiming a rating threshold.
print(f'Interactions from those users: {len(rec_lines)}')

Books in dataset: 2360651
Users with similar tastes: 15862
Books rated > 4 from those users: 6896327


In [4]:
# Tabulate the collected rows; book_id is cast to str in the same chain.
recs = (
    pd.DataFrame(rec_lines, columns=['user_id', 'book_id', 'rating'])
    .astype({'book_id': str})
)
recs

Unnamed: 0,user_id,book_id,rating
0,32,14050,5
1,32,960,3
2,32,1618,3
3,32,77203,4
4,32,4214,4
...,...,...,...
6896322,876091,13214,0
6896323,876091,278216,4
6896324,876091,7815,0
6896325,876091,1292187,0


In [5]:
# The ten book_ids that occur most often in recs, as an array of ids.
top_recs = recs['book_id'].value_counts().head(10).index.values

# Load the title metadata; book_id is cast to str, the same type recs uses for it.
books_titles = pd.read_json('books_titles.json').astype({'book_id': str})
books_titles.head()

Unnamed: 0,book_id,title,ratings,url,cover_image,mod_title
0,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",140,https://www.goodreads.com/book/show/7327624-th...,https://images.gr-assets.com/books/1304100136m...,the unschooled wizard sun wolf and starhawk 12
1,6066819,Best Friends Forever,51184,https://www.goodreads.com/book/show/6066819-be...,https://s.gr-assets.com/assets/nophoto/book/11...,best friends forever
2,287141,The Aeneid for Boys and Girls,46,https://www.goodreads.com/book/show/287141.The...,https://s.gr-assets.com/assets/nophoto/book/11...,the aeneid for boys and girls
3,6066812,All's Fairy in Love and War (Avalon: Web of Ma...,98,https://www.goodreads.com/book/show/6066812-al...,https://images.gr-assets.com/books/1316637798m...,alls fairy in love and war avalon web of magic 8
4,287149,The Devil's Notebook,986,https://www.goodreads.com/book/show/287149.The...,https://images.gr-assets.com/books/1328768789m...,the devils notebook


In [6]:
# Metadata rows for the books whose id appears in top_recs.
books_titles.loc[books_titles['book_id'].isin(top_recs)]

Unnamed: 0,book_id,title,ratings,url,cover_image,mod_title
138759,11297,Norwegian Wood,189394,https://www.goodreads.com/book/show/11297.Norw...,https://s.gr-assets.com/assets/nophoto/book/11...,norwegian wood
284473,2767052,"The Hunger Games (The Hunger Games, #1)",4899965,https://www.goodreads.com/book/show/2767052-th...,https://images.gr-assets.com/books/1447303603m...,the hunger games the hunger games 1
401395,5107,The Catcher in the Rye,2086945,https://www.goodreads.com/book/show/5107.The_C...,https://images.gr-assets.com/books/1398034300m...,the catcher in the rye
463463,4671,The Great Gatsby,2758812,https://www.goodreads.com/book/show/4671.The_G...,https://images.gr-assets.com/books/1490528560m...,the great gatsby
615314,5470,1984,2023937,https://www.goodreads.com/book/show/5470.1984,https://images.gr-assets.com/books/1348990566m...,1984
770177,7613,Animal Farm,1928931,https://www.goodreads.com/book/show/7613.Anima...,https://images.gr-assets.com/books/1424037542m...,animal farm
790927,2657,To Kill a Mockingbird,3255518,https://www.goodreads.com/book/show/2657.To_Ki...,https://images.gr-assets.com/books/1361975680m...,to kill a mockingbird
851581,4929,Kafka on the Shore,172711,https://www.goodreads.com/book/show/4929.Kafka...,https://s.gr-assets.com/assets/nophoto/book/11...,kafka on the shore
876816,865,The Alchemist,1342863,https://www.goodreads.com/book/show/865.The_Al...,https://images.gr-assets.com/books/1483412266m...,the alchemist
878545,3,Harry Potter and the Sorcerer's Stone (Harry P...,4765497,https://www.goodreads.com/book/show/3.Harry_Po...,https://images.gr-assets.com/books/1474154022m...,harry potter and the sorcerers stone harry pot...


In [7]:
# One row per book_id with the number of times it occurs in recs (book_count),
# in the most-frequent-first order produced by value_counts.
all_recs = (
    recs['book_id']
    .value_counts()
    .rename_axis('book_id')
    .reset_index(name='book_count')
)
all_recs

Unnamed: 0,book_id,book_count
0,11297,14303
1,4671,8989
2,5470,8774
3,5107,8589
4,2657,8363
...,...,...
743176,7260482,1
743177,715815,1
743178,1961548,1
743179,434015,1


In [8]:
# Attach title metadata to the per-book counts. Only book_id and book_count are
# taken from all_recs, so re-running this cell (when all_recs already carries the
# merged columns) rebuilds the same frame instead of producing suffixed duplicate
# columns and then failing on the 'ratings' lookup below.
all_recs = all_recs[['book_id', 'book_count']].merge(books_titles, how='inner', on='book_id')

# score = book_count * (book_count / ratings): a book is penalized when its
# 'ratings' value is large relative to how often it shows up among similar users,
# which pushes down books that are simply popular with everyone.
# NOTE(review): a 'ratings' value of 0 would divide by zero here -- confirm the
# metadata never contains one.
all_recs['score'] = all_recs['book_count'] * (all_recs['book_count'] / all_recs['ratings'])
all_recs.sort_values('score', ascending=False).head(10)

Unnamed: 0,book_id,book_count,title,ratings,url,cover_image,mod_title,score
0,11297,14303,Norwegian Wood,189394,https://www.goodreads.com/book/show/11297.Norw...,https://s.gr-assets.com/assets/nophoto/book/11...,norwegian wood,1080.159926
5,4929,8096,Kafka on the Shore,172711,https://www.goodreads.com/book/show/4929.Kafka...,https://s.gr-assets.com/assets/nophoto/book/11...,kafka on the shore,379.508057
39,9557,4783,Sputnik Sweetheart,63880,https://www.goodreads.com/book/show/9557.Sputn...,https://images.gr-assets.com/books/1508082438m...,sputnik sweetheart,358.126002
12,10357575,6315,1Q84,129101,https://www.goodreads.com/book/show/10357575-1q84,https://images.gr-assets.com/books/1483103331m...,1q84,308.899428
10,11275,6469,The Wind-Up Bird Chronicle,136559,https://www.goodreads.com/book/show/11275.The_...,https://images.gr-assets.com/books/1327872639m...,the windup bird chronicle,306.446012
67,17799,3755,"South of the Border, West of the Sun",48978,https://www.goodreads.com/book/show/17799.Sout...,https://images.gr-assets.com/books/1443685506m...,south of the border west of the sun,287.884867
52,17803,4267,After Dark,65830,https://www.goodreads.com/book/show/17803.Afte...,https://s.gr-assets.com/assets/nophoto/book/11...,after dark,276.580419
79,17800,3368,"Dance Dance Dance (The Rat, #4)",42021,https://www.goodreads.com/book/show/17800.Danc...,https://images.gr-assets.com/books/1443689019m...,dance dance dance the rat 4,269.946551
348,29983711,1472,Pachinko,8161,https://www.goodreads.com/book/show/29983711-p...,https://images.gr-assets.com/books/1462393298m...,pachinko,265.504718
57,10374,4094,Hard-Boiled Wonderland and the End of the World,65374,https://www.goodreads.com/book/show/10374.Hard...,https://images.gr-assets.com/books/1399844477m...,hardboiled wonderland and the end of the world,256.383822


# Get Recommendations

In [9]:
# Keep only books with a book_count above 200, ordered best score first.
popular_recs = (
    all_recs
    .loc[all_recs['book_count'] > 200]
    .sort_values('score', ascending=False)
)

def make_clickable(val):
    """Return an HTML link labelled "Goodreads" that opens `val` in a new tab.

    Args:
        val: URL to link to; converted with str() before use.

    Returns:
        str: An ``<a>`` element whose href is `val`, HTML-escaped so that quotes,
        ampersands or angle brackets in the value cannot break the attribute.
    """
    return f'<a target="_blank" href="{html.escape(str(val), quote=True)}">Goodreads</a>'

def show_image(val):
    """Return an HTML image tag, 50 wide, that displays the image at `val`.

    Args:
        val: Image URL; converted with str() before use.

    Returns:
        str: An ``<img>`` element whose src is `val`, HTML-escaped so that quotes,
        ampersands or angle brackets in the value cannot break the attribute.
    """
    return f'<img src="{html.escape(str(val), quote=True)}" width=50></img>'

# Drop the books we already liked, keep the first ten of what remains, and render
# the url and cover_image columns through the two HTML formatters.
(
    popular_recs
    .loc[~popular_recs['book_id'].isin(liked_books)]
    .head(10)
    .style
    .format({'url': make_clickable, 'cover_image': show_image})
)

Unnamed: 0,book_id,book_count,title,ratings,url,cover_image,mod_title,score
5,4929,8096,Kafka on the Shore,172711,Goodreads,,kafka on the shore,379.508057
39,9557,4783,Sputnik Sweetheart,63880,Goodreads,,sputnik sweetheart,358.126002
12,10357575,6315,1Q84,129101,Goodreads,,1q84,308.899428
10,11275,6469,The Wind-Up Bird Chronicle,136559,Goodreads,,the windup bird chronicle,306.446012
67,17799,3755,"South of the Border, West of the Sun",48978,Goodreads,,south of the border west of the sun,287.884867
52,17803,4267,After Dark,65830,Goodreads,,after dark,276.580419
79,17800,3368,"Dance Dance Dance (The Rat, #4)",42021,Goodreads,,dance dance dance the rat 4,269.946551
57,10374,4094,Hard-Boiled Wonderland and the End of the World,65374,Goodreads,,hardboiled wonderland and the end of the world,256.383822
1214,11296,568,Haruki Murakami and the Music of Words,1265,Goodreads,,haruki murakami and the music of words,255.038735
70,11298,3672,"A Wild Sheep Chase (The Rat, #3)",56601,Goodreads,,a wild sheep chase the rat 3,238.221657
