In [8]:
import gzip
import json
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import time

In [None]:
r'''
    # Only run this cell if you haven't created ./data/books_title.json.
    # To run it, delete the two triple-quote lines that wrap this cell.
    # (The wrapper is a raw string so the regex backslash below stays valid while the code is disabled.)

def parse_field(line):
    """Pick the fields this notebook uses out of one JSON line of the books dump."""
    data = json.loads(line)
    return {
        'book_id': data['book_id'],
        'title': data['title_without_series'],
        'ratings': data['ratings_count'],
        'cover_image': data['image_url']
    }

books_title = []

with gzip.open('./data/goodreads_books.json.gz', 'r') as f:
    for line in f:    # iterating the file object stops at EOF
        fields = parse_field(line)

        try:
            ratings = int(fields['ratings']) # try to convert to integer
        except ValueError:
            continue   # ratings field is not an integer; skip the record

        if ratings > 10:   # Only take books with more than 10 ratings
            books_title.append(fields)

b_titles = pd.DataFrame.from_dict(books_title)
b_titles["ratings"] = pd.to_numeric(b_titles["ratings"])

# mod_title: title with everything except letters, digits and spaces removed,
# lower-cased, and runs of whitespace collapsed to a single space.
b_titles["mod_title"] = b_titles["title"].str.replace("[^a-zA-Z0-9 ]", "", regex = True)
b_titles["mod_title"] = b_titles["mod_title"].str.lower()
b_titles["mod_title"] = b_titles["mod_title"].str.replace(r"\s+", " ", regex = True)
b_titles = b_titles[b_titles["mod_title"].str.len() > 0]   # drop titles that became empty

# Save the table; the next cell loads it with pd.read_json('./data/books_title.json').
b_titles.to_json('./data/books_title.json')

'''

In [9]:
# Load the pre-built title table and cast book_id to str.
print('Reading books_title.json ...')
books_titles = pd.read_json('./data/books_title.json').astype({'book_id': str})

# Fit a TF-IDF vectorizer on the mod_title column and report the wall-clock time taken.
print('Applying TFIDF to books_title.json ...')
start = time.time()
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(books_titles["mod_title"])
elapsed = time.time() - start
print('Vectorizing Completed in: {:.2f} sec'.format(elapsed))
print('')

def search(query, vectorizer, n_candidates=10, n_results=5):
    """Look up books whose title resembles ``query``.

    The query is lower-cased and stripped of every character that is not a
    letter, digit or space, then transformed with ``vectorizer`` and compared
    (cosine similarity) against the module-level ``tfidf`` matrix. The
    ``n_candidates`` most similar rows of the module-level ``books_titles``
    frame are ordered by their ``ratings`` column, highest first, and the top
    ``n_results`` are returned.

    Parameters
    ----------
    query : str
        Free-text title to search for.
    vectorizer :
        Object whose ``transform`` maps a list of strings into the same
        feature space as ``tfidf``.
    n_candidates : int, default 10
        How many most-similar titles to consider before ranking by ratings.
    n_results : int, default 5
        How many rows to return.

    Returns
    -------
    A pandas Styler over the selected rows, with the ``cover_image`` column
    formatted through ``show_img``.
    """
    processed = re.sub("[^a-zA-Z0-9 ]", "", query.lower())
    query_vec = vectorizer.transform([processed])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # np.argpartition raises ValueError when kth is out of bounds, so never ask
    # for more candidates than there are titles.
    k = min(n_candidates, similarity.size)
    if k > 0:
        indices = np.argpartition(similarity, -k)[-k:]
    else:
        indices = np.array([], dtype=int)

    results = books_titles.iloc[indices]   # the titles most similar to the search query
    results = results.sort_values('ratings', ascending = False)
    return results.head(n_results).style.format({'cover_image':show_img})

def show_img(x):
    """Build the HTML ``<img>`` tag (width 50) whose ``src`` is ``x``."""
    img_tag = '<img src="{}" width=50></img>'
    return img_tag.format(x)

Reading books_title.json ...
Applying TFIDF to books_title.json ...
Vectorizing Completed in: 11.25 sec



In [2]:
'''
Need to know book_id to get your collection of liked books.
You can run this cell as many times as you like and note the corresponding book_id
'''

# Enter Search Query
# input() blocks until a title is typed; the text is handed to search() unchanged.
query = input('Enter the name of a Book you like:')
# Last expression of the cell, so the value returned by search() is what the notebook renders.
search(query, vectorizer)

Unnamed: 0,book_id,title,ratings,cover_image,mod_title
498918,28587609,"Pull Me Close (The Panic, #1)",1057,,pull me close the panic 1
498916,12373920,Small Things with Great Love: Adventures in Loving Your Neighbor,162,,small things with great love adventures in loving your neighbor
498919,652851,"So This Is Christmas (Includes: The Protectors, #19)",122,,so this is christmas includes the protectors 19
498921,652855,Eddie and the Cruisers,117,,eddie and the cruisers
1496811,22017381,"101 Nights: Volume One (101 Nights, #1-3)",70,,101 nights volume one 101 nights 13


In [3]:
''' 
    After searching for book_id, make a list of them.
    e.g. liked_books = ["3869", "11047557", "325160", "17984418", "464164"]
    
'''
# IDs are written as strings: later cells test `book_id in liked_books` against the
# string values read from book_id_map.csv, and call .isin(liked_books) on a str-typed column.
liked_books = ["3869", "11047557", "325160", "17984418", "464164"]

In [4]:
'''
Because goodreads_interactions.csv and goodreads_books.json.gz (in this case books_title.json) 
have different bookID; we'll need to map them using book_id_map.csv
'''
# Each row of the map file is "<csv_id>,<book_id>"; both halves are kept as strings.
csv_book_mapping = {}
with open('./data/book_id_map.csv', 'r') as f:
    for line in f:
        csv_id, book_id = line.strip().split(",")
        csv_book_mapping[csv_id] = book_id

print('Number of keys: {}'.format(len(csv_book_mapping)))

Number of keys: 2360651


In [5]:
print ('Getting a set of unique users with similar taste as yours...')
print ('This process takes a while')
print ('===========================')

# Both passes below read the same interactions file.
INTERACTIONS_PATH = "./data/goodreads_interactions.csv"
# Set membership instead of scanning the liked_books list on every row.
liked_book_ids = set(liked_books)

# Pass 1: users who gave at least one of our liked books a rating of 4 or more.
start = time.time()
unique_users = set()
with open(INTERACTIONS_PATH, 'r') as f:
    for line in f:
        # Row layout used here: user id, csv book id, <unused>, rating, <unused>.
        user_id, csv_id, _, rating, _ = line.split(",")

        if user_id in unique_users:
            continue   # already selected; no need to inspect more of their rows
        try:
            rating = int(rating)
        except ValueError:
            continue   # rating field is not an integer; skip the row

        book_id = csv_book_mapping[csv_id]

        if book_id in liked_book_ids and rating >= 4:
            unique_users.add(user_id)

print ('')
print ('Finished getting unique users in: {:.2f} min'.format((time.time()-start)/60.0))

# Pass 2: every interaction row belonging to those users, whatever the rating.
# These are the candidate books for recommendation.
start1 = time.time()
print ('Finding books read by those users...')
rec_lines = []   # [user_id, book_id, rating] with rating kept as the raw string from the file
with open(INTERACTIONS_PATH, 'r') as f:
    for line in f:
        user_id, csv_id, _, rating, _ = line.split(",")

        if user_id in unique_users:
            book_id = csv_book_mapping[csv_id]
            rec_lines.append([user_id, book_id, rating])

print ('Finished getting books read by these unique users in {:.2f} min'.format((time.time()-start1)/60.0))
print ('===========================')
print ('\nTotal users who read the same book as you and rated 4 star or above: {}'.format(len(unique_users)))
# rec_lines is not filtered by rating, so report it as interactions rather than "rated 4 star or more".
print ('Found {} book interactions (any rating) recorded for these users'.format(len(rec_lines)))

Getting a set of unique users with similar taste as yours...
This process takes a while

Finished getting unique users in: 4.73 min
Finding books read by those users...
Finished getting books read by these unique users in 2.12 min

Total users who read the same book as you and rated 4 star or above: 13159
Found 6927892 books these unique users rated 4 star or more


In [6]:
# Create recommendations
recs = pd.DataFrame(rec_lines, columns = ['user_id', 'book_id', 'rating'])
recs['book_id'] = recs['book_id'].astype(str)

# How many times each book appears in the similar users' interactions.
# Computed once and reused for both the top-10 list and the scored table.
book_counts = recs['book_id'].value_counts()

# Get top 10 recommendations out of all the recommendations above
top_recs = book_counts.head(10).index.values

# Initial recommendation: title rows for the 10 most frequent books.
# Stored in a variable because a bare expression in the middle of a cell is never rendered.
initial_recs = books_titles[books_titles['book_id'].isin(top_recs)]

all_recs = book_counts.to_frame().reset_index()
all_recs.columns = ['book_id', 'book_count']
# Inner merge: books without a row in books_titles are dropped.
all_recs = all_recs.merge(books_titles, how = 'inner', on = 'book_id')

# Add score column to the data frame (high score == more likely that you'll read)
#   book_count: number of interaction rows for the book among the similar users
#   (rec_lines was not filtered by rating, so this counts reads, not likes).
#   book_count / ratings: a book that is frequent in our set but has few ratings
#   overall gets a higher score.
all_recs['score'] = all_recs['book_count'] * (all_recs['book_count'] / all_recs['ratings'])
# Ten highest-scoring books with no popularity floor; stored for inspection.
top_scored = all_recs.sort_values('score', ascending = False).head(10)

# The highest scores can come from books with a very small book_count, so
# require a minimum number of appearances before ranking by score.
MIN_BOOK_COUNT = 100
popular_recommendations = all_recs[all_recs['book_count'] > MIN_BOOK_COUNT].sort_values('score', ascending = False)

In [7]:
# Show Recommendations: top 10 by score, leaving out books already in liked_books.
(
    popular_recommendations
    .loc[~popular_recommendations['book_id'].isin(liked_books)]
    .head(10)
    .style
    .format({'cover_image': show_img})
)

Unnamed: 0,book_id,book_count,title,ratings,cover_image,mod_title,score
3840,24909347,195,"Obsidio (The Illuminae Files, #3)",82,,obsidio the illuminae files 3,463.719512
74,224912,2646,"Harry Potter and the Chamber of Secrets (Harry Potter, #2)",22635,,harry potter and the chamber of secrets harry potter 2,309.313718
8169,26856502,104,"Vengeful (Villains, #2)",35,,vengeful villains 2,309.028571
6509,29749098,125,"Catwoman: Soulstealer (DC Icons, #3)",73,,catwoman soulstealer dc icons 3,214.041096
143,93124,1951,"Harry Potter and the Half-Blood Prince (Harry Potter, #6)",21235,,harry potter and the halfblood prince harry potter 6,179.251283
105,72193,2345,"Harry Potter and the Philosopher's Stone (Harry Potter, #1)",31614,,harry potter and the philosophers stone harry potter 1,173.942715
288,864890,1345,"Harry Potter and the Goblet of Fire (Harry Potter, #4)",11031,,harry potter and the goblet of fire harry potter 4,163.994651
225,43509,1576,"Harry Potter and the Goblet of Fire (Harry Potter, #4)",16347,,harry potter and the goblet of fire harry potter 4,151.940784
1131,21032488,530,"Doors of Stone (The Kingkiller Chronicle, #3)",2059,,doors of stone the kingkiller chronicle 3,136.425449
7549,28170940,112,"Lethal White (Cormoran Strike, #4)",106,,lethal white cormoran strike 4,118.339623
