# Book Search

In this notebook, we import our [data from Goodreads](https://sites.google.com/eng.ucsd.edu/ucsdbookgraph/home) and stream it, line by line, extracting just the columns we're interested in. We then process the titles of all the books in our dataset and build a matrix of [TF-IDF features](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html). With this matrix, we can query our dataset for a given book and see the 10 closest title matches, ordered by number of ratings. Finally, we manually add the `book_id` of our queried book to our list, `liked_books`, which we will then use in the Recommendation notebook.

# Import Libraries & Tools

In [1]:
import pandas as pd
import numpy as np
import gzip
import json
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# peek at the first record of the gzipped .json file (read as raw bytes)
with gzip.open('goodreads_books.json.gz', mode='rb') as books_file:
    line = books_file.readline()

# decode that record to show which fields each book entry carries
json.loads(line)

{'isbn': '0312853122',
 'text_reviews_count': '1',
 'series': [],
 'country_code': 'US',
 'language_code': '',
 'popular_shelves': [{'count': '3', 'name': 'to-read'},
  {'count': '1', 'name': 'p'},
  {'count': '1', 'name': 'collection'},
  {'count': '1', 'name': 'w-c-fields'},
  {'count': '1', 'name': 'biography'}],
 'asin': '',
 'is_ebook': 'false',
 'average_rating': '4.00',
 'kindle_asin': '',
 'similar_books': [],
 'description': '',
 'format': 'Paperback',
 'link': 'https://www.goodreads.com/book/show/5333265-w-c-fields',
 'authors': [{'author_id': '604031', 'role': ''}],
 'publisher': "St. Martin's Press",
 'num_pages': '256',
 'publication_day': '1',
 'isbn13': '9780312853129',
 'publication_month': '9',
 'edition_information': '',
 'publication_year': '1984',
 'url': 'https://www.goodreads.com/book/show/5333265-w-c-fields',
 'image_url': 'https://images.gr-assets.com/books/1310220028m/5333265.jpg',
 'book_id': '5333265',
 'ratings_count': '3',
 'work_id': '5400751',
 'title': '

In [3]:
#function to grab columns of interest to us
def parse_fields(line):
    """Decode one JSON record and keep only the fields this notebook uses.

    Parameters
    ----------
    line : str or bytes
        A single JSON object, as read from one line of the books file.

    Returns
    -------
    dict
        Keys ``book_id``, ``title``, ``ratings``, ``url`` and ``cover_image``,
        holding the record's values unchanged.

    Raises
    ------
    KeyError
        If the record lacks one of the source fields listed below.
    """
    record = json.loads(line)
    # (output key, key in the source record)
    wanted = (
        ('book_id', 'book_id'),
        ('title', 'title_without_series'),
        ('ratings', 'ratings_count'),
        ('url', 'url'),
        ('cover_image', 'image_url'),
    )
    return {out_key: record[src_key] for out_key, src_key in wanted}

#streams every line of .json file and appends to list if the book has > MIN_RATINGS ratings
MIN_RATINGS = 15  #books with this many ratings or fewer are skipped

books_titles = []
with gzip.open("goodreads_books.json.gz") as f:
    #iterating the file object yields one line at a time, so the file is never fully loaded
    for line in f:
        fields = parse_fields(line)

        try:
            ratings = int(fields["ratings"])
        except ValueError:
            #ratings value is not an integer string; skip the record
            continue

        if ratings > MIN_RATINGS:
            books_titles.append(fields)

#minor data cleaning: numeric ratings plus a normalized title (mod_title) used for matching
titles = pd.DataFrame(books_titles)
titles['ratings'] = pd.to_numeric(titles['ratings'])
titles['mod_title'] = (
    titles['title'].str.replace(r'[^a-zA-Z0-9 ]', '', regex=True)
    .str.lower()
    #raw string: '\s' inside a plain string literal is an invalid escape sequence
    .str.replace(r'\s+', ' ', regex=True)
    #trim so a title made only of punctuation and spaces becomes '' and is dropped below
    .str.strip()
)

#removes any entries whose mod_titles are blank
titles = titles[titles['mod_title'].str.len() > 0]
titles.to_json('books_titles.json')
titles.head()

Unnamed: 0,book_id,title,ratings,url,cover_image,mod_title
0,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",140,https://www.goodreads.com/book/show/7327624-th...,https://images.gr-assets.com/books/1304100136m...,the unschooled wizard sun wolf and starhawk 12
1,6066819,Best Friends Forever,51184,https://www.goodreads.com/book/show/6066819-be...,https://s.gr-assets.com/assets/nophoto/book/11...,best friends forever
2,287141,The Aeneid for Boys and Girls,46,https://www.goodreads.com/book/show/287141.The...,https://s.gr-assets.com/assets/nophoto/book/11...,the aeneid for boys and girls
3,6066812,All's Fairy in Love and War (Avalon: Web of Ma...,98,https://www.goodreads.com/book/show/6066812-al...,https://images.gr-assets.com/books/1316637798m...,alls fairy in love and war avalon web of magic 8
4,287149,The Devil's Notebook,986,https://www.goodreads.com/book/show/287149.The...,https://images.gr-assets.com/books/1328768789m...,the devils notebook


# Functions

In [4]:
#fit a TF-IDF vectorizer on the cleaned titles; tfidf holds one row per row of titles
title_corpus = titles['mod_title']
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(title_corpus)

#function to return query results
def search(query, vectorizer, n_results=10):
    """Return the books whose titles best match ``query``, styled for display.

    The query is normalized like the titles (lowercased, non-alphanumeric
    characters removed), vectorized, and compared by cosine similarity against
    the module-level ``tfidf`` matrix. The ``n_results`` most similar rows of
    the module-level ``titles`` frame are returned, ordered by ``ratings``
    (highest first) rather than by similarity.

    Parameters
    ----------
    query : str
        Free-text title to look for.
    vectorizer : fitted vectorizer
        The vectorizer that produced ``tfidf``; used to transform the query.
    n_results : int, default 10
        Maximum number of matches to return. Clamped to the number of rows in
        ``tfidf``; values <= 0 give an empty result.

    Returns
    -------
    pandas Styler
        The matching rows with ``url`` rendered through ``make_clickable`` and
        ``cover_image`` through ``show_image``.
    """
    processed = re.sub('[^a-zA-Z0-9 ]', '', query.lower())
    query_vec = vectorizer.transform([processed])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    #argpartition raises when kth is out of bounds, so never ask for more rows than exist
    top_k = min(n_results, similarity.size)
    if top_k > 0:
        indices = np.argpartition(similarity, -top_k)[-top_k:]
    else:
        #a slice of [-0:] would select everything, so handle the empty case explicitly
        indices = np.array([], dtype=int)

    #positional lookup: row i of tfidf was built from positional row i of titles
    results = titles.iloc[indices].sort_values('ratings', ascending=False)
    return results.style.format({'url': make_clickable, 'cover_image': show_image})

#turns URL into clickable link
def make_clickable(val):
    """Return an HTML anchor labelled "Goodreads" that opens ``val`` in a new tab."""
    anchor_template = '<a target="_blank" href="{}">Goodreads</a>'
    return anchor_template.format(val)

#shows image preview of book cover
def show_image(val):
    """Return an HTML ``img`` tag (width 50) whose source is ``val``."""
    image_template = '<img src="{}" width=50></img>'
    return image_template.format(val)

# Query

In [5]:
#look up a title, then copy the book_id of the right match into liked_books below
search(query='The Rings of Saturn', vectorizer=vectorizer)

Unnamed: 0,book_id,title,ratings,url,cover_image,mod_title
1104962,434903,The Rings of Saturn,5829,Goodreads,,the rings of saturn
264364,25815,The Rings of Saturn,320,Goodreads,,the rings of saturn
723595,859890,The Rings of Saturn,76,Goodreads,,the rings of saturn
278561,29082615,The Rings of Saturn,53,Goodreads,,the rings of saturn
789424,1402760,"The Rings of Saturn (Time Machine, #6)",49,Goodreads,,the rings of saturn time machine 6
603092,11476080,The Rings of Saturn,49,Goodreads,,the rings of saturn
261066,1075043,The Rings of Saturn,48,Goodreads,,the rings of saturn
160797,204772,Saturn,40,Goodreads,,saturn
252421,1402759,The Rings Of Saturn,21,Goodreads,,the rings of saturn
752970,2620207,Saturn,16,Goodreads,,saturn


In [6]:
#(book_id, rating) pairs to use in the recommendations notebook; ratings were assigned manually
#NOTE(review): '12073240' appears twice (both rated 5) -- confirm whether one entry was meant to be a different book
rated_books = [
    ('434903', 5), ('11047557', 5), ('29983711', 5), ('12073240', 5), ('11297', 5),
    ('12058235', 5), ('7117831', 5), ('11909375', 4), ('12073240', 5), ('102927', 4),
    ('12044809', 4), ('108218', 4), ('97411', 4), ('11617647', 4), ('975562', 3),
    ('1271159', 4), ('14', 3), ('1168191', 4), ('27209485', 5), ('4929', 4), ('1334340', 5),
    ('10799', 4), ('59950', 3), ('11275', 5), ('7718', 4),
]

#parallel lists derived from the pairs, so ids and ratings always line up
liked_books = [book_id for book_id, rating in rated_books]
book_ratings = [rating for book_id, rating in rated_books]

#function to grab columns of interest to us
def get_fields(bid):
    """Build the liked-book record for one ``book_id``.

    Looks ``bid`` up in the module-level ``titles`` frame (a single scan) and
    reads the ``book_id`` and ``title`` columns by name, so the result does not
    depend on column order.

    Parameters
    ----------
    bid : same type as the values in ``titles['book_id']``
        Identifier of the book to look up.

    Returns
    -------
    dict
        ``user_id`` (always -1), ``book_id``, ``rating`` (placeholder 0) and
        ``title`` of the first matching row.

    Raises
    ------
    IndexError
        If no row of ``titles`` has this ``book_id``. The exception type is
        kept from the original positional lookup; only the message is new.
    """
    matches = titles.loc[titles['book_id'] == bid]
    if matches.empty:
        raise IndexError(f"book_id {bid!r} was not found in titles")
    return {
        'user_id': -1, #-1 is me
        'book_id': matches['book_id'].iloc[0],
        'rating': 0, #rate every book 0 for now
        'title': matches['title'].iloc[0]
    }

#build one record per liked book (each carries a placeholder rating)
my_books = [get_fields(book_id) for book_id in liked_books]

#tabulate the records, swap in the real ratings, then write the .csv
my_liked_books = pd.DataFrame(my_books)
my_liked_books['rating'] = book_ratings
my_liked_books.to_csv('liked_books.csv', index=False)