In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import pandas as pd
import pickle

In [None]:
# Accumulator for the scraped book records; the scraping loop appends one
# dict per successfully parsed book page.
full_data = []

In [None]:
# function to extract data from a book's page on the website
def get_book_info(url, timeout=30):
    """Scrape title, author, summary and cover image URL from a book page.

    Parameters
    ----------
    url : str
        Absolute URL of the book's page.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout a stalled connection would block the whole scrape.

    Returns
    -------
    dict
        Keys 'title', 'author', 'summary' and 'image'.

    Raises
    ------
    requests.RequestException
        On connection problems, timeouts or a non-2xx HTTP status.
    AttributeError, TypeError, KeyError
        If one of the expected elements (or the image 'src' attribute) is
        missing from the page.
    """
    page = requests.get(url, timeout=timeout)
    # Fail loudly on error responses instead of parsing an error page.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')

    def is_summary_id(element_id):
        # Match ids starting with 'freeText' but not the surrounding
        # 'freeTextContainer...' wrapper element.
        return (bool(element_id)
                and element_id.startswith('freeText')
                and not element_id.startswith('freeTextContainer'))

    title_tag = soup.find('h1', {'class': 'gr-h1 gr-h1--serif'})
    author_tag = soup.find('a', {'class': 'authorName'})
    summary_tag = soup.find('span', {'id': is_summary_id})
    image_tag = soup.find('img', {'id': 'coverImage'})

    data = {'title': title_tag.text.replace('\n', '').strip(),
            'author': author_tag.text.replace('\n', ' ').strip(),
            'summary': summary_tag.text.replace('\n', ' '),
            'image': image_tag['src']
            }
    return data

In [None]:
# get books from goodreads' best book ever list
base_url = 'https://www.goodreads.com'
list_url = base_url + '/list/show/1.Best_Books_Ever?page={}'
failed_urls = []  # URLs that could not be fetched or parsed, kept for inspection

for page_number in range(1, 101):
    print("Processing page ", page_number)
    # Keep the response in its own name instead of overwriting the loop counter.
    try:
        response = requests.get(list_url.format(page_number), timeout=30)
        response.raise_for_status()
    except requests.RequestException as e:
        # One unreachable list page should not abort the whole scrape.
        print("Skipping list page", page_number, "-", repr(e))
        failed_urls.append(list_url.format(page_number))
        continue

    soup = BeautifulSoup(response.content, 'html.parser')
    for book in soup.find_all('a', {'class': 'bookTitle'}):
        # urljoin copes with both relative and absolute hrefs.
        url = urljoin(base_url, book['href'])
        try:
            full_data.append(get_book_info(url))
        except Exception as e:
            # Best effort: keep going, but record what was skipped and why
            # instead of silently discarding the error.
            print("Skipping book", url, "-", repr(e))
            failed_urls.append(url)

print("Done")
print("Books collected:", len(full_data), "| failures:", len(failed_urls))

Processing page  41
Processing page  42
Processing page  43
Processing page  44
Processing page  45
Processing page  46
Processing page  47
Processing page  48
Processing page  49
Processing page  50
Processing page  51
Processing page  52
Processing page  53
Processing page  54
Processing page  55
Processing page  56
Processing page  57
Processing page  58
Processing page  59
Processing page  60


In [None]:
# Persist the scraped records so the slow scraping step need not be repeated.
with open('raw_full.pkl', 'wb') as raw_file:
    pickle.dump(full_data, raw_file)

In [None]:
# Show the second scraped record to check the extracted fields.
full_data[1]

{'title': 'To Kill a Mockingbird',
 'author': 'Harper Lee',
 'summary': 'The unforgettable novel of a childhood in a sleepy Southern town and the crisis of conscience that rocked it. "To Kill A Mockingbird" became both an instant bestseller and a critical success when it was first published in 1960. It went on to win the Pulitzer Prize in 1961 and was later made into an Academy Award-winning film, also a classic.Compassionate, dramatic, and deeply moving, "To Kill A Mockingbird" takes readers to the roots of human behavior - to innocence and experience, kindness and cruelty, love and hatred, humor and pathos. Now with over 18 million copies in print and translated into forty languages, this regional story by a young Alabama woman claims universal appeal. Harper Lee always considered her book to be a simple love story. Today it is regarded as a masterpiece of American literature.',
 'image': 'https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1553383690l/2657.jpg'}

In [None]:
# Load raw data
# The inspection and cleaning cells that follow read `raw_full`, so the
# unpickled list has to be bound to that name (binding it to `clean_full`
# leaves `raw_full` undefined on a fresh kernel).
# NOTE: pickle.load can execute arbitrary code; only load files written by
# this notebook.
with open('raw_full.pkl', 'rb') as f:
    raw_full = pickle.load(f)

In [None]:
# Show the last record of the raw data.
raw_full[-1]

{'title': 'Far from the Tree',
 'author': 'Robin Benway',
 'summary': 'Being the middle child has its ups and downs.But for Grace, an only child who was adopted at birth, discovering that she is a middle child is a different ride altogether. After putting her own baby up for adoption, she goes looking for her biological family, including—Maya, her loudmouthed younger bio sister, who has a lot to say about their newfound family ties. Having grown up the snarky brunette in a house full of chipper redheads, she’s quick to search for traces of herself among these not-quite-strangers. And when her adopted family’s long-buried problems begin to explode to the surface, Maya can’t help but wonder where exactly it is that she belongs.And Joaquin, their stoic older bio brother, who has no interest in bonding over their shared biological mother. After seventeen years in the foster care system, he’s learned that there are no heroes, and secrets and fears are best kept close to the vest, where they

## Data cleaning

In [None]:
# NOTE(review): within the visible cells this empty list is never appended to;
# the cleaning steps edit raw_full in place and clean_full is re-bound when
# clean_full.pkl is loaded later. Confirm whether this cell is still needed.
clean_full = []

In [None]:
# Remove librarian notes
# A summary that mentions a librarian note loses everything up to and
# including its first '.'; what remains is stripped of outer whitespace.
librarian_markers = ('Librarian\'s note:', 'Librarian note:')
for book in raw_full:
    summary = book['summary']
    if any(marker in summary for marker in librarian_markers):
        # partition('.')[2] is the text after the first '.', or '' if none.
        book['summary'] = summary.partition('.')[2].strip()

In [None]:
# Remove (back cover)
# A summary that contains '(back cover' loses everything from its last '.'
# onwards; what remains is stripped of outer whitespace.
for book in raw_full:
    summary = book['summary']
    if '(back cover' not in summary:
        continue
    # rpartition('.')[0] is the text before the last '.', or '' if none.
    book['summary'] = summary.rpartition('.')[0].strip()

In [None]:
# Show one record after the cleaning steps.
raw_full[8]

{'title': 'The Perks of Being a Wallflower',
 'author': 'Stephen Chbosky',
 'summary': "standing on the fringes of life...offers a unique perspective. But there comes a time to seewhat it looks like from the dance floor.This haunting novel about the dilemma of passivity vs. passion marks the stunning debut of a provocative new voice in contemporary fiction: The Perks of Being A WALLFLOWERThis is the story of what it's like to grow up in high school. More intimate than a diary, Charlie's letters are singular and unique, hilarious and devastating. We may not know where he lives. We may not know to whom he is writing. All we know is the world he shares. Caught between trying to live his life and trying to run from it puts him on a strange course through uncharted territory. The world of first dates and mixed tapes, family dramas and new friends. The world of sex, drugs, and The Rocky Horror Picture Show, when all one requires is that the perfect song on that perfect drive to feel infinite

In [None]:
# Save cleaned data
import pickle

# The summaries in raw_full were cleaned in place, so that same list is what
# gets written out as the cleaned dataset.
with open('clean_full.pkl', 'wb') as cleaned_file:
    pickle.dump(raw_full, cleaned_file)

## Encode book descriptions with TensorFlow Hub

In [None]:
# Reload the cleaned records from disk and display how many there are.
with open('clean_full.pkl', 'rb') as cleaned_file:
    clean_full = pickle.load(cleaned_file)
len(clean_full)

4989

In [None]:
import tensorflow_hub as hub
import tensorflow_text

In [None]:
# Load the multilingual Universal Sentence Encoder (version 3) from
# TensorFlow Hub; the returned object is later called directly on each summary.
embedding_model = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual/3")

In [None]:
# add vectors to the data
# Each summary is encoded on its own; the model output is indexed with [0]
# to keep the single embedding that belongs to that summary.
for book in clean_full:
    encoded = embedding_model(book['summary'])
    book['vector'] = encoded[0]

In [None]:
# Checkpoint the records together with their embeddings.
with open('clean_full_embedded.pkl', 'wb') as embedded_file:
    pickle.dump(clean_full, embedded_file)

## Nearest neighbors with Cosine similarity

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Collect every book's embedding, in dataset order, and stack them into one
# NumPy array for the similarity computation.
vectors = []
for item in clean_full:
    vectors.append(item['vector'])
X = np.array(vectors)

In [None]:
# calculate pairwise cosine similarity between all books
cos_sim = cosine_similarity(X)
# Row i lists all book indices ordered from most to least similar to book i
# (sorting the negated similarities ascending gives a descending ranking).
cos_indices = np.argsort(-cos_sim, axis=1)

In [None]:
# find most similar books for each case
# Number of recommendations stored per book.
n_neighbours = 20
for i, book in enumerate(clean_full):
    ranked = cos_indices[i]
    # Drop the book itself by index rather than assuming it is ranked first:
    # with duplicate or near-identical summaries another book can tie with it
    # (or edge ahead through rounding), and a plain [1:21] slice would then
    # recommend the book to itself while discarding a genuine neighbour.
    book['cosine'] = ranked[ranked != i][:n_neighbours]

In [None]:
# remove vectors from dict
# The embeddings are no longer needed once the neighbours are stored. The
# default passed to pop() keeps this cell safe to re-run: a second run finds
# no 'vector' key and would otherwise raise KeyError.
for book in clean_full:
    book.pop('vector', None)

In [None]:
# Show one record with its stored nearest-neighbour indices.
clean_full[29]

{'title': 'Harry Potter and the Prisoner of Azkaban',
 'author': 'J.K. Rowling',
 'summary': "Harry Potter, along with his best friends, Ron and Hermione, is about to start his third year at Hogwarts School of Witchcraft and Wizardry. Harry can't wait to get back to school after the summer holidays. (Who wouldn't if they lived with the horrible Dursleys?) But when Harry gets to Hogwarts, the atmosphere is tense. There's an escaped mass murderer on the loose, and the sinister prison guards of Azkaban have been called in to guard the school...",
 'image': 'https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1630547330l/5._SY475_.jpg',
 'cosine': array([  50,   44, 3320, 3336,  828, 4564, 4072, 1200, 2491, 1453, 2322,
        1220, 3155, 4183,   69, 2716, 4780, 4239, 4601, 3897])}

In [None]:
# save the data
import pickle

# Final artefact: every record carries the indices of its most similar books
# under the 'cosine' key.
with open('clean_full_cosine.pkl', 'wb') as output_file:
    pickle.dump(clean_full, output_file)

In [None]:
# Get recommendation
# Print the titles of the books stored as nearest neighbours of book 29.
query_book = clean_full[29]
for neighbour_index in query_book['cosine']:
    neighbour = clean_full[neighbour_index]
    print(neighbour['title'])

Harry Potter and the Chamber of Secrets
Harry Potter and the Half-Blood Prince
Harry Potter Boxed Set, Books 1-5 (Harry Potter, #1-5)
James Potter and the Hall of Elders' Crossing
The Harry Potter Collection 1-4
Harry Potter: Film Wizardry
The Last Coyote
Dead Beat
Small Favor
Fantastic Beasts and Where to Find Them: The Original Screenplay
White Night
Summer Knight
So You Want to Be a Wizard
Phantom
The Battle of the Labyrinth
The Leopard
The Soulforge
DragonQuest
The Heaven Tree Trilogy
The Wiccan Diaries
