In [1]:
from bs4 import BeautifulSoup
import requests

In [2]:
# function to extract data from a book's page on the website
def get_book_info(url):
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    data = {'title': soup.find('h1', {'class': 'book-title'}).text,
            'author': soup.find('a', {'class': 'book-meta-author-name'}).text.replace('\n', ''),
            'ISBN':  soup.find('div', {'id': 'sel-buy-box'}).find('span', {'class': 'buy-box--isbn'}).text[6:],
            'summary': soup.find('div', {'class': 'book-summary'}).text.replace('\n', ''),
            'image': soup.find('img', {'class': 'book-image'})['src']}
    return data

In [None]:
'''# get data about most popular books on the website
full_data = []
for page in range(1000):
    print(page)
    page = requests.get('https://www.leslibraires.ca/categorie/livres-anglais-ANG/?tri=plus-populaires&ic=25&p={}'.format(page))
    soup = BeautifulSoup(page.content, 'html.parser')
    for book in soup.find_all('a', {'class': 'book-title'}):
        base_url = 'https://www.leslibraires.ca'
        url = base_url + book['href']
        try:
            full_data.append(get_book_info(url))
        except Exception as e:
            print(e)
'''

In [None]:
# %store -r full_data

In [4]:
len(full_data)

10757

In [5]:
full_data[0]

{'title': 'Venus in the Blind Spot',
 'author': 'Junji Ito',
 'ISBN': '9781974715473',
 'summary': 'A "best of" collection of creepy tales from Eisner award winner and legendary horror master Junji Ito.This ultimate collection presents the most remarkable short works of Junji Ito’s career, featuring an adaptation of Rampo Edogawa’s classic horror story "Human Chair" and fan favorite "The Enigma of Amigara Fault." In a deluxe presentation with special color pages and color illustrations from his most recent long-form manga No Longer Human, every page invites readers to revel in a world of terror.',
 'image': '//images.leslibraires.ca/books/9781974715473/front/9781974715473_medium.jpg'}

In [6]:
# remove books appearing more than once
book_data_no_duplicates = []
titles = []
for book in full_data:
    if book['title'] not in titles:
        titles.append(book['title'])
        book_data_no_duplicates.append(book)
        
full_data = book_data_no_duplicates

In [7]:
len(full_data)

10101

# Encode description of books using tensorflow

In [8]:
!pip install tensorflow_hub
!pip install tensorflow-text



In [9]:
import tensorflow_hub as hub
import tensorflow_text 

C:\Users\hp\anaconda3\lib\site-packages\numpy\.libs\libopenblas.GK7GX5KEQ4F6UYO3P26ULGBQYHGQO7J4.gfortran-win_amd64.dll
C:\Users\hp\anaconda3\lib\site-packages\numpy\.libs\libopenblas.WCDJNK7YVMPZQ2ME2ZZHJJRJ3JIKNDB7.gfortran-win_amd64.dll


tensorflow_text module is used for finding the similarity matrix based on any multilingual sentences

In [10]:
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual/3")

In [11]:
# add vectors to the data
for element in full_data:
    element['vector'] = embed(element['summary'])[0]

# Make nearest neighbor models using Cosine Similarity

In [12]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [13]:
vectors = [item['vector'] for item in full_data]
X = np.array(vectors)

In [14]:
# calculate similarity based on cosine distance
cos_sim = cosine_similarity(X)
cos_indices = np.vstack([np.argsort(-arr) for arr in cos_sim])

In [15]:
# find most similar books for each case
for i, book in enumerate(full_data):
    book['cosine'] = cos_indices[i][1:21]

In [16]:
# remove vectors from dict
for book in full_data:
    book.pop('vector')

In [17]:
full_data[0]

{'title': 'Venus in the Blind Spot',
 'author': 'Junji Ito',
 'ISBN': '9781974715473',
 'summary': 'A "best of" collection of creepy tales from Eisner award winner and legendary horror master Junji Ito.This ultimate collection presents the most remarkable short works of Junji Ito’s career, featuring an adaptation of Rampo Edogawa’s classic horror story "Human Chair" and fan favorite "The Enigma of Amigara Fault." In a deluxe presentation with special color pages and color illustrations from his most recent long-form manga No Longer Human, every page invites readers to revel in a world of terror.',
 'image': '//images.leslibraires.ca/books/9781974715473/front/9781974715473_medium.jpg',
 'cosine': array([6927,  164, 7410, 7513, 7032, 6604, 6930, 1219, 7397, 9810, 2070,
        9417, 2387,  664, 3646,  399, 2008, 7044, 5765, 4995], dtype=int64)}

In [18]:
import pickle

with open('books.pkl', 'wb') as f:
    pickle.dump(full_data, f)

%store full_data