### Importing Libraries

In [1]:
import requests 
from bs4 import BeautifulSoup 
from selenium import webdriver
from PIL import Image
import os
import tempfile
from pymongo import MongoClient
import gridfs

### Creating a temporary folder in tempdir

In [3]:
"""
Creating a folder in the tempdir to temporarily store the image data scraped from the amazon.ca website.
"""
# Build the folder path with os.path.join so the separator is correct on every OS
# (on Windows this yields exactly the same '<tempdir>\amazon_scrapper' path as before).
temp_path = os.path.join(tempfile.gettempdir(), 'amazon_scrapper')
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race.
os.makedirs(temp_path, exist_ok=True)

### Creating a connection to mongodb

In [4]:
"""
Creating a connection to mongodb cluster and deleting any already existing data in the database.
"""
# The connection string (which contains the database user and password) is read from the
# MONGODB_URI environment variable so that credentials are never stored in the notebook.
# NOTE(review): the password that was previously committed here should be rotated.
CONN_STR = os.environ.get('MONGODB_URI')
if not CONN_STR:
    raise RuntimeError(
        "Set the MONGODB_URI environment variable to the mongodb connection string "
        "(e.g. 'mongodb+srv://<user>:<password>@<host>/?retryWrites=true&w=majority')."
    )
client = MongoClient(CONN_STR)
# Start from a clean slate: drop the book collection and the 'fs.chunks' / 'fs.files'
# collections of the image database so that a re-run does not accumulate duplicates.
client['smd_amazon_db'].drop_collection('smd_amazon_col')
client['smd_amazon_img_db'].drop_collection('fs.chunks')
client['smd_amazon_img_db'].drop_collection('fs.files')

{'operationTime': Timestamp(1611687272, 1),
 'ok': 0.0,
 'errmsg': 'ns not found',
 'code': 26,
 'codeName': 'NamespaceNotFound',
 '$clusterTime': {'clusterTime': Timestamp(1611687272, 1),
  'signature': {'hash': b'\xfc\xe8\x9d\xe4\x14\xda3%\xdcc\x88\xf0\x84\xdb\xdf\xe7\xc6\xf8h4',
   'keyId': 6898742217918119939}}}

In [5]:
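# Reference snippet (not executed): read a stored image back out of GridFS and write it to disk.
# `fs` must be a gridfs.GridFS instance for the 'smd_amazon_img_db' database.
# data = fs.find({"filename":"book_1.jpg"}).next().read()
# with open('book_1.jpg','wb') as op:
#     op.write(data)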

### Scraping data from amazon.ca

In [6]:
# URL template of the amazon.ca books bestsellers list; the '{}' placeholder is filled
# with the page number via URL.format(page).
URL = 'https://www.amazon.ca/Best-Sellers-Books/zgbs/books/ref=zg_bs_pg_2?_encoding=UTF8&pg={}'

In [7]:
def save_image(img_src, book_rank):
    """
    Download a book cover and store it in the mongodb database named 'smd_amazon_img_db' using GridFS.

    The image is first written to the temporary folder (temp_path) as 'book_<rank>.jpg' and the
    resulting file is then uploaded to GridFS under that same file name.

    Parameters
    ----------
    img_src : str
        URL of the image to download.
    book_rank : str or int
        Rank of the book; used to build the stored file name.

    Returns
    -------
    str
        The file name under which the image was stored, e.g. 'book_1.jpg'.

    Raises
    ------
    requests.HTTPError
        If the image request returns an error status.
    """
    # str() lets callers pass the rank as an int as well as a string
    img_name = 'book_' + str(book_rank) + '.jpg'
    # os.path.join instead of a hard-coded '\\' keeps the path valid on every OS
    img_path = os.path.join(temp_path, img_name)

    # The context manager releases the connection; the timeout prevents hanging forever.
    with requests.get(img_src, stream=True, timeout=30) as response:
        # Fail loudly on 4xx/5xx instead of handing an error page to PIL
        response.raise_for_status()
        img = Image.open(response.raw)
        content_type = img.get_format_mimetype()
        # PIL reads the pixel data lazily, so save while the response is still open
        img.save(img_path)

    fs = gridfs.GridFS(client['smd_amazon_img_db'])
    with open(img_path, 'rb') as img_data:
        fs.put(img_data, content_type=content_type, filename=img_name)
    return img_name

In [10]:
def get_book_list(page_source, book_list=None, rank=1):
    """
    Scrape one page of the amazon.ca bestselling books list with BeautifulSoup.

    Parameters
    ----------
    page_source : str
        HTML markup of the bestsellers page to parse.
    book_list : list of dict, optional
        List the scraped books are appended to. A new list is created when omitted.
    rank : int, optional
        Rank assigned to the first book found on this page (default 1).

    Returns
    -------
    tuple
        (book_list, rank): the list with the books of this page appended, and the rank
        to use for the first book of the next page.

    Raises
    ------
    IndexError
        If an expected element (ordered list, item block, title, author/format rows,
        price or image) is missing from the page.
    """
    # Create the list per call: a mutable default ([]) is shared between calls, so books
    # from earlier runs would silently leak into later results.
    if book_list is None:
        book_list = []

    # Parse the markup that was passed in rather than reaching for the global driver
    soup = BeautifulSoup(page_source, 'html.parser')
    olist = soup.find_all('ol', {'id': 'zg-ordered-list'})[0]
    list_items = olist.find_all('li', {'class': 'zg-item-immersion'})
    for item in list_items:
        book = {}
        book['rank'] = rank
        block = item.find_all('span', {'class': 'aok-inline-block zg-item'})[0]
        title = block.find_all('div', {'class': 'p13n-sc-truncate-desktop-type2 p13n-sc-truncated'})[0]
        book['title'] = title.text
        # The first 'a-row' div holds the author, the second one the format
        rows = block.find_all('div', {'class': 'a-row'})
        book['author'] = rows[0].contents[0].text
        book['format'] = rows[1].contents[0].text
        book['price'] = block.find_all('span', {'class': 'p13n-sc-price'})[0].text
        # When the item has no 'a-icon-row' block the loop body never runs and the
        # rating fields are simply left out of the document.
        icon_rows = block.find_all('div', {'class': 'a-icon-row'})
        for ir in icon_rows:
            links = ir.find_all('a')
            book['rating'] = links[0].text.strip()
            book['num_of_reviews'] = links[1].text.strip()
        images = block.find_all('img')[0]
        book['img_name'] = save_image(images.get('src'), str(book['rank']))
        book_list.append(book)
        rank += 1
    return book_list, rank

In [11]:
"""
1) Using chrome webdriver to access the amazon.ca website and scrape the data from the website's page source.
2) The get_book_list method returns the list of bestselling books which can be stored in a mongodb database.
"""
# Location of the chromedriver executable; set the CHROMEDRIVER_PATH environment variable
# to override the default instead of editing the notebook.
CHROMEDRIVER_PATH = os.environ.get('CHROMEDRIVER_PATH', 'E:\\chromedriver\\chromedriver.exe')
# Pages of the bestsellers list to scrape, in order
PAGES = (1, 2)

driver = webdriver.Chrome(CHROMEDRIVER_PATH)
# Start from an explicit empty list so that re-running this cell never reuses old results
book_list, rank = [], 1
try:
    for page in PAGES:
        driver.get(URL.format(page))
        book_list, rank = get_book_list(driver.page_source, book_list, rank)
finally:
    # Always close the browser and stop the chromedriver process, even if scraping fails
    driver.quit()

### Saving the scraped data to mongodb

In [12]:
"""
Creating the 'smd_amazon_col' collection in the 'smd_amazon_db' database and inserting the contents of the booklist into 
the 'smd_amazon_col' collection in mongodb.
"""
# Look up the target database and collection, then store every scraped book in one call.
amazon_db = client.get_database('smd_amazon_db')
amazon_col = amazon_db.get_collection('smd_amazon_col')
amazon_col.insert_many(book_list)

<pymongo.results.InsertManyResult at 0x254ee52a9c8>

### Deleting the temporary image files

In [14]:
"""
Deleting the images that are temporarily stored in the tempdir.
"""
# Collect the names of all .jpg files in the temporary folder, then remove them one by one.
filelist = list(filter(lambda name: name.endswith(".jpg"), os.listdir(temp_path)))
for f in filelist:
    jpg_path = os.path.join(temp_path, f)
    os.remove(jpg_path)