# What we will get?

**Get HTML of a book page**

In [1]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

url = "https://www.bookdepository.com/Machine-Learning-Kevin-P-Murphy/9780262018029?ref=grid-view&qid=1602139258108&sr=1-1"
html = urlopen(url)
bs = BeautifulSoup(html.read(), "lxml")

**Name of the book**

In [2]:
bs.find("h1", {"itemprop":"name"}).get_text()

'Machine Learning : A Probabilistic Perspective'

**authors**

In [3]:
authors = bs.find_all("span", {"itemprop":"author"})
for author in authors:
    print(author.get_text().strip())

Kevin P. Murphy


**Image of the book's cover**

In [4]:
from IPython.display import Image

item_image = bs.find("div", {"class":"item-img"})
img_url = item_image.find("img")["src"]
Image(url=img_url) 

**rating**

In [5]:
bs.find("span", {"itemprop":"ratingValue"}).get_text().strip()

'4.35'

**Number of pages**

In [6]:
bs.find("span", {"itemprop":'numberOfPages'})

<span itemprop="numberOfPages">1104 pages
</span>

**Other info: format, dimension, publisher....etc.** 

In [7]:
biblio_wrap = bs.find("div", {"class":'biblio-wrap'})
d = {}
for item in biblio_wrap.find_all("li"):
    label = item.label.get_text()
    info = item.span.get_text().strip().replace("\n", "")
    d[label] = info
    print(label + ": " + info)

Format: Hardback                                    | 1104 pages
Dimensions: 203                                    x 229                                    x 41mm                                                                    | 1,905g
Publication date: 18 Oct 2016
Publisher: MIT Press Ltd
Imprint: MIT Press
Publication City/Country: Cambridge, United States
Language: English
Illustrations note: 300 color illus., 165 b&w illus.; 465 Illustrations, unspecified
ISBN10: 0262018020
ISBN13: 9780262018029
Bestsellers rank: 41,210


In [8]:
bibli_info = ["Format", "Publication date", "Publisher", "Publication City/Country",
              "Dimensions", "Language", "ISBN10", "ISBN13"]
for i in bibli_info:
    print(d[i].strip().replace("\n", ""))

Hardback                                    | 1104 pages
18 Oct 2016
MIT Press Ltd
Cambridge, United States
203                                    x 229                                    x 41mm                                                                    | 1,905g
English
0262018020
9780262018029


**Price**

In [9]:
bs.find("span", {"class":"sale-price"})

# Wrap in a function

In [10]:
from urllib.request import urlopen
from urllib.error import HTTPError, URLError
from bs4 import BeautifulSoup
    

def get_book_info(url):
    
    # get the html page
    try:
        html = urlopen(url)
    except HTTPError as e:
        return None
    except URLError as e:
        return None
    else:
        bs = BeautifulSoup(html.read(), "lxml")
    
    info_storage = {}
    
    # get book's name
    try:
        name = bs.find("h1", {"itemprop":"name"}).get_text()
    except:
        name = None
    info_storage["Name"] = name
    
    # get authors
    try:
        authors = bs.find_all("span", {"itemprop":"author"})
    except:
        author_list = None
    else:
        author_list = []
        for author in authors:
            try:
                a = author.get_text()
            except:
                continue
            else:
                if a:
                    author_list.append(a.strip())
                else:
                    author_list.append(a)
    info_storage["Authors"] = ", ".join(author_list)
    
    # get price
    try:
        price = bs.find("span", {"class":"sale-price"})
    except:
        price = None
    else:
        try:
            price = price.get_text()
        except:
            price = None
    info_storage["Price"] = price
        
    
    # get cover image url
    try:
        item_image = bs.find("div", {"class":"item-img"})
        img_url = item_image.find("img")
    except:
        img_url = None
    else:
        try:
            img_url = img_url["src"]
        except:
            img_url = None
    info_storage["Image-url"] = img_url
    
    # get rating
    try:
        rating = bs.find("span", {"itemprop":"ratingValue"})
    except:
        rating = None
    else:
        try:
            rating = rating.get_text()
        except:
            rating = None

        if rating:
            rating = rating.strip()
    info_storage["Rating"] = rating
    
    # get number of pages
    try:
        pages = bs.find("span", {"itemprop":'numberOfPages'})
    except:
        pages = None
    else:
        try:
            pages = pages.get_text()
        except:
            pages = None
        if pages:
            pages = pages.replace("\n", "")
    info_storage["NumberOfPages"] = pages
    
    # others
    bibli_info = ["Format", "Publication date", "Publisher", "Publication City/Country",
                  "Dimensions", "Language", "ISBN10", "ISBN13"]
    try:
        biblio_wrap = bs.find("div", {"class":'biblio-wrap'}).find_all('li')
    except:
        biblio_wrap = None
    else:
        labels = {}
        for tag in biblio_wrap:
            try:
                label = tag.label.get_text()
                info = tag.span.get_text()
            except:
                continue
            else:
                labels[label] = info
        
        for item in bibli_info:
            if item in labels:
                info_storage[item] = labels[item]
            else:
                info_storage[item] = None
            
    
    return info_storage

In [11]:
url = "https://www.bookdepository.com/Machine-Learning-Kevin-P-Murphy/9780262018029?ref=grid-view&qid=1602139258108&sr=1-1"
get_book_info(url)

{'Name': 'Machine Learning : A Probabilistic Perspective',
 'Authors': 'Kevin P. Murphy',
 'Price': None,
 'Image-url': 'https://d1w7fb2mkkr3kw.cloudfront.net/assets/images/book/mid/9780/2620/9780262018029.jpg',
 'Rating': '4.35',
 'NumberOfPages': '1104 pages',
 'Format': '\n                                Hardback\n                                    | 1104 pages\n\n',
 'Publication date': '18 Oct 2016',
 'Publisher': '\n\n\n                                        MIT Press Ltd\n\n',
 'Publication City/Country': '\n                                Cambridge, United States',
 'Dimensions': '\n                                203\n                                    x 229\n                                    x 41mm\n                                \n                                    | 1,905g\n                                ',
 'Language': '\n                                English',
 'ISBN10': '0262018020',
 'ISBN13': '9780262018029'}

# Find all Machine learning Book

## Find all url of book in a search page

In [12]:
def find_book_url_list(url):
    try:
        html = urlopen(url)
    except:
        return None
    else:
        bs = BeautifulSoup(html.read(), "lxml")
        
    front = "https://www.bookdepository.com"
    url_list = []
    try:
        tags = bs.find_all("div", {"class":"book-item"})
    except:
        return None
    else:
        for tag in tags:
            book_url = front + tag.find('a')["href"]
            url_list.append(book_url)
    return url_list

## Go through all the search result

In [13]:
bs.find("li", {"id":"next-top"}) # next page

In [14]:
from urllib import parse

# url with "%" will gives error, so I make a function to handle it.
def get_percent_encoded(url):
    scheme, netloc, path, query, fragment = parse.urlsplit(url)
    path = parse.quote(path)
    link = parse.urlunsplit((scheme, netloc, path, query, fragment))
    return link

def find_all_book(url):
    
    books = []
    errors = []
    front = "https://www.bookdepository.com"
    while url:
        urls = find_book_url_list(url)
        for i in urls:
            try:
                book = get_book_info(i)
            except UnicodeEncodeError:
                i = get_percent_encoded(i)
                book = get_book_info(i)
            except:
                errors.append(i)
                print(i)
            books.append(book)
            
        # next page
        html = urlopen(url)
        bs = BeautifulSoup(html.read(), "lxml")
        url = bs.find("li", {"id":"next-top"})
        
        if url:
            url = front + url.a["href"]

            
    return books, errors

**Hardback**

In [15]:
hard_url = "https://www.bookdepository.com/search?searchTerm=machine+learning+statistical+learning&searchSortBy=&category=&price=&availability=&searchLang=&format=2"
hardback, hardback_errors = find_all_book(hard_url)

In [16]:
#hardback_errors

In [17]:
hardback_errors

[]

**Paperback**

In [None]:
paper_url = "https://www.bookdepository.com/search?searchTerm=machine+learning+statistical+learning&searchSortBy=&category=&price=&availability=&searchLang=&format=1"
papeerback, paperback_errors = find_all_book(paper_url)

In [None]:
paperback_errors

In [None]:
books = hardback + papeerback

# Store data into a csv file

In [None]:
import csv

colnames = list(books[0].keys())

csv_file = "bookprice.csv"
try:
    with open(csv_file, 'w') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=colnames)
        writer.writeheader()
        for book in books:
            try:
                writer.writerow(book)
            except:
                continue
except IOError:
    print("I/O error")

In [None]:
import numpy as np
import pandas as pd

In [None]:
pd.read_csv("bookprice.csv").head()