In [1]:
import requests                            # For making request to websites
from bs4 import BeautifulSoup              # For parsing HTML
import pandas as pd                        # To store data in a data frame

In [2]:
headers = {"Accept-Language": "en-US, en;q=0.6, en-GB;q=0.5"}

*`Accept-Language`* request HTTP header tells us which (natural) language the user is able to understand.

**Relative quality factor**: 
Preference of language by the user, on a scale of 0 to 1 <br>
*`en-US, en;q=0.6, en-GB;q=0.5`* means American English is preferres, but will accept other types of English and British English.

https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept-Language

In [3]:
# URL of first page: "http://books.toscrape.com/catalogue/page-1.html"

url_head = "http://books.toscrape.com/catalogue/page-"
url_tail = ".html"

title = []
price = []
stock = []
rating = []
page_no = []

for i in range(1,51):
    
    url = url_head + str(i) + url_tail
    page = requests.get(url, headers = headers)
    soup = BeautifulSoup(page.text, "html.parser")
        
    book_li = soup.find_all("li", attrs = {"class": "col-xs-6 col-sm-4 col-md-3 col-lg-3"})
    # Used to store details of all "li" tags, which houses the details of each book in the catalog
    
    for book in book_li:
    
        # Extracting title of the book
        name = book.article.h3.a.text
        title.append(name)

        #Extracting the tag that houses both price and stock availability of each book
        product = book.article.find("div", attrs = {"class": "product_price"})

        # Extracting the cost of the book
        cost = product.find("p", {"class": "price_color"}).text
        price.append(cost[2:]) # Removing the euro sign in front of each price

        # Extracting the stock status of the book 
        available = product.find("p", {"class": "instock availability"}).text
        stock.append(available.lstrip().rstrip()) # Removing white spaces surrounding the text

        # Extracting the rating of the book
        rating_class_attrs = book.article.find("p", {"class": "star-rating"}).attrs
        if ("One" in rating_class_attrs["class"]):
            stars = 1
        elif ("Two" in rating_class_attrs["class"]):
            stars = 2
        elif ("Three" in rating_class_attrs["class"]):
            stars = 3
        elif ("Four" in rating_class_attrs["class"]):
            stars = 4
        elif ("Five" in rating_class_attrs["class"]):
            stars = 5
        rating.append(stars)
        
        # Entering the webpage number in which the book is present
        page_no.append(i)

In [4]:
# Creating a data frame to store the data in a tabular form
books = pd.DataFrame({
    "Title": title,
    "Price (in Euros)": price,
    "Availability": stock,
    "5 Star Rating": rating,
    "Webpage Number": page_no
})

In [5]:
# Changing the data type of "Price" to float
books["Price (in Euros)"] = books["Price (in Euros)"].astype(float)

In [6]:
print(books)

                                Title  Price (in Euros) Availability  \
0                  A Light in the ...             51.77     In stock   
1                  Tipping the Velvet             53.74     In stock   
2                          Soumission             50.10     In stock   
3                       Sharp Objects             47.82     In stock   
4        Sapiens: A Brief History ...             54.23     In stock   
..                                ...               ...          ...   
995  Alice in Wonderland (Alice's ...             55.53     In stock   
996    Ajin: Demi-Human, Volume 1 ...             57.06     In stock   
997         A Spy's Devotion (The ...             16.97     In stock   
998           1st to Die (Women's ...             53.98     In stock   
999           1,000 Places to See ...             26.08     In stock   

     5 Star Rating  Webpage Number  
0                3               1  
1                1               1  
2                1      

In [7]:
# Saving the resultant data frame in a CSV file named as "books_to_scrap.csv"
books.to_csv("books_to_scrape.csv")