In [1]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd

In [2]:
url = "http://books.toscrape.com/catalogue/page-1.html"
# URL of a fictional bookstore, safe for beginners learning web scraping

In [3]:
headers = {"Accept-Language": "en-US, en;q=0.6, en-GB;q=0.5"}

*`Accept-Language`* request HTTP header tells us which (natural) language the user is able to understand.

**Relative quality factor**: 
Preference of language by the user, on a scale of 0 to 1 <br>
*`en-US, en;q=0.6, en-GB;q=0.5`* means American English is preferres, but will accept other types of English and British English.

https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept-Language

In [4]:
page = requests.get(url, headers = headers)

*`request.get(url, headers = headers)`* is used to grab the contents of the URL, in language of our preference 

In [5]:
soup = BeautifulSoup(page.text, "html.parser")
# print(soup.prettify())

*`BeautifulSoup`* method parses the HTML in the *`page`* variable <br>
*`prettify()`* method presents the grabbed information in an easy to read structures tree format 

In [6]:
title = []
price = []
stock = []
rating = []

In [7]:
book_li = soup.find_all("li", attrs = {"class": "col-xs-6 col-sm-4 col-md-3 col-lg-3"})
# Used to store details of all "li" tags, which houses the details of each book in the catalog

In [8]:
for book in book_li:
    
    # Extracting title of the book
    name = book.article.h3.a.text
    title.append(name)
    
    #Extracting the tag that houses both price and stock availability of each book
    product = book.article.find("div", attrs = {"class": "product_price"})
    
    # Extracting the cost of the book
    cost = product.find("p", {"class": "price_color"}).text
    price.append(cost[2:])                                                   # Removing the euro sign in front of each price
    
    # Extracting the stock status of the book 
    available = product.find("p", {"class": "instock availability"}).text
    stock.append(available.lstrip().rstrip())                                # Removing white spaces surrounding the text
    
    # Extracting the rating of the book
    rating_class_attrs = book.article.find("p", {"class": "star-rating"}).attrs
    if ("One" in rating_class_attrs["class"]):
        stars = 1
    elif ("Two" in rating_class_attrs["class"]):
        stars = 2
    elif ("Three" in rating_class_attrs["class"]):
        stars = 3
    elif ("Four" in rating_class_attrs["class"]):
        stars = 4
    elif ("Five" in rating_class_attrs["class"]):
        stars = 5
    rating.append(stars)    

In [9]:
print(title)

['A Light in the ...', 'Tipping the Velvet', 'Soumission', 'Sharp Objects', 'Sapiens: A Brief History ...', 'The Requiem Red', 'The Dirty Little Secrets ...', 'The Coming Woman: A ...', 'The Boys in the ...', 'The Black Maria', 'Starving Hearts (Triangular Trade ...', "Shakespeare's Sonnets", 'Set Me Free', "Scott Pilgrim's Precious Little ...", 'Rip it Up and ...', 'Our Band Could Be ...', 'Olio', 'Mesaerion: The Best Science ...', 'Libertarianism for Beginners', "It's Only the Himalayas"]


In [10]:
print(price)

['51.77', '53.74', '50.10', '47.82', '54.23', '22.65', '33.34', '17.93', '22.60', '52.15', '13.99', '20.66', '17.46', '52.29', '35.02', '57.25', '23.88', '37.59', '51.33', '45.17']


In [11]:
print(stock)

['In stock', 'In stock', 'In stock', 'In stock', 'In stock', 'In stock', 'In stock', 'In stock', 'In stock', 'In stock', 'In stock', 'In stock', 'In stock', 'In stock', 'In stock', 'In stock', 'In stock', 'In stock', 'In stock', 'In stock']


In [12]:
print(rating)

[3, 1, 1, 4, 5, 1, 4, 3, 4, 1, 2, 4, 5, 5, 5, 3, 1, 1, 2, 2]


In [13]:
# Creating a dataframe to store the data in a tabular form
books = pd.DataFrame({
    "Title": title,
    "Price": price,
    "Availability": stock,
    "5 Star Rating": rating
})

In [14]:
print(books.dtypes)

Title            object
Price            object
Availability     object
5 Star Rating     int64
dtype: object


In [15]:
# Changing the data type of "Price" to float
books["Price"] = books["Price"].astype(float)                            

In [16]:
print(books.dtypes)

Title             object
Price            float64
Availability      object
5 Star Rating      int64
dtype: object


In [17]:
print(books)

                                    Title  Price Availability  5 Star Rating
0                      A Light in the ...  51.77     In stock              3
1                      Tipping the Velvet  53.74     In stock              1
2                              Soumission  50.10     In stock              1
3                           Sharp Objects  47.82     In stock              4
4            Sapiens: A Brief History ...  54.23     In stock              5
5                         The Requiem Red  22.65     In stock              1
6            The Dirty Little Secrets ...  33.34     In stock              4
7                 The Coming Woman: A ...  17.93     In stock              3
8                     The Boys in the ...  22.60     In stock              4
9                         The Black Maria  52.15     In stock              1
10  Starving Hearts (Triangular Trade ...  13.99     In stock              2
11                  Shakespeare's Sonnets  20.66     In stock              4

In [18]:
# Saving the resultant data frame in a CSV file named as "books_page_1.csv"
books.to_csv("books_page_1.csv")