In [0]:
# Tutorial this notebook follows:
# https://towardsdatascience.com/an-introduction-to-web-scraping-with-python-a2601e8619e5
import requests

# Landing page of the demo bookstore; reused by later cells as the start URL.
main_url = 'http://books.toscrape.com/index.html'

# Download the landing page once so the following cells can parse it.
result = requests.get(url=main_url)

In [0]:
from bs4 import BeautifulSoup
# Parse the downloaded landing page with Python's built-in HTML parser.
soup = BeautifulSoup(markup=result.text, features='html.parser')

# Preview only the first 1000 characters of the pretty-printed document.
print(soup.prettify()[:1000])

In [0]:
def getAndParseURL(url, timeout=30):
  """Fetch ``url`` over HTTP and return the page parsed as a BeautifulSoup tree.

  Args:
    url: Address of the page to download.
    timeout: Seconds to wait for the server before giving up. Without a
      timeout ``requests`` may block indefinitely on an unresponsive host.

  Returns:
    BeautifulSoup: the response body parsed with ``html.parser``.

  Raises:
    requests.HTTPError: if the server answers with a 4xx/5xx status, so an
      error page is never parsed as if it were real content.
    requests.RequestException: on connection failures or timeouts.
  """
  result = requests.get(url, timeout=timeout)
  # Fail loudly on error responses instead of silently parsing them.
  result.raise_for_status()
  # result.text is kept (rather than raw bytes) so the decoded text that
  # callers slice stays exactly what it was before.
  return BeautifulSoup(result.text, 'html.parser')

In [0]:
# First <article class="product_pod"> element of the parsed page.
soup.find("article", attrs={"class": "product_pod"})

In [0]:
# First <a> inside the first <div> of the first product card.
soup.find("article", attrs={"class": "product_pod"}).find("div").find("a")

In [0]:
# href attribute of that link (None if the attribute is missing).
soup.find("article", attrs={"class": "product_pod"}).find("div").find("a").get('href')

In [0]:
# Collect the href of the first link in every product card on the parsed page.
main_page_products_urls = []
for product_card in soup.findAll("article", class_="product_pod"):
  main_page_products_urls.append(product_card.div.a.get('href'))

print(f"{len(main_page_products_urls)} fetched products URLs")
print("One example:")
main_page_products_urls[0]

In [0]:
def getBooksURLs(url):
  """Return the absolute URL of every book listed on the page at ``url``.

  Args:
    url: Address of a listing page; it is downloaded with ``getAndParseURL``.

  Returns:
    list[str]: one absolute URL per ``<article class="product_pod">`` element,
    in document order.
  """
  # Local import keeps this cell runnable on its own.
  from urllib.parse import urljoin

  soup = getAndParseURL(url)
  # Resolve each relative href against the page it was found on. urljoin
  # drops the trailing file name of the base URL (e.g. index.html), also
  # handles a base URL without any path, and collapses "../" segments --
  # cases the manual split("/") / join approach got wrong or left unresolved.
  return [
      urljoin(url, card.div.a.get('href'))
      for card in soup.find_all("article", class_="product_pod")
  ]

In [0]:
import re
from urllib.parse import urljoin

# Resolve each category href against the landing page URL. Plain string
# concatenation would glue the href onto ".../index.html" and produce
# invalid URLs such as ".../index.htmlcatalogue/category/...".
categories_urls = [
    urljoin(main_url, x.get('href'))
    for x in soup.find_all("a", href=re.compile("catalogue/category/books"))
]
categories_urls = categories_urls[1:]  # we remove the first one because it corresponds to all the books

print(str(len(categories_urls)) + " fetched categories URLs")
print("Some examples:")
categories_urls[:5]

In [0]:
# Collect every listing page URL, starting from the landing page.
pages_urls = [main_url]
soup = getAndParseURL(pages_urls[0])

# Two links whose href contains "page" mean the page has both a 'previous'
# and a 'next' button. A single link means we are on the first or the last
# page; the first page is recognised by pages_urls still holding only the
# start URL, so the walk stops once the last page is reached.
while True:
  pager_links = soup.findAll("a", href=re.compile("page"))
  if len(pager_links) != 2 and len(pages_urls) != 1:
    break

  # Directory of the current page (its URL minus the trailing file name),
  # followed by the href of the last pager link.
  current_dir = "/".join(pages_urls[-1].split("/")[:-1])
  new_url = current_dir + "/" + pager_links[-1].get("href")

  # Remember the URL, then load it for the next round.
  pages_urls.append(new_url)
  soup = getAndParseURL(new_url)

print(f"{len(pages_urls)} fetched URLs")
print("Some examples:")
pages_urls[:5]

In [0]:
# Request page 50 and page 51 and print the HTTP status code of each.
for page_number, page_url in (
    (50, "http://books.toscrape.com/catalogue/page-50.html"),
    (51, "http://books.toscrape.com/catalogue/page-51.html"),
):
  result = requests.get(page_url)
  print(f"status code for page {page_number}: {result.status_code}")

In [0]:
# Walk page-1, page-2, ... and keep every URL that answers with HTTP 200;
# the first non-200 response ends the walk.
pages_urls = []
page_number = 1
new_page = "http://books.toscrape.com/catalogue/page-1.html"

while requests.get(new_page).status_code == 200:
  pages_urls.append(new_page)
  # Build the URL of the following page from an explicit counter.
  page_number += 1
  new_page = "http://books.toscrape.com/catalogue/page-" + str(page_number) + ".html"

print(f"{len(pages_urls)} fetched URLs")
print("Some examples:")
pages_urls[:5]

In [0]:
# Flatten the book links of every listing page into one list, in page order.
booksURLs = [book_url for page in pages_urls for book_url in getBooksURLs(page)]

print(f"{len(booksURLs)} fetched URLs")
print("Some examples:")
booksURLs[:5]

In [0]:
from urllib.parse import urljoin

import pandas as pd

# One list per output column; each loop iteration appends one value to each.
names = []
prices = []
nb_in_stock = []
img_urls = []
categories = []
ratings = []

# scrape data for every book in url: this may take some time
for url in booksURLs:
  soup = getAndParseURL(url)
  # product name
  names.append(soup.find("div", class_ = re.compile("product_main")).h1.text)
  # product price: keep only digits and the decimal point, so the currency
  # sign is removed no matter how many characters it decodes to
  prices.append(re.sub(r"[^0-9.]", "", soup.find("p", class_="price_color").text))
  # number of available products: drop every character that is NOT a digit
  # (the previous pattern "[&0-9]" removed the digits themselves)
  nb_in_stock.append(re.sub("[^0-9]", "", soup.find("p", class_ = "instock availability").text))
  # image url: resolve the (relative) src of the first <img> against the page URL
  img_urls.append(urljoin(url, soup.find("img").get("src")))
  # product category: the trailing "/" and escaped dots restrict the match to
  # ".../category/books/<category>/..." links, so index 3 of the split href is
  # the category segment.
  # NOTE(review): presumably the page also links to an "all books" entry such
  # as "books_1", which the unanchored pattern would hit first -- confirm.
  categories.append(soup.find("a", href = re.compile(r"\.\./category/books/")).get("href").split("/")[3])
  # ratings: second entry of the class list of the first "star-rating" paragraph
  ratings.append(soup.find("p", class_ = re.compile("star-rating")).get("class")[1])

# add data into pandas df
scraped_data = pd.DataFrame({'name': names, 'price': prices, 'nb_in_stock': nb_in_stock, 'url_img':img_urls, 'product_category': categories, 'rating': ratings})
scraped_data.head()