In [1]:
from bs4 import BeautifulSoup
import requests

url = 'https://books.toscrape.com/catalogue/page-1.html'

html = requests.get(url).text

soup = BeautifulSoup(html , "html.parser")

In [2]:
soup.title


<title>
    All products | Books to Scrape - Sandbox
</title>

In [3]:
soup.a

<a href="../index.html">Books to Scrape</a>

In [4]:
link = soup.find("h3").find("a")

title = link.get("title")

In [5]:
title

'A Light in the Attic'

In [7]:
books = soup.find_all("article" , class_="product_pod")


In [8]:
titles = [book.h3.a.get("title") for book in books]
titles

['A Light in the Attic',
 'Tipping the Velvet',
 'Soumission',
 'Sharp Objects',
 'Sapiens: A Brief History of Humankind',
 'The Requiem Red',
 'The Dirty Little Secrets of Getting Your Dream Job',
 'The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull',
 'The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics',
 'The Black Maria',
 'Starving Hearts (Triangular Trade Trilogy, #1)',
 "Shakespeare's Sonnets",
 'Set Me Free',
 "Scott Pilgrim's Precious Little Life (Scott Pilgrim #1)",
 'Rip it Up and Start Again',
 'Our Band Could Be Your Life: Scenes from the American Indie Underground, 1981-1991',
 'Olio',
 'Mesaerion: The Best Science Fiction Stories 1800-1849',
 'Libertarianism for Beginners',
 "It's Only the Himalayas"]

In [10]:
prices = [book.find("p",class_="price_color").text for book in books]
prices

['Â£51.77',
 'Â£53.74',
 'Â£50.10',
 'Â£47.82',
 'Â£54.23',
 'Â£22.65',
 'Â£33.34',
 'Â£17.93',
 'Â£22.60',
 'Â£52.15',
 'Â£13.99',
 'Â£20.66',
 'Â£17.46',
 'Â£52.29',
 'Â£35.02',
 'Â£57.25',
 'Â£23.88',
 'Â£37.59',
 'Â£51.33',
 'Â£45.17']

In [12]:
stock_availability = [book.find("p",class_="instock availability").get_text(strip=True) for book in books]
stock_availability

['In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock']

In [13]:
page_1 = {
  "title":titles,
  "price":prices,
  "stock_availability":stock_availability
}
page_1

{'title': ['A Light in the Attic',
  'Tipping the Velvet',
  'Soumission',
  'Sharp Objects',
  'Sapiens: A Brief History of Humankind',
  'The Requiem Red',
  'The Dirty Little Secrets of Getting Your Dream Job',
  'The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull',
  'The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics',
  'The Black Maria',
  'Starving Hearts (Triangular Trade Trilogy, #1)',
  "Shakespeare's Sonnets",
  'Set Me Free',
  "Scott Pilgrim's Precious Little Life (Scott Pilgrim #1)",
  'Rip it Up and Start Again',
  'Our Band Could Be Your Life: Scenes from the American Indie Underground, 1981-1991',
  'Olio',
  'Mesaerion: The Best Science Fiction Stories 1800-1849',
  'Libertarianism for Beginners',
  "It's Only the Himalayas"],
 'price': ['Â£51.77',
  'Â£53.74',
  'Â£50.10',
  'Â£47.82',
  'Â£54.23',
  'Â£22.65',
  'Â£33.34',
  'Â£17.93',
  'Â£22.60',
  'Â£52.15',
  'Â£13.99',
  'Â£20.66'

In [15]:
all_books = []

for page in range(1,51):
  url = f"http://books.toscrape.com/catalogue/page-{page}.html"
  response = requests.get(url)
  soup = BeautifulSoup(response.text , "html.parser")

  books = soup.find_all("article" , class_="product_pod")

  for book in books:
    title = book.h3.a.get("title")
    price = book.find("p", class_="price_color").get_text(strip=True)
    stock = book.find("p", class_="instock availability").get_text(strip=True)

    all_books.append({
      "title": title,
      "price": price,
      "stock_availability": stock
    })

In [16]:
print(len(all_books))  # should be 1000 books total
print(all_books[:5])  

1000
[{'title': 'A Light in the Attic', 'price': 'Â£51.77', 'stock_availability': 'In stock'}, {'title': 'Tipping the Velvet', 'price': 'Â£53.74', 'stock_availability': 'In stock'}, {'title': 'Soumission', 'price': 'Â£50.10', 'stock_availability': 'In stock'}, {'title': 'Sharp Objects', 'price': 'Â£47.82', 'stock_availability': 'In stock'}, {'title': 'Sapiens: A Brief History of Humankind', 'price': 'Â£54.23', 'stock_availability': 'In stock'}]


In [19]:
import pandas as pd 


df = pd.DataFrame(all_books)

df.to_csv("C:/cs/books.csv", index=False, encoding="utf-8")

