In [1]:
import time
import random
import requests
import pickle
import csv
from urllib.parse import urljoin
from bs4 import BeautifulSoup as Soup

In [30]:
class WebScraper:

    # User Agent from Chrome Browser on Win 10/11
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36'}

    default_sleep = 5.0 # These may need tuning
    sigma = 1.0

    save_clock = 30

    def get(url: str) -> requests.Response:
        """Waits a random amount of time, then send a GET request"""
        time.sleep(abs(random.gauss(WebScraper.default_sleep, WebScraper.sigma)))
        return requests.get(url, headers=WebScraper.headers)

    def __init__(self, domain, start='/index.html', state_filename='webscrape.dat', output_filename='output.csv'):
        self._links = [urljoin(domain, start)]
        self._data = {}
        self._links_processed = 0
        self._state_filename = state_filename
        self._output_filename = output_filename

    def save_state(self):
        with open(self._state_filename, 'wb') as statefile:
            pickle.dump(self, statefile)

    def load_state(state_filename='webscrape.dat'):
        with open(state_filename, 'rb') as statefile:
            return pickle.load(statefile)

    def extract_links(page, from_url) -> list:
        links = page.find_all('a')
        return [urljoin(from_url, link['href']) for link in links if 'href' in link.attrs]

    def extract_data(self, page) -> dict:
        raise NotImplementedError

    def crawl(self):
        while len(self._links) > 0:
            # main loop for crawling a website


In [31]:
books = WebScraper('http://books.toscrape.com')

In [32]:
books.crawl()

Processing http://books.toscrape.com/index.html… Finished
Processing http://books.toscrape.com/catalogue/page-2.html… Finished
Processing http://books.toscrape.com/catalogue/page-3.html… Finished
Processing http://books.toscrape.com/catalogue/page-4.html… Finished
Processing http://books.toscrape.com/catalogue/page-5.html… Finished
Processing http://books.toscrape.com/catalogue/page-6.html… Finished
Processing http://books.toscrape.com/catalogue/page-7.html… Finished
Processing http://books.toscrape.com/catalogue/page-8.html… Finished
Processing http://books.toscrape.com/catalogue/page-9.html… Finished
Processing http://books.toscrape.com/catalogue/page-10.html…

In [33]:
books._links
    

['http://books.toscrape.com/index.html',
 'http://books.toscrape.com/index.html',
 'http://books.toscrape.com/catalogue/category/books_1/index.html',
 'http://books.toscrape.com/catalogue/category/books/travel_2/index.html',
 'http://books.toscrape.com/catalogue/category/books/mystery_3/index.html',
 'http://books.toscrape.com/catalogue/category/books/historical-fiction_4/index.html',
 'http://books.toscrape.com/catalogue/category/books/sequential-art_5/index.html',
 'http://books.toscrape.com/catalogue/category/books/classics_6/index.html',
 'http://books.toscrape.com/catalogue/category/books/philosophy_7/index.html',
 'http://books.toscrape.com/catalogue/category/books/romance_8/index.html',
 'http://books.toscrape.com/catalogue/category/books/womens-fiction_9/index.html',
 'http://books.toscrape.com/catalogue/category/books/fiction_10/index.html',
 'http://books.toscrape.com/catalogue/category/books/childrens_11/index.html',
 'http://books.toscrape.com/catalogue/category/books/relig

In [34]:
books._data

{'http://books.toscrape.com/index.html': None,
 'http://books.toscrape.com/catalogue/page-2.html': None,
 'http://books.toscrape.com/catalogue/page-3.html': None,
 'http://books.toscrape.com/catalogue/page-4.html': None,
 'http://books.toscrape.com/catalogue/page-5.html': None,
 'http://books.toscrape.com/catalogue/page-6.html': None,
 'http://books.toscrape.com/catalogue/page-7.html': None,
 'http://books.toscrape.com/catalogue/page-8.html': None,
 'http://books.toscrape.com/catalogue/page-9.html': None,
 'http://books.toscrape.com/catalogue/page-10.html': None}

In [35]:
class ToScrape (WebScraper):

    fields = ['Title', 'Category', 'UPC', 'Product Type', 'Price (excl. tax)', 'Price (incl. tax)', 'Tax', 'Availability', 'Number of reviews']
    categories = ('Travel', 'Mystery', 'Historical Fiction', 'Sequential Art', 'Classics', 'Philosophy', 'Romance', 'Womens Fiction', 'Fiction', 'Childrens', 'Religion', 'Nonfiction', 'Music', 'Default', 'Science Fiction', 'Sports and Games', 'Add a comment', 'Fantasy', 'New Adult', 'Young Adult', 'Science', 'Poetry', 'Paranormal', 'Art', 'Psychology', 'Autobiography', 'Parenting', 'Adult Fiction', 'Humor', 'Horror', 'History', 'Food and Drink', 'Christian Fiction', 'Business', 'Biography', 'Thriller', 'Contemporary', 'Spirituality', 'Academic', 'Self Help', 'Historical', 'Christian', 'Suspense', 'Short Stories', 'Novels', 'Health', 'Politics', 'Cultural', 'Erotica', 'Crime')

    def extract_data(self, page) -> dict:
        book_data = {}
        # toscrape.com specific data collection
        return book_data

In [39]:
books = ToScrape('http://books.toscrape.com', start='/catalogue/the-age-of-genius-the-seventeenth-century-and-the-birth-of-the-modern-mind_929/index.html')

In [40]:
books.crawl()

Processing http://books.toscrape.com/catalogue/the-age-of-genius-the-seventeenth-century-and-the-birth-of-the-modern-mind_929/index.html… Finished
Processing http://books.toscrape.com/catalogue/it_330/index.html… Finished
Processing http://books.toscrape.com/catalogue/can-you-keep-a-secret-fear-street-relaunch-4_614/index.html… Finished
Processing http://books.toscrape.com/catalogue/security_925/index.html… Finished
Processing http://books.toscrape.com/catalogue/dress-your-family-in-corduroy-and-denim_562/index.html… Finished
Processing http://books.toscrape.com/catalogue/catastrophic-happiness-finding-joy-in-childhoods-messy-years_138/index.html… Finished
Processing http://books.toscrape.com/catalogue/approval-junkie-adventures-in-caring-too-much_363/index.html… Finished
Processing http://books.toscrape.com/catalogue/thinking-fast-and-slow_289/index.html… Finished
Processing http://books.toscrape.com/catalogue/the-art-book_490/index.html… Finished
Processing http://books.toscrape.com/

In [41]:
books._data

{'http://books.toscrape.com/catalogue/the-age-of-genius-the-seventeenth-century-and-the-birth-of-the-modern-mind_929/index.html': {'UPC': 'a3d2a4250807f1e9',
  'Product Type': 'Books',
  'Price (excl. tax)': 'Â£19.73',
  'Price (incl. tax)': 'Â£19.73',
  'Tax': 'Â£0.00',
  'Availability': 'In stock (16 available)',
  'Number of reviews': '0',
  'Category': 'History',
  'Title': 'The Age of Genius: The Seventeenth Century and the Birth of the Modern Mind'},
 'http://books.toscrape.com/catalogue/it_330/index.html': {'UPC': '670a2773607d785d',
  'Product Type': 'Books',
  'Price (excl. tax)': 'Â£25.01',
  'Price (incl. tax)': 'Â£25.01',
  'Tax': 'Â£0.00',
  'Availability': 'In stock (4 available)',
  'Number of reviews': '0',
  'Category': 'Horror',
  'Title': 'It'},
 'http://books.toscrape.com/catalogue/can-you-keep-a-secret-fear-street-relaunch-4_614/index.html': {'UPC': 'd069086944f2e330',
  'Product Type': 'Books',
  'Price (excl. tax)': 'Â£48.64',
  'Price (incl. tax)': 'Â£48.64',
  

In [None]:
try:
    for page in website:
      # extract data
    except KeyboardInterrupt 

In [None]:
class SiteCrawler:

    __iter__(self):
       