In [1]:
!pip freeze

affine==2.4.0
anyio==4.1.0
appnope==0.1.3
argon2-cffi==23.1.0
argon2-cffi-bindings==21.2.0
arrow==1.3.0
asttokens==2.4.1
async-lru==2.0.4
attrs==23.1.0
Babel==2.13.1
beautifulsoup4==4.12.2
bleach==6.1.0
Cartopy==0.22.0
certifi==2023.11.17
cffi==1.16.0
charset-normalizer==3.3.2
click==8.1.7
click-plugins==1.1.1
cligj==0.7.2
comm==0.2.0
contextily==1.4.0
contourpy==1.2.0
cycler==0.12.1
debugpy==1.8.0
decorator==5.1.1
defusedxml==0.7.1
exceptiongroup==1.2.0
executing==2.0.1
fastcore==1.5.29
fastjsonschema==2.19.0
filelock==3.13.1
fiona==1.9.5
fonttools==4.46.0
fqdn==1.5.1
geographiclib==2.0
geopandas==0.14.1
geoplot==0.5.1
geopy==2.4.1
ghapi==1.0.4
idna==3.6
importlib-metadata==7.0.0
importlib-resources==6.1.1
ipykernel==6.27.1
ipython==8.18.1
ipywidgets==8.1.1
isoduration==20.11.0
jedi==0.19.1
Jinja2==3.1.2
joblib==1.3.2
json5==0.9.14
jsonpointer==2.4
jsonschema==4.20.0
jsonschema-specifications==2023.11.2
jupyter==1.0.0
jupyter-console==6.6.3
jupyter-events==0.9.0
jupyter-lsp==2.2.1
jup

# Scrape The Web With Python

Free course from LeWagon.

We'll scrape https://books.toscrape.com

In [116]:
from typing import Optional, List, Any, Dict

import requests
from bs4 import BeautifulSoup
import pandas as pd

Make a request to get the page and parse it.

In [30]:
# Root URL of the site; relative paths are appended to it to form full URLs
url_prefix = "https://books.toscrape.com/"

In [31]:
url = url_prefix + "index.html"
# A timeout keeps the cell from hanging forever on a stalled connection
response = requests.get(url, timeout=30)
# Fail loudly here instead of parsing an error page as if it were the catalogue
response.raise_for_status()
# Hand the raw bytes to BeautifulSoup and let it deal with the encoding
html = response.content
scraped = BeautifulSoup(html, "html.parser")

In [33]:
type(scraped)

bs4.BeautifulSoup

In [12]:
# Extract a single element (the first one with that tag)

title_text = scraped.title.text.strip()
title_text

'All products | Books to Scrape - Sandbox'

In [13]:
scraped.h1.text.strip()

'All products'

Locate specific elements deep in the structure:

* Locate in Inspector
* Climb the tree: Look for uncommon ancestors
* Find that element, using its class name (for example)
* Navigate down again

In [22]:
# Example: Title of first book shown
link_first_book = scraped.find("article", class_="product_pod").h3.a
link_first_book

<a href="catalogue/a-light-in-the-attic_1000/index.html" title="A Light in the Attic">A Light in the ...</a>

In [24]:
# Use link.text to get user-visible text between tags
# Use link["title"] to get value of title attribute (dictionary)

print("Text of link: " + link_first_book.text.strip())
print("Value of title attribute: " + link_first_book["title"].strip())

Text of link: A Light in the ...
Value of title attribute: A Light in the Attic


Let us try to get all sorts of data for all the books!

In [27]:
# Only the first page for now; it is the one already parsed into ``scraped``

# Last class name of an item's first <p> tag -> number of stars
stars_map = {"One": 1, "Two": 2, "Three": 3, "Four": 4, "Five": 5}

items = scraped.find_all("article", class_="product_pod")
books_data = []
for item in items:
    # Price and availability both live inside the "product_price" <div>
    price_item = item.find("div", class_="product_price")
    availability = price_item.find("p", class_="instock availability").text
    books_data.append(
        {
            "title": item.h3.a["title"],
            "num_stars": stars_map[item.p["class"][-1]],
            "price": price_item.find("p", class_="price_color").text,
            "is_instock": availability.strip().lower() == "in stock",
        }
    )

In [28]:
books_data

[{'title': 'A Light in the Attic',
  'num_stars': 3,
  'price': '£51.77',
  'is_instock': True},
 {'title': 'Tipping the Velvet',
  'num_stars': 1,
  'price': '£53.74',
  'is_instock': True},
 {'title': 'Soumission',
  'num_stars': 1,
  'price': '£50.10',
  'is_instock': True},
 {'title': 'Sharp Objects',
  'num_stars': 4,
  'price': '£47.82',
  'is_instock': True},
 {'title': 'Sapiens: A Brief History of Humankind',
  'num_stars': 5,
  'price': '£54.23',
  'is_instock': True},
 {'title': 'The Requiem Red',
  'num_stars': 1,
  'price': '£22.65',
  'is_instock': True},
 {'title': 'The Dirty Little Secrets of Getting Your Dream Job',
  'num_stars': 4,
  'price': '£33.34',
  'is_instock': True},
 {'title': 'The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull',
  'num_stars': 3,
  'price': '£17.93',
  'is_instock': True},
 {'title': 'The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics',
  'num_stars': 4,
  'pric

There are many pages (currently 50), each with a number of books (currently 20 each). Pages are called `catalogue/page-*.html`. We'd like to scrape data from all these pages.

In [186]:
sub_directory = "catalogue/"

def get_url(url_relative: str) -> str:
    """
    Turn a relative path into a full URL below ``url_prefix``.

    Paths are accepted both with and without the leading ``sub_directory``;
    it is inserted when missing, so both forms give the same URL.

    TODO: This is a bit annoying. Is there a better way to obtain the correct
    prefix?
    """
    has_sub_directory = url_relative.startswith(sub_directory)
    path = url_relative if has_sub_directory else sub_directory + url_relative
    return url_prefix + path

def load_page(
    url_relative: str, retry: int = 5, timeout: float = 30.0
) -> Optional[BeautifulSoup]:
    """
    Download a page of the site and parse it with BeautifulSoup.

    :param url_relative: Relative path of the page, resolved with ``get_url``
    :param retry: Maximum number of download attempts
    :param timeout: Seconds to wait for the server on each attempt
    :return: Parsed page, or ``None`` if no attempt returned status code 200
    """
    url = get_url(url_relative)
    for attempt in range(retry):
        if attempt > 0:
            print(f"Retry download of {url}")
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as exc:
            # Network-level failures (connection error, timeout, ...) are what
            # the retries are for: report the problem and try again
            print(f"Download of {url} failed: {exc}")
            continue
        if response.status_code == 200:
            return BeautifulSoup(response.content, "html.parser")
        if response.status_code == 404:
            # A missing page will not appear on retry, so give up right away
            break
    return None

def load_books_page(number: int) -> Optional[BeautifulSoup]:
    """
    Load the listing page ``page-<number>.html`` from ``sub_directory``.

    The result is whatever ``load_page`` returns for that path.
    """
    page_path = sub_directory + f"page-{number}.html"
    return load_page(page_path)

In [187]:
# Maps the header text (<th>) of a row in the table on a book's detail page to
# the key under which that row's value is stored in the record. Rows whose
# header is not listed here are skipped by ``parse_detail_page``.
tr_map = {
    "UPC": "upc",
    "Tax": "tax",
    "Availability": "num_available",
    "Number of reviews": "num_reviews",
}

def uk_price_to_float(price: str) -> float:
    """
    Convert a price string such as ``"£51.77"`` to a float.

    Surrounding whitespace, a leading pound sign (also in its mis-decoded
    form ``"Â£"``) and thousands separators (``","``) are ignored.

    :param price: Price as shown on the page
    :return: Numeric value of the price
    :raises ValueError: If what remains is not a valid number
    """
    # lstrip takes a set of characters, so this removes "£" as well as "Â£"
    number_text = price.strip().lstrip("Â£").strip()
    return float(number_text.replace(",", ""))

def get_product_description(article_item: BeautifulSoup) -> Optional[str]:
    """
    Extract the product description from the article element of a detail page.

    :param article_item: Element searched for the description header
    :return: Text of the description, or ``None`` if there is none
    """
    if article_item is None:
        return None
    # There might be no product description
    # CSS selector. Alternative is find("div", id="product_description")
    title_item = article_item.select_one("div.sub-header#product_description")
    if title_item is None:
        return None
    # The description is inside a <p> tag that follows the <div> with the
    # title. Searching by tag name skips the '\n' text node between the two
    # siblings without relying on it being there.
    # (Alternative: the description is a <p> tag without class attribute.)
    description_item = title_item.find_next_sibling("p")
    if description_item is None:
        return None
    return description_item.text

def get_category(page: BeautifulSoup) -> str:
    """
    Return the text of the third breadcrumb entry of ``page``, stripped of
    surrounding whitespace. The caller stores it as the book's category.
    """
    # CSS selector: More compact than ``find`` in this case
    breadcrumb_entry = page.select_one(
        "div.page_inner > ul.breadcrumb > li:nth-of-type(3)"
    )
    category = breadcrumb_entry.text
    return category.strip()

def _parse_num_available(availability: str) -> int:
    """
    Parse the number of copies from availability text ending in
    ``"(<number> available)"``. Any other text counts as 0 copies.
    """
    parts = availability.split()
    if len(parts) >= 2 and parts[-1] == "available)":
        # parts[-2] looks like "(22": drop the opening parenthesis
        return int(parts[-2][1:])
    return 0


def parse_detail_page(url_relative: str) -> Dict[str, Any]:
    """
    Parse out data from detail page for a book. We only extract data which is not
    already contained in the book record.

    :param url_relative: Relative path of the detail page
    :return: Dictionary with "category", "description" and the values of
        ``tr_map``, as far as they are found on the page. Empty if the page
        could not be loaded
    """
    result = {}
    page = load_page(url_relative)
    if page is None:
        return result
    result["category"] = get_category(page)
    article_item = page.find("article", class_="product_page")
    if article_item is None:
        # Without the product article there is nothing more to extract
        return result
    result["description"] = get_product_description(article_item)
    table = article_item.table
    table_items = table.find_all("tr") if table is not None else []
    for table_item in table_items:
        if table_item.th is None or table_item.td is None:
            continue
        key = tr_map.get(table_item.th.text.strip())
        if key is None:
            # Row is not among the ones we are interested in
            continue
        value = table_item.td.text.strip()
        if key == "num_available":
            value = _parse_num_available(value)
        elif key == "num_reviews":
            value = int(value)
        elif key == "tax":
            value = uk_price_to_float(value)
        result[key] = value
    return result

In [188]:
# Maps the last class name of a book item's first <p> tag to the number of
# stars. Same mapping as in the first-page example further up.
stars_map = {
    "One": 1,
    "Two": 2,
    "Three": 3,
    "Four": 4,
    "Five": 5,
}

def parse_book_record(
    item: BeautifulSoup,
    books_data: List[Dict[str, Any]],
    record_pos: int,
    page_number: int,
):
    """
    Build the record for the single book element ``item`` and append it to
    ``books_data``.

    The record holds what the listing shows (title, stars, price, in-stock
    flag) plus ``page_number`` and ``record_pos``. It is extended with the
    data from the book's detail page when that page can be parsed.
    """
    link = item.h3.a
    book_title, url_relative = link["title"], link["href"]
    # The last class name of the first <p> is the rating word ("One" ... "Five")
    rating = stars_map[item.p["class"][-1]]
    price_box = item.find("div", class_="product_price")
    price_value = uk_price_to_float(price_box.find("p", class_="price_color").text)
    stock_label = price_box.find("p", class_="instock availability").text
    record = {
        "page_number": page_number,
        "record_pos_on_page": record_pos,
        "title": book_title,
        "num_stars": rating,
        "price": price_value,
        "is_instock": stock_label.strip().lower() == "in stock",
    }
    print("Load detail page " + url_relative)
    info_detail_page = parse_detail_page(url_relative)
    if not info_detail_page:
        # An empty dict means the detail page gave us nothing
        print("==> Failed!")
    else:
        record.update(info_detail_page)
    books_data.append(record)

def parse_book_records_on_page(
    page: BeautifulSoup,
    books_data: List[Dict[str, Any]],
    page_number: int,
):
    """
    Append one record to ``books_data`` for every book element found on
    ``page``. Positions on the page are counted from 1.
    """
    book_items = page.find_all("article", class_="product_pod")
    for position, book_item in enumerate(book_items, start=1):
        parse_book_record(
            book_item, books_data, record_pos=position, page_number=page_number
        )

Let us first download all information from the first page

In [189]:
# Start from an empty list; the records of the page are appended to it
books_data = []

# ``scraped`` is the parsed index.html loaded earlier
parse_book_records_on_page(scraped, books_data, page_number=1)

Load detail page catalogue/a-light-in-the-attic_1000/index.html
Load detail page catalogue/tipping-the-velvet_999/index.html
Load detail page catalogue/soumission_998/index.html
Load detail page catalogue/sharp-objects_997/index.html
Load detail page catalogue/sapiens-a-brief-history-of-humankind_996/index.html
Load detail page catalogue/the-requiem-red_995/index.html
Load detail page catalogue/the-dirty-little-secrets-of-getting-your-dream-job_994/index.html
Load detail page catalogue/the-coming-woman-a-novel-based-on-the-life-of-the-infamous-feminist-victoria-woodhull_993/index.html
Load detail page catalogue/the-boys-in-the-boat-nine-americans-and-their-epic-quest-for-gold-at-the-1936-berlin-olympics_992/index.html
Load detail page catalogue/the-black-maria_991/index.html
Load detail page catalogue/starving-hearts-triangular-trade-trilogy-1_990/index.html
Load detail page catalogue/shakespeares-sonnets_989/index.html
Load detail page catalogue/set-me-free_988/index.html
Load detail 

In [190]:
books_data

[{'page_number': 1,
  'record_pos_on_page': 1,
  'title': 'A Light in the Attic',
  'num_stars': 3,
  'price': 51.77,
  'is_instock': True,
  'category': 'Poetry',
  'description': "It's hard to imagine a world without A Light in the Attic. This now-classic collection of poetry and drawings from Shel Silverstein celebrates its 20th anniversary with this special edition. Silverstein's humorous and creative verse can amuse the dowdiest of readers. Lemon-faced adults and fidgety kids sit still and read these rhythmic words and laugh and smile and love th It's hard to imagine a world without A Light in the Attic. This now-classic collection of poetry and drawings from Shel Silverstein celebrates its 20th anniversary with this special edition. Silverstein's humorous and creative verse can amuse the dowdiest of readers. Lemon-faced adults and fidgety kids sit still and read these rhythmic words and laugh and smile and love that Silverstein. Need proof of his genius? RockabyeRockabye baby, in

This works! OK, let us scrape the data from all books pages now.

In [153]:
books_data_fname = "books_data.csv"

# Records of all pages scraped so far. The DataFrame is rebuilt from this list
# after each page instead of being grown with ``pd.concat`` inside the loop.
all_books_data = []
books_df = None
page_number = 1
done = False
while not done:
    page = load_books_page(page_number)
    # The loop ends at the first page which cannot be loaded
    done = page is None
    if not done:
        print(f"\nLoaded page {page_number}\n")
        books_data = []
        parse_book_records_on_page(page, books_data, page_number=page_number)
        all_books_data.extend(books_data)
        books_df = pd.DataFrame(all_books_data)
        # Save results after each page
        print("\nStoring intermediate results to " + books_data_fname)
        books_df.to_csv(books_data_fname, index=False)
        page_number += 1
    else:
        print(f"\nCould not load page {page_number}. Terminating.")


Loaded page 9

Load detail page the-bridge-to-consciousness-im-writing-the-bridge-between-science-and-our-old-and-new-beliefs_840/index.html
Load detail page the-artists-way-a-spiritual-path-to-higher-creativity_839/index.html
Load detail page the-art-of-war_838/index.html
Load detail page the-argonauts_837/index.html
Load detail page the-10-entrepreneur-live-your-startup-dream-without-quitting-your-day-job_836/index.html
Load detail page suddenly-in-love-lake-haven-1_835/index.html
Load detail page something-more-than-this_834/index.html
Load detail page soft-apocalypse_833/index.html
Load detail page so-youve-been-publicly-shamed_832/index.html
Load detail page shoe-dog-a-memoir-by-the-creator-of-nike_831/index.html
Load detail page shobu-samurai-project-aryoku-3_830/index.html
Load detail page secrets-and-lace-fatal-hearts-1_829/index.html
Load detail page scarlett-epstein-hates-it-here_828/index.html
Load detail page romero-and-juliet-a-tragic-tale-of-love-and-zombies_827/index.ht

A more advanced alternative to `find` and `find_all` is to select elements with CSS selectors. This is supported by `select` (or `css.select`) for selecting all matches, or `select_one` for selecting the first match.
See https://www.crummy.com/software/BeautifulSoup/bs4/doc/#css-selectors for details. CSS selectors are more concise and more powerful, but maybe a bit harder to understand.

In [157]:
books_df.head()

Unnamed: 0,page_number,record_pos_on_page,title,num_stars,price,is_instock,description,upc,tax,num_available,num_reviews
0,1,1,A Light in the Attic,3,51.77,True,It's hard to imagine a world without A Light i...,a897fe39b1053632,0.0,22,0
1,1,2,Tipping the Velvet,1,53.74,True,"""Erotic and absorbing...Written with starling ...",90fa61229261140a,0.0,20,0
2,1,3,Soumission,1,50.1,True,"Dans une France assez proche de la nôtre, un h...",6957f44c3847a760,0.0,20,0
3,1,4,Sharp Objects,4,47.82,True,"WICKED above her hipbone, GIRL across her hear...",e00eb4fd7b871a48,0.0,20,0
4,1,5,Sapiens: A Brief History of Humankind,5,54.23,True,From a renowned historian comes a groundbreaki...,4165285e1663650f,0.0,20,0


In [158]:
books_df["description"].describe()

count                                                   998
unique                                                  998
top       It's hard to imagine a world without A Light i...
freq                                                      1
Name: description, dtype: object

In [161]:
books_df[books_df["description"].isna()]

Unnamed: 0,page_number,record_pos_on_page,title,num_stars,price,is_instock,description,upc,tax,num_available,num_reviews
160,9,1,The Bridge to Consciousness: I'm Writing the B...,3,32.0,True,,efc3768127714ec3,0.0,15,0
995,50,16,Alice in Wonderland (Alice's Adventures in Won...,1,55.53,True,,cd2a2a70dd5d176d,0.0,1,0


In [191]:
# Load one detail page and look at its description header <div>
page = load_page("rip-it-up-and-start-again_986/index.html")
item = page.select_one("div#product_description")
item

<div class="sub-header" id="product_description">
<h2>Product Description</h2>
</div>

In [197]:
item.next_sibling.next_sibling

<p>Punk's raw power rejuvenated rock, but by the summer of 1977 the movement had become a parody of itself. RIP IT UP AND START AGAIN is a celebration of what happened next.Post-punk bands like PiL, Joy Division, Talking Heads, The Fall and The Human League dedicated themselves to fulfilling punk's unfinished musical revolution. The post-punk groups were fervent modernists; w Punk's raw power rejuvenated rock, but by the summer of 1977 the movement had become a parody of itself. RIP IT UP AND START AGAIN is a celebration of what happened next.Post-punk bands like PiL, Joy Division, Talking Heads, The Fall and The Human League dedicated themselves to fulfilling punk's unfinished musical revolution. The post-punk groups were fervent modernists; whether experimenting with electronics and machine rhythm or adapting ideas from dub reggae and disco, they were totally confident they could invent a whole new future for music. ...more</p>