### **Scraping a book website**

First, go to this link:

https://books.toscrape.com/



For today, let's just focus on the music section:

https://books.toscrape.com/catalogue/category/books/music_14/index.html

Our goal will be to save the following information to a csv file for every book in the music section:
- Title
- Rating
- UPC
- Product Type
- Price (excl. tax)
- Price (incl. tax)
- Tax
- Availability
- Number of reviews




In [None]:
# installing libraries: you will only need to run this once
# can also be run in your terminal with: pip install requests beautifulsoup4 pandas

import sys
# the leading "!" tells the notebook to run the line as a shell command instead of Python code
# sys.executable is the path of the Python interpreter running this notebook, so the libraries
# are installed for the same Python that the notebook uses
!{sys.executable} -m pip install --user requests beautifulsoup4 pandas


In [None]:
# importing libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin
import re
import time

In [None]:
# step 1: make a request to the website and get the HTML content
url = "https://books.toscrape.com/catalogue/category/books/music_14/index.html" # this is the url to the website we want to scrape

# this sends a GET request to the website and stores the response in the variable "response"
# timeout=10 stops waiting if the server has not answered within 10 seconds, instead of hanging forever
response = requests.get(url, timeout=10)

print("status code:", response.status_code) # this prints the status code of the response. A status code of 200 means the request was successful, while a status code of 404 means the page was not found.

# when a server does not say which text encoding it uses, requests falls back to ISO-8859-1,
# which turns characters like "£" into "Â£". The page appears to be written in UTF-8, so we say so explicitly.
response.encoding = "utf-8"

html = response.text # this gets the HTML content of the page as a string and stores it in the variable "html"

print(html[:1000]) # this prints the first 1000 characters of the HTML content

Now go back to this link: https://books.toscrape.com/catalogue/category/books/music_14/index.html

1. Right click
2. Inspect
3. Click the top left icon that looks like a square with an arrow in the bottom right corner
4. Hover over the area for the first book
5. Click the corresponding html element that is highlighted
6. Copy it
7. Paste it below
8. Go back and hover over the lists of similar elements-- notice how each "product pod" is getting highlighted as you do



In [None]:
# it should look like this:
<li class="col-xs-6 col-sm-4 col-md-3 col-lg-3">
    <article class="product_pod">
            <div class="image_container">
                    <a href="../../../rip-it-up-and-start-again_986/index.html"><img src="../../../../media/cache/81/c4/81c4a973364e17d01f217e1188253d5e.jpg" alt="Rip it Up and Start Again" class="thumbnail"></a>
            </div>
                <p class="star-rating Five">
                    <i class="icon-star"></i>
                    <i class="icon-star"></i>
                    <i class="icon-star"></i>
                    <i class="icon-star"></i>
                    <i class="icon-star"></i>
                </p>
            <h3><a href="../../../rip-it-up-and-start-again_986/index.html" title="Rip it Up and Start Again">Rip it Up and ...</a></h3> # this is an important line-- it gives us the hyperlink to the page
            <div class="product_price">
        <p class="price_color">£35.02</p>
<p class="instock availability">
    <i class="icon-ok"></i>
        In stock
</p>
    <form>
        <button type="submit" class="btn btn-primary btn-block" data-loading-text="Adding...">Add to basket</button>
    </form>            
            </div>
    </article>
</li>

In [None]:
# collect the hyperlink of every book on the page so that we can visit each book's own page later
soup = BeautifulSoup(html, "html.parser") # parses the HTML string into a BeautifulSoup object that we can search and navigate
product_pods = soup.find_all("article", class_="product_pod") # every <article class="product_pod"> tag is one book on the page

# inside each product pod, the <a> tag within the <h3> tag stores the link to the book's page in its "href" attribute
# the list comprehension below visits every product pod and collects that "href" value into the list "links"
links = [pod.h3.a["href"] for pod in product_pods]

In [None]:
# how many links did we collect? this should match the number of books shown on the page
print(len(links))

In [None]:
# print every collected link on its own line so we can see what the "href" values look like
for link in links:
    print(link)

Notice that these are relative hyperlinks. If we tried to copy and paste these into a browser, they would not return a valid page.

But let's try clicking on the first book on the page and looking at the hyperlink structure: https://books.toscrape.com/catalogue/rip-it-up-and-start-again_986/index.html

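So if we replace the leading "../../../" in each of our hyperlinks with "https://books.toscrape.com/catalogue/", we get valid pages. The `urljoin` function used below does this for us by resolving each relative link against the URL of the page it came from.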


In [None]:
from urllib.parse import urljoin

# the address of the page the links were scraped from; every relative link gets resolved against it
page_url = "https://books.toscrape.com/catalogue/category/books/music_14/index.html"

# urljoin (from the urllib.parse library) combines the page url with a relative link to produce the full url of a book page
# "clean_links" ends up holding one full url for every entry in "links"
clean_links = [urljoin(page_url, relative_link) for relative_link in links]

# show the full urls, one per line
for link in clean_links:
    print(link)

# now we can loop through each of the clean links and make a request to each book page to get more information about each book, such as the title, price, stock availability, and star rating

In [None]:
# now let's make a request to the first link in our list and examine its html structure
first_book_url = clean_links[0]

# timeout=10 stops waiting if the server has not answered within 10 seconds, instead of hanging forever
response = requests.get(first_book_url, timeout=10)
print("status code:", response.status_code)

# when a server does not say which text encoding it uses, requests falls back to ISO-8859-1,
# which turns characters like "£" into "Â£". The page appears to be written in UTF-8, so we say so explicitly.
response.encoding = "utf-8"

book_html = response.text # the HTML of the book page as a string

book_soup = BeautifulSoup(book_html, "html.parser") # parse the HTML so we can search it

print(book_soup.prettify()) # prettify() returns the HTML with one tag per line and indentation, which is easier to read


In [None]:
# title: the text of the <h1> heading inside the <div> that has the class "product_main"
product_main = book_soup.find("div", class_="product_main")
title = product_main.find("h1").get_text(strip=True)

# rating: stored as a class on a <p> tag, e.g. class="star-rating Five"
# BeautifulSoup gives the classes back as a list (["star-rating", "Five"]), so the rating word is the second item
rating_tag = book_soup.find("p", class_="star-rating")
rating = rating_tag["class"][1]

print("title:", title)
print("rating:", rating)


In [None]:
# product information table
table_rows = book_soup.find("table", class_="table table-striped").find_all("tr")

product_info = {}
for row in table_rows:
    key = row.find("th").get_text(strip=True)
    value = row.find("td").get_text(strip=True)
    product_info[key] = value

print(table_rows)


In [None]:
# pairs of (label to print, key to look up in the product_info dictionary)
fields_to_show = [
    ("upc:", "UPC"),
    ("product type:", "Product Type"),
    ("price (excl. tax):", "Price (excl. tax)"),
    ("price (incl. tax):", "Price (incl. tax)"),
    ("tax:", "Tax"),
    ("availability:", "Availability"),
    ("number of reviews:", "Number of reviews"),
]

# .get() returns None instead of raising an error if a field is missing from the table
for label, field in fields_to_show:
    print(label, product_info.get(field))

In [None]:
# cleaning output a bit further

# --- clean price fields: extract numeric values ---
# the raw values contain a currency symbol in front of the number (e.g. "£35.02");
# this pattern matches only the run of digits and decimal points, which float() can then convert
number_pattern = r"[\d.]+"

price_excl_tax = float(
    re.search(number_pattern, product_info["Price (excl. tax)"]).group()
)

price_incl_tax = float(
    re.search(number_pattern, product_info["Price (incl. tax)"]).group()
)

tax = float(
    re.search(number_pattern, product_info["Tax"]).group()
)

# --- clean availability ---
availability_text = product_info["Availability"]

# availability flag: "y" when the text contains "In stock", otherwise "n"
available_flag = "y" if "In stock" in availability_text else "n"

# number available (extract integer from text of the form "(N available)")
# \s+ accepts any amount of whitespace between the number and the word "available",
# which matches the pattern used later when looping over every book
match = re.search(r"\((\d+)\s+available\)", availability_text)
number_available = int(match.group(1)) if match else None # None when no count is shown

# print cleaned results
print("price_excl_tax:", price_excl_tax)
print("price_incl_tax:", price_incl_tax)
print("tax:", tax)
print("available_flag:", available_flag)
print("number_available:", number_available)

In [None]:
rows = [] # one dictionary per book; each dictionary becomes one row of the final table
total = len(clean_links)

# patterns used for every book, defined once before the loop
number_pattern = r"[\d.]+" # a run of digits and decimal points, e.g. the "35.02" in "£35.02"
available_pattern = r"\((\d+)\s+available\)" # the number inside text of the form "(N available)"

for i, url in enumerate(clean_links, start=1): # loops through each clean link in our list
    print(f"scraping {i}/{total}: {url}")

    response = requests.get(url, timeout=10) # grabs the html for the url; stops waiting if the server has not answered within 10 seconds
    response.raise_for_status() # stops with an error if the page could not be fetched (e.g. status code 404)

    # when a server does not say which text encoding it uses, requests falls back to ISO-8859-1,
    # which garbles characters like "£". The pages appear to be written in UTF-8, so we say so explicitly.
    response.encoding = "utf-8"

    soup = BeautifulSoup(response.text, "html.parser")

    title = soup.find("div", class_="product_main").find("h1").get_text(strip=True) # title
    rating = soup.find("p", class_="star-rating")["class"][1] # rating (second class name, e.g. "Five")

    table_rows = soup.find("table", class_="table table-striped").find_all("tr") # product information table
    product_info = {} # maps each field name (<th>) to its value (<td>)
    for row in table_rows:
        key = row.find("th").get_text(strip=True)
        value = row.find("td").get_text(strip=True)
        product_info[key] = value

    price_excl_tax = float(re.search(number_pattern, product_info["Price (excl. tax)"]).group()) # cleans prices
    price_incl_tax = float(re.search(number_pattern, product_info["Price (incl. tax)"]).group()) # cleans prices
    tax = float(re.search(number_pattern, product_info["Tax"]).group()) # cleans taxes

    availability_text = product_info["Availability"]
    available_flag = "y" if "In stock" in availability_text else "n" # cleans availability
    match = re.search(available_pattern, availability_text)
    number_available = int(match.group(1)) if match else None # None when no count is shown

    rows.append({ # adds this book's information as one more row
        "book_url": url,
        "title": title,
        "rating": rating,
        "upc": product_info.get("UPC"),
        "product_type": product_info.get("Product Type"),
        "price_excl_tax": price_excl_tax,
        "price_incl_tax": price_incl_tax,
        "tax": tax,
        "available_flag": available_flag,
        "number_available": number_available,
        "number_of_reviews": product_info.get("Number of reviews"),
    })

    time.sleep(0.5) # waits half a second between requests so we do not flood the website

df_books = pd.DataFrame(rows) # turns the list of row dictionaries into a table (one column per key)
df_books

In [None]:
# save the table to a file called "books.csv" in the current working folder
# index=False leaves out the row numbers (0, 1, 2, ...) that pandas would otherwise write as the first column
df_books.to_csv("books.csv", index=False)