In [7]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import pandas as pd

base_url = "https://books.toscrape.com/catalogue/category/books_1/index.html"

def get_soup(url):
    res = requests.get(url)
    res.raise_for_status()
    return BeautifulSoup(res.text, "html.parser")

def get_price(book):
    price_text = book.find("p", class_="price_color").get_text()
    price_clean = ''.join(ch for ch in price_text if ch.isdigit() or ch == '.' or ch == '-')
    return float(price_clean)

def get_title(soup):
    return soup.find("h1").text.strip()

def star_rating(book):
    rating_tag = book.find("p", class_="star-rating")
    classes = rating_tag.get("class", [])
    return classes[1] if len(classes) > 1 else "No rating"

def get_UPC(soup):
    table = soup.find("table", class_="table table-striped")
    for row in table.find_all("tr"):
        if row.find("th").text.strip() == "UPC":
            return row.find("td").text.strip()
    return None

def get_genre(soup):
    ul = soup.find("ul", class_="breadcrumb")
    links = ul.find_all("a")
    if len(links) > 2:
        return links[2].text.strip()
    return None

def description(soup):
    desc_header = soup.find("div", id="product_description")
    if desc_header:
        desc_paragraph = desc_header.find_next_sibling("p")
        return desc_paragraph.text.strip()
    return None

def get_book_info(book):
    relative_link = book.find("h3").find("a")["href"]
    book_url = urljoin(base_url, relative_link)
    book_soup = get_soup(book_url)

    return {
        "title": get_title(book_soup),
        "price": get_price(book),
        "rating": star_rating(book),
        "UPC": get_UPC(book_soup),
        "genre": get_genre(book_soup),
        "description": description(book_soup),
        "url": book_url
    }

def scrape_all_books():
    page_url = base_url
    all_books = []

    while page_url:
        soup = get_soup(page_url)
        books = soup.select("article.product_pod")

        for book in books:
            info = get_book_info(book)  
            all_books.append(info)

        next_btn = soup.select_one("li.next a")
        if next_btn:
            next_href = next_btn['href']
            page_url = urljoin(page_url, next_href)
        else:
            page_url = None

    return all_books

# Run the scraper and get all data
all_books_data = scrape_all_books()


In [9]:
print("Rows in df:", len(df))
print("Columns:", df.columns.tolist())

NameError: name 'df' is not defined

In [11]:
print(f"Total books scraped: {len(all_books_data)}")


Total books scraped: 1000


In [17]:
df = pd.DataFrame(all_books_data)
print(df.head())
print("Rows in df:", len(df))
print("Columns:", df.columns.tolist())

                                   title  price rating               UPC  \
0                   A Light in the Attic  51.77  Three  a897fe39b1053632   
1                     Tipping the Velvet  53.74    One  90fa61229261140a   
2                             Soumission  50.10    One  6957f44c3847a760   
3                          Sharp Objects  47.82   Four  e00eb4fd7b871a48   
4  Sapiens: A Brief History of Humankind  54.23   Five  4165285e1663650f   

                genre                                        description  \
0              Poetry  It's hard to imagine a world without A Light i...   
1  Historical Fiction  "Erotic and absorbing...Written with starling ...   
2             Fiction  Dans une France assez proche de la nÃ´tre, un ...   
3             Mystery  WICKED above her hipbone, GIRL across her hear...   
4             History  From a renowned historian comes a groundbreaki...   

                                                 url  
0  https://books.toscrape.com/c

In [15]:
rating_map = {
    "One": 1,
    "Two": 2,
    "Three": 3,
    "Four": 4,
    "Five": 5
}

df["rating_num"] = df["rating"].map(rating_map)

filtered_df = df[(df["price"] > 20) & (df["rating_num"] >= 4)]

print(filtered_df)


                                                 title  price rating  \
3                                        Sharp Objects  47.82   Four   
4                Sapiens: A Brief History of Humankind  54.23   Five   
6    The Dirty Little Secrets of Getting Your Dream...  33.34   Four   
8    The Boys in the Boat: Nine Americans and Their...  22.60   Four   
11                               Shakespeare's Sonnets  20.66   Four   
..                                                 ...    ...    ...   
989                                       Bright Lines  39.07   Five   
991                      Bounty (Colorado Mountain #7)  37.26   Four   
993  Bleach, Vol. 1: Strawberry and the Soul Reaper...  34.65   Five   
996   Ajin: Demi-Human, Volume 1 (Ajin: Demi-Human #1)  57.06   Four   
999                 1,000 Places to See Before You Die  26.08   Five   

                  UPC           genre  \
3    e00eb4fd7b871a48         Mystery   
4    4165285e1663650f         History   
6    2597b5a

In [19]:
print(filtered_df.head())

                                                title  price rating  \
3                                       Sharp Objects  47.82   Four   
4               Sapiens: A Brief History of Humankind  54.23   Five   
6   The Dirty Little Secrets of Getting Your Dream...  33.34   Four   
8   The Boys in the Boat: Nine Americans and Their...  22.60   Four   
11                              Shakespeare's Sonnets  20.66   Four   

                 UPC     genre  \
3   e00eb4fd7b871a48   Mystery   
4   4165285e1663650f   History   
6   2597b5a345f45e1b  Business   
8   e10e1e165dc8be4a   Default   
11  30a7f60cd76ca58c    Poetry   

                                          description  \
3   WICKED above her hipbone, GIRL across her hear...   
4   From a renowned historian comes a groundbreaki...   
6   Drawing on his extensive experience evaluating...   
8   For readers of Laura Hillenbrand's Seabiscuit ...   
11  This book is an important and complete collect...   

                         