In [203]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin 

def scrape_books(min_rating, max_price):
    url = 'https://books.toscrape.com/'
    response = requests.get(url)
    
    soup = BeautifulSoup(response.content, 'html.parser')
    grid = soup.find('ol', attrs={'class':'row'})
    books_main = grid.find_all('li', attrs={'class': 'col-xs-6 col-sm-4 col-md-3 col-lg-3'})

    def get_price(book):
        book_price = book.find('p', attrs={'class': 'price_color'}).get_text().strip().replace('£', '')
        return float(book_price)

    def get_rating(book):
        rating_mapping = {
            'One': 1,
            'Two': 2,
            'Three': 3,
            'Four': 4,
            'Five': 5
        }
        rating_tag = book.find('p', class_='star-rating')
        if rating_tag:
            classes = rating_tag['class']
            rating_text = classes[1]
            return rating_mapping.get(rating_text, None)
        return None

    def get_stock(book):
        book_available = book.find('i', class_='icon-ok')
        return 'In Stock' if book_available else 'No'

    def get_url(book):
        domain = 'https://books.toscrape.com'
        url = book.find('a')['href']
        full_url = urljoin(domain, url)  
        return full_url

    def get_upc(book_soup):
        upc_tag = book_soup.find('th', string='UPC')
        if upc_tag:
            upc_value = upc_tag.find_next_sibling('td').text.strip()
            return upc_value
        return None

    def get_title(book_soup):
        title_tag = book_soup.find('h1')
        return title_tag.text.strip() if title_tag else None

    def get_genre(book_soup):
        breadcrumb_links = book_soup.select('ul.breadcrumb a')
        if len(breadcrumb_links) > 1:
            genre_tag = breadcrumb_links[-1]
            genre = genre_tag.text.strip()
            return genre
        return None

    def get_description(book_soup):
        description_tag = book_soup.find('div', id='product_description')
        if description_tag:
            description_para = description_tag.find_next_sibling('p')
            return description_para.text.strip() if description_para else 'No description available'
        return 'No description available'

    def get_detail_data(book_soup):
        upc = get_upc(book_soup)
        title = get_title(book_soup)
        genre = get_genre(book_soup)
        description = get_description(book_soup)
        return upc, title, genre, description

    # Create a dictionary to store book details
    books_data = {}
    index = 0

    # Loop through each book element on the main page
    for book in books_main:
        price = get_price(book)
        rating = get_rating(book)

        # Apply filtering conditions
        if rating is not None and price is not None and rating >= min_rating and price <= max_price:
            stock = get_stock(book)
            book_url = get_url(book)
            
            # Fetch details from the detail page
            detail_response = requests.get(book_url)
            book_soup = BeautifulSoup(detail_response.content, 'html.parser')
            
            upc, title, genre, description = get_detail_data(book_soup)

            # Store data for each book
            books_data[index] = {
                'Title': title,
                'Genre': genre,
                'Rating': rating,
                'Price (£)': price,
                'UPC': upc,
                'Availability': stock,
                'Description': description
            }
            index += 1

    # Convert the dictionary to a DataFrame
    df_books = pd.DataFrame.from_dict(books_data, orient='index')

    return df_books

df = scrape_books(4.0, 20.0)
display(df)

Unnamed: 0,Title,Genre,Rating,Price (£),UPC,Availability,Description
0,Set Me Free,Young Adult,5,17.46,ce6396b0f23f6ecc,In Stock,Aaron Ledbetter’s future had been planned out ...


In [183]:
url = 'https://books.toscrape.com/'
response = requests.get(url)

In [18]:
Expected Outcome

A function named scrape_books that takes two parameters: min_rating and max_price. 
The function should scrape book data from the "Books to Scrape" website and return a pandas DataFrame with the following columns:

Expected Outcome

A function named scrape_books that takes two parameters: min_rating and max_price.

The function should return a DataFrame with the following columns:

#UPC: The Universal Product Code (UPC) of the book. # inside book page
#Title: The title of the book. # inside booke page
#Price (£): The price of the book in pounds. # main page
#Rating: The rating of the book (1-5 stars). # main page
#Genre: The genre of the book. # inside book page
#Availability: Whether the book is in stock or not. # main page
#Description: A brief description or product description of the book (if available).

You will execute this script to scrape data for books with a minimum rating of 4.0 and above and a maximum price of £20.

Next, think about how you can set parameters for your data extraction:

Minimum Rating: Focus on books with a rating of 4.0 and above.
Maximum Price: Filter for books priced up to £20.

SyntaxError: invalid character '£' (U+00A3) (2785871831.py, line 14)

In [184]:
soup = BeautifulSoup(response.content)
grid = soup.find('ol', attrs = {'class':'row'} )
books_main = grid.find_all ('li', attrs = {'class': 'col-xs-6 col-sm-4 col-md-3 col-lg-3'} )


In [185]:
def get_price (book):
    book_price = book.find('p', attrs = {'class': 'price_color'} ).get_text().strip().replace('£','')
    return float(book_price)

51.77

In [186]:
def get_rating(book):
    rating_mapping = {
        'One': 1,
        'Two': 2,
        'Three': 3,
        'Four': 4,
        'Five': 5}
    
    rating_tag = book.find('p', class_='star-rating')
    if rating_tag:
        classes = rating_tag['class']
        rating_text = classes[1]
        return rating_mapping.get(rating_text, None)
    
    return None 


In [187]:
def get_stock(book):
    book_available = book.find('i', class_='icon-ok')
    if book_available:
        return 'In Stock'
    return 'No'
    

In [188]:
def get_url(book):
    domain = 'https://books.toscrape.com'
    url = book.find('a')['href']
    full_url = urljoin(domain, url)  
    return full_url


In [189]:
def get_upc(book_soup):
    upc_tag = book_soup.find('th', string='UPC')
    if upc_tag:
        upc_value = upc_tag.find_next_sibling('td').text.strip()
        return upc_value
    return None 

In [190]:

def get_title(book_soup):
    title_tag = book_soup.find('h1').text.strip()
    return title_tag


In [191]:
def get_genre(book_soup):
    breadcrumb_links = book_soup.select('ul.breadcrumb a')
    if len(breadcrumb_links) > 1:
        genre_tag = breadcrumb_links[-1]
        genre = genre_tag.text.strip()
        return genre
    return None

In [192]:
def get_detail_data(book_soup):
    upc = get_upc(book_soup)
    title = get_title(book_soup)
    genre = get_genre(book_soup)
    description = get_description(book_soup)
    return upc, title, genre, description

In [193]:
def get_description(book_soup):
    description_tag = book_soup.find('div', id='product_description')
    if description_tag:
        description_para = description_tag.find_next_sibling('p')
        return description_para.text.strip() if description_para else 'No description available'
    return 'No description available'

In [197]:
# Define filtering criteria
min_rating = 4.0
max_price = 20.0

# Create a dictionary to store book details
books_data = {}
index = 0


# Loop through each book element on the main page
for book in books_main:
    price = get_price(book)
    rating = get_rating(book)

    # Apply filtering conditions
    if rating is not None and price is not None and rating >= min_rating and price <= max_price:
        stock = get_stock(book)
        book_url = get_url(book)
        
        # Fetch details from the detail page
        detail_response = requests.get(book_url)
        book_soup = BeautifulSoup(detail_response.content, 'html.parser')  # Added 'html.parser' for clarity
        
        upc, title, genre, description = get_detail_data(book_soup)

        # Store data for each book
        books_data[index] = {
            'Title': title,
            'Genre': genre,
            'Rating': rating,
            'Price (£)': price,
            'UPC': upc,
            'Availability': stock,
            'Description': description
        }
        index += 1

# Convert the dictionary to a DataFrame
df_books = pd.DataFrame.from_dict(books_data, orient='index')

# Display the DataFrame
display(df_books)

Unnamed: 0,Title,Genre,Rating,Price (£),UPC,Availability,Description
0,Set Me Free,Young Adult,5,17.46,ce6396b0f23f6ecc,In Stock,Aaron Ledbetter’s future had been planned out ...


In [None]:
 # Find the next page link and update the base_url
    next_button = soup.find('li', class_='next')
    if next_button:
        next_page_url = next_button.find('a')['href']
        url = urljoin(url, next_page_url)
    else:
        break



while True:
    response = requests.get(url)
    soup = BeautifulSoup(response.content)