<a href="https://colab.research.google.com/github/osamanoor17/BeautifulSoup-Fundamentals/blob/main/BeautifulSoup_Fundamentals.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import required libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import logging

In [None]:
# Configure logging to write INFO-and-above records to scraping_log.log.
# force=True (Python 3.8+) replaces any handlers already attached to the root
# logger. Without it, basicConfig() silently does nothing when a handler is
# already present (hosted notebook kernels may pre-install one, and re-running
# this cell would otherwise be a no-op), so no records would reach the file.
logging.basicConfig(
    filename='scraping_log.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True
)

 Task 1 - Basic Book Scraping (First Page Only)

In [None]:
def scrape_first_page_books():
    """
    Scrapes book data from the first page of 'http://books.toscrape.com/'
    and saves it to 'books_basic.csv'.

    Book entries that cannot be parsed are logged as warnings and skipped.

    Returns:
        DataFrame: A pandas DataFrame with book title, price, availability,
        and star rating, or None if the page could not be fetched.
    """
    url = "http://books.toscrape.com/"
    columns = ['Title', 'Price', 'Availability', 'Star Rating']

    try:
        # A timeout keeps the cell from hanging forever on a stalled connection.
        response = requests.get(url, timeout=10)
    except requests.RequestException as exc:
        # Connection errors / timeouts follow the same "return None" contract
        # as a non-200 response instead of raising out of the function.
        logging.error(f"Failed to fetch page: {url} ({exc})")
        return None

    if response.status_code != 200:
        logging.error(f"Failed to fetch page: {url}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    book_list = []

    for book in soup.select('article.product_pod'):
        try:
            title = book.h3.a['title']
            price_text = book.select_one('p.price_color').text
            # Keep only digits and the decimal point. This drops the currency
            # prefix no matter how many characters it occupies (a mis-decoded
            # pound sign can be two characters, which a fixed [1:] slice
            # would not fully remove).
            price = float(''.join(ch for ch in price_text if ch in '0123456789.'))
            # The rating word (e.g. "Three") is the class that is not 'star-rating'.
            rating_classes = book.select_one('p.star-rating')['class']
            star = [name for name in rating_classes if name != 'star-rating'][0]
        except (AttributeError, KeyError, IndexError, TypeError, ValueError) as exc:
            logging.warning(f"Skipping malformed book entry on {url}: {exc!r}")
            continue

        # Select on the generic 'availability' class so a card without the
        # 'instock' class is still recorded rather than crashing on None.
        availability_tag = book.select_one('p.availability')
        availability = availability_tag.text.strip() if availability_tag else 'Unknown'

        book_list.append({
            'Title': title,
            'Price': price,
            'Availability': availability,
            'Star Rating': star
        })

    # Explicit columns keep the frame (and the CSV header) well-formed even
    # when no book entries were found.
    df = pd.DataFrame(book_list, columns=columns)
    df.to_csv("books_basic.csv", index=False)
    logging.info("First page scraped and saved to books_basic.csv")
    return df

# Run the function and preview the data.
# scrape_first_page_books() returns None when the page cannot be fetched, so
# guard the preview instead of failing with an AttributeError on None.
df_basic = scrape_first_page_books()
if df_basic is None:
    print("Scraping failed - check scraping_log.log for details.")
df_basic.head() if df_basic is not None else None


Unnamed: 0,Title,Price,Availability,Star Rating
0,A Light in the Attic,51.77,In stock,Three
1,Tipping the Velvet,53.74,In stock,One
2,Soumission,50.1,In stock,One
3,Sharp Objects,47.82,In stock,Four
4,Sapiens: A Brief History of Humankind,54.23,In stock,Five


Task 2 - Multi-page Scraping (First 3 Pages)

In [None]:
def scrape_multiple_pages(num_pages=3, delay=2):
    """
    Scrapes multiple pages from the book site and stores data in a CSV file.

    Pages that cannot be fetched and book entries that cannot be parsed are
    logged and skipped, so one failure does not discard the data already
    collected.

    Args:
        num_pages (int): Number of pages to scrape (default: 3)
        delay (float): Seconds to wait between consecutive requests
            (default: 2)

    Returns:
        DataFrame: A pandas DataFrame with book title, price, availability,
        and star rating for every scraped book (also written to
        'books_multipage.csv'). It has the columns but no rows when nothing
        could be scraped.
    """
    base_url = "http://books.toscrape.com/catalogue/page-{}.html"
    columns = ['Title', 'Price', 'Availability', 'Star Rating']
    all_books = []

    for page in range(1, num_pages + 1):
        # Rate limiting: pause before every request except the first. This
        # also throttles the request that follows a failed page and avoids a
        # pointless wait after the last page.
        if page > 1 and delay > 0:
            time.sleep(delay)

        url = base_url.format(page)
        logging.info(f"Scraping page {page}: {url}")

        try:
            # A timeout keeps the loop from hanging on a stalled connection.
            response = requests.get(url, timeout=10)
        except requests.RequestException as exc:
            logging.error(f"Failed to fetch page {page}: {exc}")
            continue

        if response.status_code != 200:
            logging.error(f"Failed to fetch page {page}")
            continue

        soup = BeautifulSoup(response.content, 'html.parser')

        for book in soup.select('article.product_pod'):
            try:
                title = book.h3.a['title']
                price_text = book.select_one('p.price_color').text
                # Keep only digits and the decimal point so the currency
                # prefix is dropped regardless of how many characters it
                # occupies (a fixed [1:] slice assumes exactly one).
                price = float(''.join(ch for ch in price_text if ch in '0123456789.'))
                # The rating word (e.g. "Three") is the class that is not
                # 'star-rating'.
                rating_classes = book.select_one('p.star-rating')['class']
                star = [name for name in rating_classes if name != 'star-rating'][0]
            except (AttributeError, KeyError, IndexError, TypeError, ValueError) as exc:
                logging.warning(f"Skipping malformed book entry on page {page}: {exc!r}")
                continue

            # Select on the generic 'availability' class so a card without
            # the 'instock' class is still recorded rather than crashing.
            availability_tag = book.select_one('p.availability')
            availability = availability_tag.text.strip() if availability_tag else 'Unknown'

            all_books.append({
                'Title': title,
                'Price': price,
                'Availability': availability,
                'Star Rating': star
            })

    # Explicit columns keep the frame (and the CSV header) well-formed even
    # when every page failed.
    df = pd.DataFrame(all_books, columns=columns)
    df.to_csv("books_multipage.csv", index=False)
    logging.info("Scraping of multiple pages completed and saved to books_multipage.csv")
    return df

# Scrape with the default num_pages=3 (this also writes books_multipage.csv),
# then preview the first rows of the combined result.
df_multi = scrape_multiple_pages()
df_multi.head()


Unnamed: 0,Title,Price,Availability,Star Rating
0,A Light in the Attic,51.77,In stock,Three
1,Tipping the Velvet,53.74,In stock,One
2,Soumission,50.1,In stock,One
3,Sharp Objects,47.82,In stock,Four
4,Sapiens: A Brief History of Humankind,54.23,In stock,Five


 Additional Feature - Data Summary

In [None]:
def generate_summary(df):
    """
    Generates and prints a summary of the scraped book data.

    Prints the total number of books, the available / unavailable counts,
    the star rating distribution, and the average price.

    Args:
        df (DataFrame): The DataFrame to summarize. It is read through the
            'Availability', 'Star Rating' and 'Price' columns. May be None
            or empty (for example after a failed scrape), in which case only
            a short notice is printed.
    """
    # An empty frame may have no columns at all, so bail out before any
    # column lookup.
    if df is None or df.empty:
        print("Total Books Scraped:", 0)
        print("No book data available to summarize.")
        return

    total_books = len(df)
    # na=False treats a missing availability value as "not in stock" instead
    # of putting NaN into the boolean mask.
    in_stock_mask = df['Availability'].str.contains('In stock', na=False)
    available_books = int(in_stock_mask.sum())
    unavailable_books = total_books - available_books
    star_distribution = df['Star Rating'].value_counts()
    average_price = df['Price'].mean()

    print("Total Books Scraped:", total_books)
    print("Available Books:", available_books)
    print("Unavailable Books:", unavailable_books)
    print("\nStar Rating Distribution:\n", star_distribution)
    print("\nAverage Book Price: {:.2f}".format(average_price))

# Print the summary statistics for the multi-page scrape result (df_multi)
generate_summary(df_multi)


Total Books Scraped: 60
Available Books: 60
Unavailable Books: 0

Star Rating Distribution:
 Star Rating
One      15
Five     14
Three    13
Four     10
Two       8
Name: count, dtype: int64

Average Book Price: 35.00
