
# Web Scraping Example with BeautifulSoup + Functions & Visualization

## Objective:
This notebook demonstrates how to:
- Encapsulate scraping logic in functions
- Parse HTML content with BeautifulSoup
- Extract structured data (book titles, prices, availability, ratings)
- Store the data in a pandas DataFrame
- Perform data cleaning and transformations in a separate function
- Use basic data visualization to explore the results

Although the data here is about books, these techniques apply to many domains, including the healthcare industry. The skills demonstrated—web scraping, data cleaning, and visualization—are highly relevant to data-driven roles across industries.


In [None]:

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
import matplotlib.pyplot as plt
import seaborn as sns

# Ensure plots render inline for Jupyter Notebooks
%matplotlib inline


In [None]:

# Matches the numeric part of a price string such as "£51.77".
_PRICE_PATTERN = re.compile(r"\d+(?:\.\d+)?")


def _parse_book(book):
    """Extract title, price, availability and rating from one 'product_pod' tag."""
    title = book.h3.a['title']

    # Pull the number out with a regex so that any currency symbol (or stray
    # mis-decoded character in front of it) is ignored.
    price_tag = book.find('p', class_='price_color')
    price_match = (
        _PRICE_PATTERN.search(price_tag.text.replace(',', ''))
        if price_tag is not None else None
    )
    price_value = float(price_match.group()) if price_match else None

    availability_tag = book.find('p', class_='instock availability')
    availability = (
        availability_tag.text.strip() if availability_tag is not None else None
    )

    # The rating is encoded as a second CSS class next to 'star-rating'
    # (e.g. class="star-rating Three"); take whichever class is not the marker.
    rating_tag = book.find('p', class_='star-rating')
    rating_classes = rating_tag.get('class', []) if rating_tag is not None else []
    rating = next((cls for cls in rating_classes if cls != 'star-rating'), None)

    return {
        'title': title,
        'price': price_value,
        'availability': availability,
        'rating': rating,
    }


def scrape_books(num_pages=5, delay=1.0, timeout=10):
    """
    Scrapes books data from 'http://books.toscrape.com' for the specified number of pages.

    Pages that cannot be retrieved (network error or non-200 status) are
    reported with a printed message and skipped; data from the other pages
    is still returned.

    Parameters:
        num_pages (int): Number of pages to scrape, starting at page 1.
            A value of 0 or less performs no requests.
        delay (float): Seconds to wait between consecutive page requests.
        timeout (float): Seconds to wait for each HTTP response before
            treating the page as failed.

    Returns:
        pd.DataFrame: One row per book with the columns 'title', 'price',
        'availability' and 'rating'. Fields that could not be found on the
        page are left missing. The columns are present even when no book
        was scraped.
    """
    base_url = "http://books.toscrape.com/catalogue/page-{}.html"
    columns = ['title', 'price', 'availability', 'rating']
    records = []

    for page in range(1, num_pages + 1):
        # Pause *before* every request except the first, so the function
        # does not sleep needlessly after the final page.
        if page > 1:
            time.sleep(delay)

        try:
            response = requests.get(base_url.format(page), timeout=timeout)
        except requests.RequestException as exc:
            print(f"Failed to retrieve page {page}: {exc}")
            continue

        if response.status_code != 200:
            print(f"Failed to retrieve page {page}")
            continue

        # Hand BeautifulSoup the raw bytes so it can detect the document's
        # own encoding instead of relying on the HTTP-header guess used by
        # response.text.
        soup = BeautifulSoup(response.content, 'html.parser')
        records.extend(
            _parse_book(book)
            for book in soup.find_all('article', class_='product_pod')
        )

    return pd.DataFrame(records, columns=columns)


In [None]:

def clean_data(df):
    """
    Cleans and transforms the scraped DataFrame.
    - Maps textual ratings ('One' .. 'Five') to numeric values 1 .. 5
    - Prints the per-column count of missing values
    - Drops rows whose rating could not be mapped

    The input DataFrame is not modified; a new DataFrame is returned.

    Parameters:
        df (pd.DataFrame): The raw scraped DataFrame. Must contain a
            'rating' column.

    Returns:
        pd.DataFrame: A copy of the input with an added integer
        'numeric_rating' column, without the rows that had an
        unmapped or missing rating.
    """
    rating_map = {'One': 1, 'Two': 2, 'Three': 3, 'Four': 4, 'Five': 5}

    # assign() builds a new frame, so the caller's DataFrame is left untouched
    # and re-running the calling cell stays idempotent.
    with_rating = df.assign(numeric_rating=df['rating'].map(rating_map))

    print("Missing Values Count before cleanup:")
    print(with_rating.isnull().sum())

    cleaned = with_rating.dropna(subset=['numeric_rating'])

    # map() yields floats whenever any rating was unmapped; after dropping
    # those rows the column is cast so its dtype is always integer.
    return cleaned.astype({'numeric_rating': 'int64'})


In [None]:

def _remove_legend(ax):
    """Remove the legend from ax, if one was drawn."""
    legend = ax.get_legend()
    if legend is not None:
        legend.remove()


def visualize_data(df):
    """
    Creates basic visualizations for the scraped and cleaned data:
    a price histogram, a count of books per rating, and a box plot of
    price by rating.

    Parameters:
        df (pd.DataFrame): The cleaned DataFrame. Must contain the
            'price' and 'numeric_rating' columns. If it has no rows,
            a message is printed and nothing is plotted.
    """
    if df.empty:
        print("No data to visualize.")
        return

    fig, ax = plt.subplots(figsize=(10, 5))
    sns.histplot(df['price'], kde=True, ax=ax)
    ax.set(title="Distribution of Book Prices", xlabel="Price", ylabel="Count")
    plt.show()

    # Newer seaborn releases deprecate `palette` without `hue`, so the x
    # variable is also passed as `hue`. dodge=False keeps the bars/boxes
    # centred on their category, and the legend (which would only repeat
    # the x axis) is removed.
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.countplot(
        x='numeric_rating', hue='numeric_rating', data=df,
        palette='viridis', dodge=False, ax=ax,
    )
    _remove_legend(ax)
    ax.set(title="Count of Ratings", xlabel="Rating (1-5)", ylabel="Count")
    plt.show()

    fig, ax = plt.subplots(figsize=(10, 5))
    sns.boxplot(
        x='numeric_rating', y='price', hue='numeric_rating', data=df,
        palette='magma', dodge=False, ax=ax,
    )
    _remove_legend(ax)
    ax.set(title="Price vs. Numeric Rating", xlabel="Numeric Rating", ylabel="Price")
    plt.show()


In [None]:

# Number of catalogue pages to scrape; tune here.
NUM_PAGES = 5

# Step 1: Scrape the data
# The raw result keeps its own name so the steps below can be re-run
# without hitting the network again.
raw_df = scrape_books(num_pages=NUM_PAGES)

# Step 2: Clean and transform the data
df = clean_data(raw_df)

# Step 3: Explore the cleaned data
print("DataFrame Head:")
display(df.head())

print("\nSummary Statistics on Price:")
display(df['price'].describe())

print("\nValue Counts for Ratings:")
display(df['rating'].value_counts())

# Step 4: Visualize the data
visualize_data(df)
