Web scraping is the process of extracting data from websites. Beautiful Soup is a powerful Python library that makes parsing HTML and XML documents easy.

## Setup and Imports

In [3]:
!pip install bs4 requests pandas

Defaulting to user installation because normal site-packages is not writeable
Collecting bs4
  Downloading bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Collecting requests
  Using cached requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting pandas
  Downloading pandas-2.2.3-cp312-cp312-win_amd64.whl.metadata (19 kB)
Collecting beautifulsoup4 (from bs4)
  Downloading beautifulsoup4-4.12.3-py3-none-any.whl.metadata (3.8 kB)
Collecting charset-normalizer<4,>=2 (from requests)
  Downloading charset_normalizer-3.4.0-cp312-cp312-win_amd64.whl.metadata (34 kB)
Collecting idna<4,>=2.5 (from requests)
  Using cached idna-3.10-py3-none-any.whl.metadata (10 kB)
Collecting urllib3<3,>=1.21.1 (from requests)
  Downloading urllib3-2.2.3-py3-none-any.whl.metadata (6.5 kB)
Collecting certifi>=2017.4.17 (from requests)
  Using cached certifi-2024.8.30-py3-none-any.whl.metadata (2.2 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2024.2-py2.py3-none-any.whl.metadata (22 kB)


In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

## BeautifulSoup Basics: Parsing a Sample HTML Document

In [2]:
# Web Scraping Tutorial: Mastering BeautifulSoup for Data Extraction

## Introduction to Web Scraping with BeautifulSoup

import requests
from bs4 import BeautifulSoup
import pandas as pd

# The imports above require the following third-party packages.
# If any of them is missing, install them with:
# pip install requests beautifulsoup4 pandas

### Understanding HTML Structure and BeautifulSoup Basics

def explain_html_parsing():
    """
    Demonstrate BeautifulSoup's core lookup techniques on a small HTML sample.

    Parses an inline HTML snippet containing two product ``<div>`` elements
    and prints the result of four kinds of search: by tag name, by CSS class,
    by ``id`` attribute, and a nested search inside each product container.

    Returns:
        None. All results are written to standard output.
    """
    # Sample HTML to demonstrate parsing techniques
    sample_html = """
    <html>
        <body>
            <div class="product" id="item1">
                <h2 class="product-title">Smartphone X</h2>
                <p class="price">$499.99</p>
                <span class="brand">TechBrand</span>
            </div>
            <div class="product" id="item2">
                <h2 class="product-title">Laptop Pro</h2>
                <p class="price">$899.99</p>
                <span class="brand">ComputerCo</span>
            </div>
        </body>
    </html>
    """

    # Create BeautifulSoup object
    soup = BeautifulSoup(sample_html, 'html.parser')

    # Finding elements by tag
    all_divs = soup.find_all('div')
    print("All div elements:", [div.get('id') for div in all_divs])

    # Finding elements by class (`class_` because `class` is a Python keyword)
    product_titles = soup.find_all(class_='product-title')
    print("Product Titles:", [title.text for title in product_titles])

    # Finding elements by ID
    specific_product = soup.find(id='item1')
    print("Specific Product Title:", specific_product.find('h2').text)

    # Nested searching: look up the price paragraph inside each product div
    prices = [
        div.find('p', class_='price').text
        for div in soup.find_all('div', class_='product')
    ]
    print("Product Prices:", prices)






In [3]:
# Practical Web Scraping Example: Scraping Jumia Morocco

def scrape_jumia_products(url='https://www.jumia.ma/catalog/?q=phones-tablets', timeout=10):
    """
    Scrape product listings from a Jumia Morocco catalog page.

    Downloads the page, locates every product ``<article>`` and extracts its
    title, price, brand and link.

    Args:
        url: Page to fetch. Defaults to the Jumia Morocco catalog search for
            the query ``phones-tablets``.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        A ``pandas.DataFrame`` with the columns ``Title``, ``Price``,
        ``Brand`` and ``Link`` (one row per successfully parsed product; the
        columns are present even when no product was found), or ``None`` if
        the HTTP request failed.
    """
    # Headers to mimic browser request and avoid potential blocking
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0'
    }
    columns = ['Title', 'Price', 'Brand', 'Link']

    try:
        # Without a timeout, requests may wait forever on an unresponsive
        # server. A timeout error is a RequestException, so it is handled below.
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()  # Raise an exception for bad status codes
    except requests.RequestException as req_error:
        print(f"Request error occurred: {req_error}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # A CSS selector matches articles carrying all four classes regardless of
    # their order or of extra classes; find_all(class_='prd _fb col c-prd')
    # would only match that exact attribute string.
    product_articles = soup.select('article.prd._fb.col.c-prd')

    products = []
    for product in product_articles:
        try:
            # The title is mandatory: a missing <h3 class="name"> raises
            # AttributeError and the product is skipped.
            title = product.find('h3', class_='name').text.strip()

            # Find price (handling potential missing price)
            price_elem = product.find('div', class_='prc')
            price = price_elem.text.strip() if price_elem else 'Price Not Available'

            # Find brand (if available)
            brand_elem = product.find('div', class_='brand')
            brand = brand_elem.text.strip() if brand_elem else 'Unknown Brand'

            # Extract product link (KeyError if the anchor has no href)
            link_elem = product.find('a', class_='core')
            product_link = link_elem['href'] if link_elem else 'No Link'
        except (AttributeError, KeyError) as detail_error:
            print(f"Error parsing individual product: {detail_error}")
            continue

        products.append({
            'Title': title,
            'Price': price,
            'Brand': brand,
            'Link': product_link
        })

    # Passing `columns` keeps the schema stable when nothing was scraped
    return pd.DataFrame(products, columns=columns)

# Advanced Parsing Techniques

In [4]:
def advanced_parsing_techniques():
    """
    Demonstrate CSS selectors and nested extraction with BeautifulSoup.

    Parses an inline HTML catalog with two ``<article>`` elements, prints the
    names of the articles whose ``data-category`` attribute is
    ``electronics``, then prints a name/specs summary for every article.

    Returns:
        None. All results are written to standard output.
    """
    # Sample complex HTML
    complex_html = """
    <div class="catalog">
        <section class="products">
            <article data-category="electronics">
                <h2>Smart Watch</h2>
                <ul class="specs">
                    <li>Color: Black</li>
                    <li>Battery: 500mAh</li>
                </ul>
            </article>
            <article data-category="computers">
                <h2>Gaming Laptop</h2>
                <ul class="specs">
                    <li>RAM: 16GB</li>
                    <li>Processor: Intel i7</li>
                </ul>
            </article>
        </section>
    </div>
    """

    soup = BeautifulSoup(complex_html, 'html.parser')

    # CSS selector usage: attribute selector on the custom data-category attribute
    electronics = soup.select('article[data-category="electronics"]')
    print("Electronics Products:", [elem.find('h2').text for elem in electronics])

    # Extracting nested information
    def extract_specs(article):
        """Return the article's heading text and the text of each <li> inside it."""
        return {
            'Name': article.find('h2').text,
            'Specs': [spec.text for spec in article.find_all('li')]
        }

    all_products = [extract_specs(article) for article in soup.find_all('article')]
    print("Detailed Products:", all_products)

In [6]:

# Handling Common Web Scraping Challenges

def web_scraping_best_practices():
    """
    Print a one-line banner for the web scraping best-practices checklist.

    The checklist itself is kept in the comments below; the only thing this
    function executes is the final print.
    """
    # Checklist:
    #   1. Respect robots.txt
    #   2. Add delays between requests
    #   3. Use proper error handling
    #   4. Rotate User-Agents
    #   5. Handle various HTML structures gracefully
    banner = "Web Scraping Best Practices Demonstrated in Code"
    print(banner)

# Main Execution
if __name__ == '__main__':
    # Run the BeautifulSoup basics demo; it only parses an inline HTML sample
    explain_html_parsing()

    # Optional demos, disabled by default. The Jumia scraper sends a live
    # HTTP request, so be mindful of the website's terms of service before
    # uncommenting it.
    # jumia_products = scrape_jumia_products()
    # print(jumia_products)

    # advanced_parsing_techniques()
    # web_scraping_best_practices()

# Note: Always check website's robots.txt and terms of service before scraping
# Ensure you have permission or are complying with legal and ethical guidelines

All div elements: ['item1', 'item2']
Product Titles: ['Smartphone X', 'Laptop Pro']
Specific Product Title: Smartphone X
Product Prices: ['$499.99', '$899.99']
