## Comprehensive Web Scraping in Python
Web scraping involves extracting data from websites. Here's a complete guide covering all relevant functions, libraries, and techniques.

### Table of Contents
[Essential Libraries](#Essential-Libraries)

[Basic Scraping with Requests and BeautifulSoup](#Basic-Scraping-with-Requests-and-BeautifulSoup)

[Dynamic Content with Selenium](#Dynamic-Content-with-Selenium)

[API Scraping](#API-Scraping)

[Handling Common Challenges](#Handling-Common-Challenges)

[Best Practices & Ethics](#best-practices--ethics)

[Complete Examples](#complete-examples)





## Essential Libraries

### Core scraping libraries
```
import requests
from bs4 import BeautifulSoup
import selenium.webdriver as webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
```

### Data handling
```
import pandas as pd
import json
import csv
```

### Utilities
```
import time
import random
import re
from urllib.parse import urljoin, urlparse
import os
from fake_useragent import UserAgent
```

## Basic Scraping with Requests and BeautifulSoup

1. Making HTTP Requests
```
import requests
from bs4 import BeautifulSoup

def simple_get_request(url, timeout=10):
    """Fetch ``url`` with a GET request and return the response body as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` may block indefinitely on a silent host.

    Returns:
        The decoded response body, or ``None`` if the request failed
        (connection error, timeout, or a 4xx/5xx status).
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raises HTTPError for 4xx/5xx responses
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None
    return response.text
```

#### Example usage
`html_content = simple_get_request("https://httpbin.org/html")`

2. Advanced Request Configuration

```
def advanced_get_request(url, headers=None, params=None, timeout=10):
    """Advanced GET request with headers and parameters"""
    
    # Default headers
    default_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive',
    }
    
    if headers:
        default_headers.update(headers)
    
    try:
        response = requests.get(
            url,
            headers=default_headers,
            params=params,
            timeout=timeout
        )
        response.raise_for_status()
        return response
    except requests.exceptions.RequestException as e:
        print(f"Error: {e}")
        return None
```

#### Example with parameters and headers
```
url = "https://httpbin.org/get"
params = {'key1': 'value1', 'key2': 'value2'}
headers = {'User-Agent': 'My Custom Agent 1.0'}

response = advanced_get_request(url, headers=headers, params=params)
if response:
    print(response.json())
```

3. BeautifulSoup Parsing

```
def parse_html(html_content, parser='html.parser'):
    """Build a BeautifulSoup tree from ``html_content``.

    ``parser`` names the parser backend handed to BeautifulSoup; it defaults
    to ``'html.parser'``.
    """
    return BeautifulSoup(html_content, parser)

def extract_data(soup):
    """Collect common pieces of a parsed page into a dictionary.

    Returns a dict with the page title, every link that has an ``href``,
    the text of every paragraph, the text of elements carrying the
    ``specific-class`` CSS class, and the text of the ``main-content``
    element (empty string when that element is missing).
    """
    # Page title, with a fallback when the document has no <title>
    page_title = soup.title.string if soup.title else "No title"

    # Only anchors that actually carry an href attribute
    anchors = [
        {'text': anchor.get_text(strip=True), 'url': anchor['href']}
        for anchor in soup.find_all('a', href=True)
    ]

    paragraph_texts = []
    for paragraph in soup.find_all('p'):
        paragraph_texts.append(paragraph.get_text(strip=True))

    # Elements selected by CSS class
    class_matches = soup.find_all(class_='specific-class')
    class_texts = [node.get_text(strip=True) for node in class_matches]

    # Element selected by ID; it may be absent
    main_node = soup.find(id='main-content')
    if main_node:
        main_text = main_node.get_text(strip=True)
    else:
        main_text = ""

    return {
        'title': page_title,
        'links': anchors,
        'paragraphs': paragraph_texts,
        'specific_elements': class_texts,
        'main_content': main_text
    }
```

#### Complete example
```
def scrape_website(url):
    """Download ``url``, parse it and return the extracted data.

    Returns ``None`` when the download yields nothing.
    """
    html = simple_get_request(url)
    if not html:
        return None
    return extract_data(parse_html(html))
```

#### Usage
```
data = scrape_website("https://httpbin.org/html")
print(data)
```
4. Finding Elements with BeautifulSoup

```
def comprehensive_element_finding(soup):
    """Demonstrate various ways to find elements.

    Most lookups below exist only to illustrate the API: their results are
    assigned to local names and not used further. Only the values placed in
    the returned dictionary leave the function.

    Returns:
        A dict with the ``href`` of every link, the ``alt`` text of images
        that have one, the text of links whose string matches "contact", and
        the ``href`` of links inside ``<nav>``.
    """
    
    # Find by tag name
    all_links = soup.find_all('a')
    all_images = soup.find_all('img')
    
    # Find by class (class_ with a trailing underscore: "class" is a keyword)
    news_items = soup.find_all('div', class_='news-item')
    featured = soup.find_all(class_='featured')
    
    # Find by ID (find returns a single element or None)
    header = soup.find(id='header')
    footer = soup.find(id='footer')
    
    # Find by attributes (alt=True matches any img that has an alt attribute)
    images_with_alt = soup.find_all('img', alt=True)
    links_with_target = soup.find_all('a', target='_blank')
    
    # Find by text content (re.I makes the match case-insensitive)
    contact_links = soup.find_all('a', string=re.compile('contact', re.I))
    price_elements = soup.find_all(string=re.compile(r'\$\d+'))
    
    # CSS selectors (select returns a list, select_one the first match or None)
    navigation_links = soup.select('nav a')
    first_paragraph = soup.select_one('p:first-of-type')
    specific_divs = soup.select('div.container > div.row > div.col')
    
    # Find parent, children, siblings (only when the header element exists)
    if header:
        parent = header.find_parent()
        # recursive=False limits the search to direct children
        children = header.find_all(recursive=False)
        next_sibling = header.find_next_sibling()
        previous_sibling = header.find_previous_sibling()
    
    return {
        'all_links': [link.get('href') for link in all_links],
        'images_with_alt': [img.get('alt') for img in images_with_alt],
        'contact_links': [link.get_text() for link in contact_links],
        'navigation_links': [link.get('href') for link in navigation_links]
    }
```


## Basics & Review

`#relevant_id` selects an element by its id - id attributes are normally unique on a webpage

`.relevant_class` selects elements by class - class attributes are not unique on a webpage

### What BeautifulSoup does:

```
from bs4 import BeautifulSoup

# This can parse the HTML: pass the markup string first, then the parser name.
# Instantiating a new object:
outcome_of_parsing = BeautifulSoup(out_html, 'html.parser')


# with Tag
print(outcome_of_parsing.find('h1'))  # --> <h1>sssssssssssanythingsssssss.</h1>

# without Tag
print(outcome_of_parsing.find('h1').string)  # --> sssssssssssanythingsssssss.

def find_title():
  # .string yields only the text inside the first <h1>, without the tag
  print(outcome_of_parsing.find('h1').string)

def find_list_items():
  list_items = outcome_of_parsing.find_all('li')
  print(list_items) # It is a list of tags

# or even better: print only the text of each item

def find_list_items():
  list_items = outcome_of_parsing.find_all('li')
  list_content = [e.string for e in list_items]
  print(list_content)

# HTML being searched:
#   <p class="subtitle">ssssssssss.sssssssss</p>
def find_para():
  # attrs narrows the match to <p> tags whose class is "subtitle"
  para = outcome_of_parsing.find('p', attrs={'class': 'subtitle'})
  print(para.string)

# HTML being searched:
#   <p class="subtitle">ssssssssss.sssssssss</p>
#   <p>dddddddddddddddddddddd</p>
def find_paras():
  paras = outcome_of_parsing.find_all('p')
  # Keep paragraphs that do NOT have the "subtitle" class; a tag without a
  # class attribute falls back to an empty list.
  other_para = [par for par in paras if 'subtitle' not in par.attrs.get('class', [])]
  print(other_para[0].string)
```

#### Now we need to extract from tag attributes as well

```
<article class="product_pod">
....
<h3><a href='ddddd' title='vvvvvv'>....
...
</article>


def get_book_title():
  # CSS locator. The shorter 'article h3 a' would also work, but the two
  # cannot be combined with `or`: 'x' or 'y' simply evaluates to 'x'.
  locator = 'article.product_pod h3 a'
  ## Below we get the first matching element from the parsed HTML
  item_link = outcome_of_parsing.select_one(locator) # select_one takes a CSS selector (tag, class, id, etc.)
  ## item_link.attrs is a dictionary of all the attributes (class, id, ...) on the "a" tag
  return item_link.attrs['title']

def get_book_link():
  # CSS locator. The shorter 'article h3 a' would also work, but the two
  # cannot be combined with `or`: 'x' or 'y' simply evaluates to 'x'.
  locator = 'article.product_pod h3 a'
  ## Below we get the first matching element from the parsed HTML
  item_link = outcome_of_parsing.select_one(locator)
  ## item_link.attrs is a dictionary of all the attributes (class, id, ...) on the "a" tag
  return item_link.attrs['href']


def item_price():
  locator = 'article.product_pod p.price_color'
  the_price = outcome_of_parsing.select_one(locator)
  # .text is a property, not a method, so no brackets
  price_out = the_price.text.replace("$", "")
  # Return the number so the caller can use it
  return float(price_out)
```
#### Note: 

### ✅ tag.text

Returns all the text content inside the tag, including nested tags.

Always returns a string, even if the tag contains multiple children or complex HTML.

### ⚠️ tag.string

Returns the string only if the tag contains a single NavigableString (i.e., just plain text).

If the tag contains nested tags or multiple text nodes, it returns None.

```
# Given: <div class="note">Hello</div>
tag = soup.select_one(".note")
print(tag.string)  # Output: Hello
```

```
<div class="note">Hello <b>World</b></div>
```
```
tag = soup.select_one(".note")
print(tag.string)  # Output: None
print(tag.text)    # Output: Hello World
```

def book_rate():
  locator = 'article.product_pod p.star-rating'
  item_rate = outcome_of_parsing.select_one(locator)
  # attrs['class'] is a list of every class on the tag
  classes_outcome = item_rate.attrs['class']
  # Drop the generic "star-rating" class; what is left is presumably the rating itself
  list_of_rate = [rt for rt in classes_outcome if rt != 'star-rating']
  # Equivalent with filter (wrap it in list(): a filter object cannot be indexed):
  #   list_of_rate = list(filter(lambda x: x != 'star-rating', classes_outcome))
  print(list_of_rate[0])



## Structure your files 

1. Once you have the page, keep the logic that extracts specific items from it in its own file.
   Create a `ParseItem` class there and change each `print` into a `return`.
   Note that the constructor receives the page (or a portion of a page):

   ```
   def __init__(self, page):  # page: the whole page or a portion of it
       self.soup = BeautifulSoup(page, 'html.parser')
   ```

   Remember to add `@property` to the methods you create in this file, so they can be accessed without brackets.


2. Separate locators from the class item (and in a different file)


class ParsedItemLocator:
  """CSS selectors for the fields read from one product article."""

  PRICE_LOCATOR = "article.product_pod p.price_color"
  RATING_LOCATOR = "article.product_pod p.star-rating"
  NAME_LOCATOR = "article.product_pod h3 a"
  # The name and the link are read from the same anchor element
  LINK_LOCATOR = NAME_LOCATOR



After importing the locators, the `locator` assignment in each of the previous functions becomes `locator = ParsedItemLocator.NAME_LOCATOR`, and so on for the other fields.








## Dynamic Content with Selenium

1. Basic Selenium Setup

```
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def setup_driver(headless=True):
    """Create a Chrome WebDriver configured for scraping.

    Pass ``headless=False`` to leave out the ``--headless`` flag.
    """
    chrome_options = Options()

    # Command-line switches, applied in order
    switches = ["--headless"] if headless else []
    switches += [
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-gpu",
        "--window-size=1920,1080",
    ]
    for switch in switches:
        chrome_options.add_argument(switch)

    # Prevent detection
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option('useAutomationExtension', False)

    driver = webdriver.Chrome(options=chrome_options)
    # Make navigator.webdriver report undefined on the current page
    hide_webdriver_js = "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"
    driver.execute_script(hide_webdriver_js)

    return driver
```

```
def smart_wait(driver, selector, by=By.CSS_SELECTOR, timeout=10):
    """Wait until an element matching ``selector`` is present in the DOM.

    Only presence is checked; the element is not guaranteed to be visible.

    Args:
        driver: WebDriver instance to poll.
        selector: Locator string, interpreted according to ``by``.
        by: Locator strategy; CSS selector by default.
        timeout: Maximum number of seconds to wait.

    Returns:
        The located element, or ``None`` if it did not appear in time or
        the driver reported an error.
    """
    # Local import keeps this snippet self-contained
    from selenium.common.exceptions import TimeoutException, WebDriverException

    try:
        return WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((by, selector))
        )
    except TimeoutException:
        print(f"Element not found: {selector}")
    except WebDriverException as e:
        # Any other driver failure: report the cause instead of hiding it
        print(f"Error while waiting for {selector}: {e}")
    return None
```

2. Dynamic Content Scraping


```
def scrape_dynamic_content(url, wait_for_selector=None, screenshot_path='page_screenshot.png'):
    """Scrape content from JavaScript-heavy websites.

    Args:
        url: Page to load in a headless browser.
        wait_for_selector: Optional CSS selector to wait for before reading
            the page.
        screenshot_path: Where to save a debugging screenshot. Pass ``None``
            to skip it, or a distinct path per call to avoid overwriting the
            previous screenshot.

    Returns:
        The dictionary produced by ``extract_data``, or ``None`` if anything
        failed while loading or parsing the page.
    """
    driver = setup_driver(headless=True)

    try:
        driver.get(url)

        # Wait for specific element if provided
        if wait_for_selector:
            smart_wait(driver, wait_for_selector)

        # Fixed extra pause to let remaining scripts finish
        time.sleep(2)

        # Parse the page source as it is after JavaScript execution
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        data = extract_data(soup)

        # Take screenshot for debugging
        if screenshot_path:
            driver.save_screenshot(screenshot_path)

        return data

    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return None
    finally:
        # Always release the browser, even on failure
        driver.quit()
```
#### Example: Scrolling and loading more content

```
def infinite_scroll_scrape(url, scroll_pause_time=2, max_scrolls=10):
    """Scroll a page to the bottom repeatedly, then extract its data.

    Scrolling stops when the page height stops growing or after
    ``max_scrolls`` scrolls that did increase it. ``scroll_pause_time`` is
    the pause in seconds after each scroll.
    """
    browser = setup_driver(headless=False)
    read_height = "return document.body.scrollHeight"

    try:
        browser.get(url)

        previous_height = browser.execute_script(read_height)
        completed = 0

        while completed < max_scrolls:
            # Jump to the bottom and give new content time to load
            browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(scroll_pause_time)

            current_height = browser.execute_script(read_height)
            if current_height == previous_height:
                # Nothing new was loaded
                break

            previous_height = current_height
            completed += 1

        # Now scrape the fully loaded content
        return extract_data(BeautifulSoup(browser.page_source, 'html.parser'))

    finally:
        browser.quit()
```


## API Scraping

1. REST API Scraping

```   
def scrape_api(endpoint, headers=None, params=None, method='GET', timeout=10):
    """Scrape data from REST APIs.

    Args:
        endpoint: URL of the API endpoint.
        headers: Extra headers merged over the defaults.
        params: Query parameters for GET, JSON body for POST.
        method: ``'GET'`` or ``'POST'`` (case-insensitive).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        Parsed JSON when the response declares ``application/json``,
        otherwise the response text; ``None`` when the request fails.

    Raises:
        ValueError: If ``method`` is neither GET nor POST.
    """
    # Reject unsupported methods up front; otherwise no request is made
    # and the code below would fail on an unbound ``response``.
    verb = method.upper()
    if verb not in ('GET', 'POST'):
        raise ValueError(f"Unsupported HTTP method: {method!r}")

    default_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Accept': 'application/json',
    }

    if headers:
        default_headers.update(headers)

    try:
        if verb == 'GET':
            response = requests.get(
                endpoint, headers=default_headers, params=params, timeout=timeout
            )
        else:
            response = requests.post(
                endpoint, headers=default_headers, json=params, timeout=timeout
            )
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"API Error: {e}")
        return None

    # Handle different response formats
    content_type = response.headers.get('content-type', '')
    if 'application/json' not in content_type:
        return response.text

    try:
        return response.json()
    except ValueError as e:
        # The header promised JSON but the body could not be decoded
        print(f"API Error: {e}")
        return None
```

#### Example: Paginated API

```
def scrape_paginated_api(base_url, page_param='page', max_pages=100, delay=1):
    """Scrape paginated API endpoints.

    Pages are requested from 1 upwards until a page comes back empty, a
    request fails, or ``max_pages`` pages have been fetched.

    Args:
        base_url: Endpoint to query.
        page_param: Name of the query parameter carrying the page number.
        max_pages: Upper bound on the number of pages requested.
        delay: Seconds to pause after each successful page.

    Returns:
        A list with the items of every page. A page that is a list
        contributes its elements; any other payload (for example a JSON
        object) is added as a single entry.
    """
    all_data = []

    for page in range(1, max_pages + 1):
        data = scrape_api(base_url, params={page_param: page})

        # None (failed request) and empty payloads both end the crawl
        if not data:
            break

        if isinstance(data, list):
            all_data.extend(data)
        else:
            # extend() on a dict or str would add its keys or characters
            all_data.append(data)

        # Be respectful
        time.sleep(delay)

    return all_data
```

## Handling Common Challenges

1. Rate Limiting and Delays

```
import random
import time
from functools import wraps

def respectful_scraping(delay_range=(1, 3)):
    """Build a decorator that sleeps a random time before each call.

    The pause is drawn uniformly between ``delay_range[0]`` and
    ``delay_range[1]`` seconds; the wrapped function's result is returned
    unchanged.
    """
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            shortest, longest = delay_range[0], delay_range[1]
            time.sleep(random.uniform(shortest, longest))
            return func(*args, **kwargs)
        return wrapper
    return decorator

class RateLimitedScraper:
    """Rate limiting class for multiple requests."""

    def __init__(self, requests_per_minute=60):
        """Set the request budget.

        Raises:
            ValueError: If ``requests_per_minute`` is not positive.
        """
        if requests_per_minute <= 0:
            raise ValueError("requests_per_minute must be positive")
        self.requests_per_minute = requests_per_minute
        # Minimum number of seconds between two requests
        self.delay = 60.0 / requests_per_minute
        # None until the first request has been made
        self.last_request_time = None

    def wait(self):
        """Wait if necessary to respect the rate limit."""
        if self.last_request_time is not None:
            # monotonic() is not affected by system clock adjustments
            elapsed = time.monotonic() - self.last_request_time
            if elapsed < self.delay:
                time.sleep(self.delay - elapsed)
        self.last_request_time = time.monotonic()

    # The decorator adds a random 1-2 second pause before every call,
    # on top of the rate limit enforced by wait().
    @respectful_scraping(delay_range=(1, 2))
    def scrape(self, url):
        """Fetch ``url`` with rate limiting; returns the page text or None."""
        self.wait()
        return simple_get_request(url)
```

2. Proxy Rotation

```
class ProxyManager:
    """Manage proxy rotation for scraping."""

    def __init__(self, proxy_list=None):
        """Store the proxies to rotate through (none by default)."""
        self.proxies = proxy_list or []
        self.current_proxy_index = 0

    def get_proxy(self):
        """Return the next proxy in rotation, or None when there are none."""
        if not self.proxies:
            return None

        proxy = self.proxies[self.current_proxy_index]
        # Advance and wrap around at the end of the list
        self.current_proxy_index = (self.current_proxy_index + 1) % len(self.proxies)
        return proxy

    def scrape_with_proxy(self, url):
        """Fetch ``url`` through the next proxy in rotation.

        Falls back to a direct request when no proxies are configured.

        Returns:
            The response text, or ``None`` if the request failed or the
            server answered with an error status.
        """
        proxy = self.get_proxy()
        if not proxy:
            return simple_get_request(url)

        try:
            response = requests.get(url, proxies={'http': proxy, 'https': proxy}, timeout=10)
            # Treat 4xx/5xx answers as failures, as simple_get_request does
            response.raise_for_status()
        except requests.exceptions.RequestException:
            # Only request errors; a bare except would also swallow Ctrl-C
            print(f"Proxy failed: {proxy}")
            return None
        return response.text
```
#### Example proxy list

```
proxies = [
    'http://proxy1:port',
    'http://proxy2:port', 
    'http://proxy3:port'
]

proxy_manager = ProxyManager(proxies)
```

3. Handling Sessions and Cookies


```
def session_based_scraping(login_url, credentials, target_url):
    """Log in and fetch a protected page within one session.

    Args:
        login_url: URL of the login form, also used as the POST target.
        credentials: Dict with ``'username'`` and ``'password'`` keys.
        target_url: Page to fetch once logged in.

    Returns:
        The text of the target page.

    Raises:
        requests.exceptions.HTTPError: If the login page, the login POST or
            the target page answers with an error status.
    """
    with requests.Session() as session:
        # Login
        login_data = {
            'username': credentials['username'],
            'password': credentials['password']
        }

        # Get login page first (for CSRF tokens, etc.)
        login_page = session.get(login_url)
        login_page.raise_for_status()
        soup = BeautifulSoup(login_page.text, 'html.parser')

        # Extract CSRF token if present; skip an input with no value
        csrf_input = soup.find('input', {'name': 'csrf_token'})
        if csrf_input and csrf_input.get('value'):
            login_data['csrf_token'] = csrf_input['value']

        # Perform login
        login_response = session.post(login_url, data=login_data)
        login_response.raise_for_status()

        # Now access protected content; an error page must not be
        # returned as if it were the content
        target_response = session.get(target_url)
        target_response.raise_for_status()
        return target_response.text
```




## Data Storage Functions

1. Save to CSV

```
def save_to_csv(data, filename):
    """Save scraped data to CSV.

    Args:
        data: A dict (one row) or a list of dicts (one row each).
        filename: Path of the CSV file to write; existing files are replaced.

    The header is the union of the keys of all rows, in first-seen order.
    Rows lacking a key get an empty cell for it. Nothing is written when
    ``data`` is empty.
    """
    if not data:
        print("No data to save")
        return

    rows = data if isinstance(data, list) else [data]

    # Taking the keys of the first row only would make DictWriter raise
    # ValueError on any later row that has an extra key.
    fieldnames = list(dict.fromkeys(key for row in rows for key in row))

    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)

    print(f"Data saved to {filename}")
```
2. Save to JSON

```
def save_to_json(data, filename):
    """Write ``data`` to ``filename`` as indented UTF-8 JSON.

    Non-ASCII characters are written as-is rather than escaped.
    """
    with open(filename, 'w', encoding='utf-8') as out:
        json.dump(data, out, indent=2, ensure_ascii=False)

    print(f"Data saved to {filename}")
```


3. Save to Database

```
import sqlite3
import pandas as pd

def save_to_sqlite(data, db_file, table_name):
    """Save data to SQLite database.

    Args:
        data: A dict (one row) or a list of dicts (one row each).
        db_file: Path of the SQLite database file.
        table_name: Table to append to; created if it does not exist.
    """
    rows = data if isinstance(data, list) else [data]
    df = pd.DataFrame(rows)

    conn = sqlite3.connect(db_file)
    try:
        df.to_sql(table_name, conn, if_exists='append', index=False)
    finally:
        # Close the connection even when the write fails
        conn.close()

    print(f"Data saved to {db_file}.{table_name}")
```

