In [6]:
import requests
from bs4 import BeautifulSoup
import json
from concurrent.futures import ThreadPoolExecutor, as_completed

# Listing endpoint for campaign documents. The two placeholders are filled,
# in order, with the page size and the page number.
base_url = (
    "https://www.presidency.ucsb.edu/documents/app-categories/"
    "elections-and-transitions/campaign-documents"
    "?items_per_page={}&page={}"
)

# How many listing entries to request per page
items_per_page = 1000

# Accumulates one record (dict) per scraped document
documents = list()

def fetch_document(doc_url, session, timeout=30):
    """Download one document page and extract its metadata and body text.

    Args:
        doc_url: URL of the document page to fetch.
        session: Object with a ``get(url, headers=..., timeout=...)`` method;
            the caller in this file passes a ``requests.Session``.
        timeout: Seconds to wait on the server before giving up. Without a
            timeout a single stalled connection would block its worker
            indefinitely.

    Returns:
        A dict with the keys ``speaker``, ``date``, ``title``, ``location``
        and ``text``. A field whose element is absent from the page is the
        empty string; ``text`` is the page's content paragraphs joined with
        newlines.

    Raises:
        Whatever ``response.raise_for_status()`` raises for a 4xx/5xx answer
        (``requests.HTTPError`` for a real ``requests`` response), plus any
        error raised by ``session.get`` itself.
    """
    headers = {"Accept-Encoding": "gzip, deflate"}
    doc_response = session.get(doc_url, headers=headers, timeout=timeout)
    # An error page would otherwise be parsed like a document and yield a
    # record made of empty strings; surface the failure to the caller instead.
    doc_response.raise_for_status()
    doc_soup = BeautifulSoup(doc_response.text, 'html.parser')

    def first_text(selector):
        """Return the stripped text of the first match, or "" if none."""
        node = doc_soup.select_one(selector)
        return node.text.strip() if node else ""

    content_paragraphs = doc_soup.select("div.field-docs-content p")

    # Return structured data
    return {
        "speaker": first_text("div.field-docs-person a"),
        "date": first_text("span.date-display-single"),
        "title": first_text("div.field-ds-doc-title h1"),
        "location": first_text("div.field-spot-state"),
        # join() over no paragraphs already gives "", no special case needed
        "text": "\n".join(p.text.strip() for p in content_paragraphs),
    }

def scrape_page(page, session, timeout=30):
    """Fetch one listing page and return the URLs of the documents it links.

    Args:
        page: Page number substituted into ``base_url`` together with
            ``items_per_page``.
        session: Object with a ``get(url, headers=..., timeout=...)`` method;
            the caller in this file passes a ``requests.Session``.
        timeout: Seconds to wait on the server before giving up.

    Returns:
        A list of absolute document URLs, one per ``div.field-title`` anchor
        that carries an ``href``. The list is empty when the page has no such
        anchors.

    Raises:
        Whatever ``response.raise_for_status()`` raises for a 4xx/5xx answer
        (``requests.HTTPError`` for a real ``requests`` response), plus any
        error raised by ``session.get`` itself.
    """
    from urllib.parse import urljoin

    headers = {"Accept-Encoding": "gzip, deflate"}
    print(f"Scraping page {page}...")
    url = base_url.format(items_per_page, page)
    response = session.get(url, headers=headers, timeout=timeout)
    # Without this check an error page has no matching links and would be
    # indistinguishable from "no more documents".
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Only anchors that actually have an href; link['href'] would raise
    # KeyError on a bare <a>.
    document_links = soup.select("div.field-title a[href]")
    # urljoin gives the same result as plain concatenation for root-relative
    # hrefs, and also handles absolute or non-root-relative ones correctly.
    return [
        urljoin("https://www.presidency.ucsb.edu", link['href'])
        for link in document_links
    ]

# Scraping pages to gather all document URLs
all_doc_urls = []
with requests.Session() as session:
    for page in range(0, 1):  # Increase the range if there are more pages
        doc_urls = scrape_page(page, session)
        if not doc_urls:
            print("No more documents found, stopping.")
            break
        all_doc_urls.extend(doc_urls)

# Using ThreadPoolExecutor to fetch document data concurrently
max_workers = 20
results_by_index = {}
# The session is the outer context so it is closed only after the executor
# has shut down and no worker can still be using it.
with requests.Session() as session:
    # The default adapter pools fewer connections than we have workers, so
    # size the pool to the worker count to let every thread reuse a
    # keep-alive connection instead of opening and discarding extras.
    adapter = requests.adapters.HTTPAdapter(
        pool_connections=max_workers, pool_maxsize=max_workers
    )
    session.mount("https://", adapter)
    session.mount("http://", adapter)

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_index = {
            executor.submit(fetch_document, url, session): index
            for index, url in enumerate(all_doc_urls)
        }
        for future in as_completed(future_to_index):
            index = future_to_index[future]
            try:
                results_by_index[index] = future.result()
            except Exception as e:
                # Best effort: report the failing URL and keep going.
                print(f"Error scraping document: {all_doc_urls[index]} - {e}")

# Store results in listing order rather than completion order so the output
# file is the same from run to run.
documents.extend(results_by_index[index] for index in sorted(results_by_index))

# Save to JSON file
with open("campaign_documents.json", "w", encoding="utf-8") as json_file:
    json.dump(documents, json_file, indent=4)

print(f"Scraped {len(documents)} documents.")
print("Scraping complete!")


Scraping page 0...
Scraped 1000 documents.
Scraping complete!
