In [3]:
import requests
from bs4 import BeautifulSoup
import json
from concurrent.futures import ThreadPoolExecutor, as_completed

# Base URL for campaign documents with pagination parameter
base_url = "https://www.presidency.ucsb.edu/documents/app-categories/elections-and-transitions/campaign-documents?items_per_page={}&page={}"

items_per_page = 1000

# List to store all documents
documents = []
collected_documents = 0
total_documents = 23621 # Total number of documents on the website

# Function to fetch and parse each document page
def fetch_document(doc_url, session):
    headers = {"Accept-Encoding": "gzip, deflate"}
    doc_response = session.get(doc_url, headers=headers)
    doc_soup = BeautifulSoup(doc_response.text, 'html.parser')

    # Extract data with error handling
    speaker = doc_soup.select_one("div.field-docs-person a")
    speaker = speaker.text.strip() if speaker else ""

    date = doc_soup.select_one("span.date-display-single")
    date = date.text.strip() if date else ""

    title = doc_soup.select_one("div.field-ds-doc-title h1")
    title = title.text.strip() if title else ""

    location = doc_soup.select_one("div.field-spot-state")
    location = location.text.strip() if location else ""

    content_paragraphs = doc_soup.select("div.field-docs-content p")
    text = "\n".join(paragraph.text.strip() for paragraph in content_paragraphs) if content_paragraphs else ""

    # Return structured data
    return {
        "speaker": speaker,
        "date": date,
        "title": title,
        "location": location,
        "text": text
    }

# Function to scrape each main page to gather document links
def scrape_page(page, session):
    headers = {"Accept-Encoding": "gzip, deflate"}
    print(f"Scraping page {page}...")
    url = base_url.format(items_per_page, page)
    response = session.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all document links on the current page
    document_links = soup.select("div.field-title a")
    doc_urls = ["https://www.presidency.ucsb.edu" + link['href'] for link in document_links]
    return doc_urls

# Scraping pages to gather all document URLs
all_doc_urls = []
with requests.Session() as session:
    for page in range(21, 24):  # Increase the range if there are more pages
        doc_urls = scrape_page(page, session)
        if not doc_urls:
            print("No more documents found, stopping.")
            break
        all_doc_urls.extend(doc_urls)

with open("campaign_documents.json", "a") as json_file:
    #json_file.write("[")  # Start the JSON array
    
    with ThreadPoolExecutor(max_workers=20) as executor:
        with requests.Session() as session:
            future_to_url = {executor.submit(fetch_document, url, session): url for url in all_doc_urls}
            print(f"Loop started with {len(future_to_url)} documents.")
            for i, future in enumerate(as_completed(future_to_url)):
                try:
                    document_data = future.result()
                    
                    # Write the document data to the file with a comma if it's not the first document
                    if collected_documents > 0:
                        json_file.write(",\n")
                    
                    json.dump(document_data, json_file, indent=4)
                    
                    # Update the document counter
                    collected_documents += 1

                    # Print progress every 1000 documents
                    if collected_documents % 1000 == 0:
                        print(f"Scraped {collected_documents} documents.")
                
                except Exception as e:
                    print(f"Error scraping document: {future_to_url[future]} - {e}")
    
    json_file.write("]")  # End the JSON array

print(f"Scraped {len(documents)} documents.")
print("Scraping complete!")


Scraping page 21...
Scraping page 22...
Scraping page 23...
Loop started with 2621 documents.
Scraped 1000 documents.
Scraped 2000 documents.
Scraped 0 documents.
Scraping complete!
