In [1]:
!pip install requests beautifulsoup4



In [6]:
import requests
from bs4 import BeautifulSoup
import urllib.parse
import time
from collections import deque
import os
import re
import unicodedata


def extract_text_content(url):
    """
    Extract all text content from a web page using Beautiful Soup

    Args:
        url (str): The URL of the page to scrape

    Returns:
        tuple: (text_content, page_title, links)

    Note:
        Normalizes all line endings to Unix-style '\n' and removes unusual line terminators
    """
    try:
        # Send request to the URL
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # Raise exception for bad status codes

        # Parse HTML with Beautiful Soup
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract page title
        title = soup.title.text.strip() if soup.title else "No title found"

        # Extract all text
        # Remove script and style elements that might contain text not meant for display
        for script_or_style in soup(['script', 'style', 'meta', 'noscript']):
            script_or_style.decompose()

        # Get all text
        text_content = []

        # Add the title first - ensure no unusual characters in title
        clean_title = unicodedata.normalize('NFKD', title)
        clean_title = re.sub(r'[\r\n\u2028\u2029]', ' ', clean_title)  # Replace line breaks with spaces

        text_content.append(f"TITLE: {clean_title}\n")
        text_content.append(f"URL: {url}\n")
        text_content.append("-" * 50 + "\n\n")

        # Extract headings with hierarchy
        for tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
            for heading in soup.find_all(tag):
                if heading.text.strip():
                    # Add heading with appropriate level indicator
                    level_indicator = "#" * int(tag[1])
                    # Clean heading text of unusual line terminators
                    clean_heading = unicodedata.normalize('NFKD', heading.text.strip())
                    clean_heading = re.sub(r'[\r\n\u2028\u2029]', ' ', clean_heading)
                    text_content.append(f"{level_indicator} {clean_heading}\n\n")

        # Extract paragraphs
        for p in soup.find_all('p'):
            if p.text.strip():
                # Clean paragraph text of unusual line terminators
                clean_para = unicodedata.normalize('NFKD', p.text.strip())
                clean_para = re.sub(r'[\r\n\u2028\u2029]', ' ', clean_para)
                text_content.append(f"{clean_para}\n\n")

        # Get content from div elements that might contain text
        for div in soup.find_all('div'):
            # Only include direct text in divs (not text from child elements)
            direct_text = div.find(string=True, recursive=False)
            if direct_text and direct_text.strip():
                # Clean direct text of unusual line terminators
                clean_text = unicodedata.normalize('NFKD', direct_text.strip())
                clean_text = re.sub(r'[\r\n\u2028\u2029]', ' ', clean_text)
                text_content.append(f"{clean_text}\n\n")

        # Extract all links
        links = []
        for link in soup.find_all('a'):
            href = link.get('href')
            if href:
                links.append({
                    'text': link.text.strip(),
                    'url': href
                })

        # Join all text content
        full_text = "".join(text_content)

        # Normalize line endings and remove unusual line terminators
        # Replace Windows line endings (\r\n) with Unix line endings (\n)
        full_text = full_text.replace('\r\n', '\n')
        # Replace old Mac line endings (\r) with Unix line endings (\n)
        full_text = full_text.replace('\r', '\n')
        # Remove any other unusual line terminators like Line Separator (LS) or Paragraph Separator (PS)
        full_text = full_text.replace('\u2028', '\n')  # Line Separator (LS)
        full_text = full_text.replace('\u2029', '\n')  # Paragraph Separator (PS)
        # Remove any other control characters except normal whitespace
        full_text = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]', '', full_text)

        return full_text, title, links
    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return f"Error scraping {url}: {e}", "Error", []


def normalize_url(base_url, href):
    """
    Normalize a URL by resolving relative paths and ensuring it's from the same domain

    Args:
        base_url (str): The base URL for resolving relative paths
        href (str): The URL or path to normalize

    Returns:
        str or None: The normalized URL if valid, None otherwise
    """
    try:
        # Parse the base URL to get the domain
        parsed_base = urllib.parse.urlparse(base_url)
        base_domain = parsed_base.netloc

        # Skip if it's a fragment, javascript, mailto, or other non-HTTP scheme
        if not href or href.startswith('#') or href.startswith('javascript:') or href.startswith('mailto:'):
            return None

        # Resolve the URL (handles relative paths)
        full_url = urllib.parse.urljoin(base_url, href)
        parsed_url = urllib.parse.urlparse(full_url)

        # Only return URLs from the same domain and with HTTP/HTTPS scheme
        if parsed_url.netloc == base_domain and parsed_url.scheme in ['http', 'https']:
            # Remove fragments
            return full_url.split('#')[0]
        return None
    except Exception:
        return None


def create_filename_from_url(url, title):
    """
    Create a safe filename from a URL and title

    Args:
        url (str): The URL
        title (str): The page title

    Returns:
        str: A safe filename
    """
    # Extract domain name
    parsed_url = urllib.parse.urlparse(url)
    domain = parsed_url.netloc

    # Extract path and remove trailing slash if present
    path = parsed_url.path.rstrip('/')

    # Replace special characters in the path with underscores
    path = re.sub(r'[^a-zA-Z0-9]', '_', path)

    # Clean the title
    clean_title = re.sub(r'[^a-zA-Z0-9]', '_', title)
    clean_title = re.sub(r'_+', '_', clean_title)  # Replace multiple underscores with a single one
    clean_title = clean_title[:50]  # Limit title length

    # Build the filename
    if path:
        filename = f"{domain}{path}_{clean_title}.txt"
    else:
        filename = f"{domain}_{clean_title}.txt"

    # Ensure filename doesn't have double underscores
    filename = re.sub(r'_+', '_', filename)

    # Ensure filename isn't excessively long
    if len(filename) > 100:
        filename = filename[:100] + ".txt"

    return filename


def crawl_website(start_url, max_pages=100):
    """
    Crawl a website starting from a given URL, up to a maximum number of pages

    Args:
        start_url (str): The URL to start crawling from
        max_pages (int): Maximum number of pages to crawl

    Returns:
        list: List of dictionaries containing data from all crawled pages
    """
    # Parse the start URL to get the domain
    parsed_url = urllib.parse.urlparse(start_url)
    domain = parsed_url.netloc

    # Create a directory for the results if it doesn't exist
    domain_dir = domain.replace(':', '_').replace('/', '_')
    os.makedirs(f'text_results/{domain_dir}', exist_ok=True)

    # Initialize the queue with the start URL
    queue = deque([start_url])

    # Set to keep track of visited URLs
    visited = set([start_url])

    # Counter for the number of pages crawled
    pages_crawled = 0

    print(f"Starting crawl of {domain} from {start_url}")

    while queue and pages_crawled < max_pages:
        # Get the next URL from the queue
        current_url = queue.popleft()

        print(f"Crawling {current_url} ({pages_crawled + 1}/{max_pages})")

        # Extract content from the page
        text_content, title, links = extract_text_content(current_url)

        if text_content:
            # Create a filename for this page
            filename = create_filename_from_url(current_url, title)

            # Save the text content to a file with standardized line endings
            with open(f'text_results/{domain_dir}/{filename}', 'w', encoding='utf-8', newline='\n') as f:
                f.write(text_content)

            print(f"Saved text to text_results/{domain_dir}/{filename}")

            # Increment the counter
            pages_crawled += 1

            # Add new links to the queue
            for link in links:
                href = link['url']
                normalized_url = normalize_url(current_url, href)

                if normalized_url and normalized_url not in visited:
                    queue.append(normalized_url)
                    visited.add(normalized_url)

        # Be nice to the server
        time.sleep(1)

    print(f"Finished crawling {domain}, visited {pages_crawled} pages")
    return pages_crawled


def crawl_multiple_sites(urls, max_pages_per_site=50):
    """
    Crawl multiple websites from a list of starter URLs

    Args:
        urls (list): List of URLs to start crawling from
        max_pages_per_site (int): Maximum number of pages to crawl per site

    Returns:
        dict: Dictionary mapping domains to their crawled content
    """
    # Create main directory for results
    os.makedirs('text_results', exist_ok=True)

    results = {}

    for url in urls:
        try:
            # Parse the URL to get the domain
            parsed_url = urllib.parse.urlparse(url)
            domain = parsed_url.netloc

            print(f"\n{'='*40}\nStarting crawl of {domain}\n{'='*40}")

            # Crawl the website
            pages_crawled = crawl_website(url, max_pages=max_pages_per_site)

            # Store the results
            results[domain] = pages_crawled

            print(f"Crawled {pages_crawled} pages from {domain}")
        except Exception as e:
            print(f"Error crawling {url}: {e}")

    return results


# Define your list of URLs to crawl here
urls_to_crawl = [
    "https://www.pittsburghsymphony.org/",
    "https://pittsburghopera.org/",
    "https://trustarts.org/",
    "https://carnegiemuseums.org/",
    "https://www.heinzhistorycenter.org/",
    "https://www.thefrickpittsburgh.org/",
    "https://www.visitpittsburgh.com/events-festivals/food-festivals/",
    "https://www.picklesburgh.com/",
    "https://www.pghtacofest.com/",
    "https://pittsburghrestaurantweek.com/",
    "https://littleitalydays.com/",
    "https://bananasplitfest.com/"
]

# Set the maximum number of pages to crawl per site
max_pages_per_site = 50

# Call the function with your list of URLs
results = crawl_multiple_sites(urls_to_crawl, max_pages_per_site=max_pages_per_site)

print("\nCrawl complete!")
print("Summary of pages crawled:")
for domain, count in results.items():
    print(f"  - {domain}: {count} pages")
print(f"Results saved to the 'text_results' directory")


Starting crawl of www.pittsburghsymphony.org
Starting crawl of www.pittsburghsymphony.org from https://www.pittsburghsymphony.org/
Crawling https://www.pittsburghsymphony.org/ (1/50)
Error scraping https://www.pittsburghsymphony.org/: 403 Client Error: HTTP Forbidden for url: https://www.pittsburghsymphony.org/?queueittoken=e_trustsitewide~ts_1741729657~ce_true~rt_safetynet~h_c8a0f2f8daff8d0b68a7e4377ea5622347f5135beb8621f7cae9427bbfa42a31
Saved text to text_results/www.pittsburghsymphony.org/www.pittsburghsymphony.org_Error.txt
Finished crawling www.pittsburghsymphony.org, visited 1 pages
Crawled 1 pages from www.pittsburghsymphony.org

Starting crawl of pittsburghopera.org
Starting crawl of pittsburghopera.org from https://pittsburghopera.org/
Crawling https://pittsburghopera.org/ (1/50)
Saved text to text_results/pittsburghopera.org/pittsburghopera.org_Home_Pittsburgh_Opera.txt
Crawling https://pittsburghopera.org?hsLang=en (2/50)
Saved text to text_results/pittsburghopera.org/pitt

In [7]:
!zip -r music-and-culture-text-data.zip /content/text_results

  adding: content/text_results/ (stored 0%)
  adding: content/text_results/trustarts.org/ (stored 0%)
  adding: content/text_results/trustarts.org/trustarts.org_Error.txt (deflated 25%)
  adding: content/text_results/www.pittsburghsymphony.org/ (stored 0%)
  adding: content/text_results/www.pittsburghsymphony.org/www.pittsburghsymphony.org_Error.txt (deflated 27%)
  adding: content/text_results/pittsburghrestaurantweek.com/ (stored 0%)
  adding: content/text_results/pittsburghrestaurantweek.com/pittsburghrestaurantweek.com_submit_my_menu_Step_2_Submit_Menu_Pittsburgh_Restaurant_Week.txt (deflated 54%)
  adding: content/text_results/pittsburghrestaurantweek.com/pittsburghrestaurantweek.com_pghrestaurantwk_local_food_bloggers_signup_Calling_All_Pittsburgh_Food_.txt (deflated 51%)
  adding: content/text_results/pittsburghrestaurantweek.com/pittsburghrestaurantweek.com_restaurants_summer_2022_restaurants_Summer_2022_Restaurants_Pittsburgh_.txt (deflated 47%)
  adding: content/text_results/

In [None]:
import json

In [None]:
def extract_content_with_headings(json_file_path, output_text_file):
    """
    Extract content with headings from a JSON file and save as a structured text corpus
    """
    # Load the JSON data
    with open(json_file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Extract paragraphs and headings
    paragraphs = data.get('paragraphs', [])
    headings = data.get('headings', [])

    # Sort headings and paragraphs in a combined list
    # This is a simplified approach - in reality you'd need to determine
    # the actual position of each element in the original HTML

    # Create the corpus text
    corpus = ""

    # Add title as first line
    if data.get('title'):
        corpus += data['title'] + "\n\n"

    # Add each paragraph, with headings inserted where appropriate
    # This is a simplified version that just intersperses headings
    heading_index = 0
    for i, paragraph in enumerate(paragraphs):
        # Insert a heading every few paragraphs if available
        if heading_index < len(headings) and i % 3 == 0:
            heading = headings[heading_index]
            corpus += f"# {heading['text']}\n\n"
            heading_index += 1

        # Add the paragraph
        if paragraph and len(paragraph) > 15:
            corpus += paragraph + "\n\n"

    # Write to the output file
    with open(output_text_file, 'w', encoding='utf-8') as f:
        f.write(corpus)

    print(f"Extracted content with {len(headings)} headings and {len(paragraphs)} paragraphs")
    print(f"Saved to {output_text_file}")

In [None]:
# Example usage
if __name__ == "__main__":
    json_file = "page_content.json"  # Path to your JSON file
    output_file = "corpus.txt"  # Path to save the text corpus
    extract_content_with_headings(json_file, output_file)

Extracted content with 4 headings and 12 paragraphs
Saved to corpus.txt


In [None]:
import requests
from bs4 import BeautifulSoup
import json

def extract_and_append_to_corpus(url, corpus_file_path):
    """
    Extract content from a URL and append it to an existing corpus file

    Args:
        url (str): The URL of the page to scrape
        corpus_file_path (str): Path to the existing corpus file
    """
    # Send request to the URL
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()  # Raise exception for bad status codes
    except Exception as e:
        print(f"Error fetching the URL: {e}")
        return

    # Parse HTML with Beautiful Soup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract page title
    title = soup.title.text.strip() if soup.title else "No title found"

    # Extract all paragraphs
    paragraphs = [p.text.strip() for p in soup.find_all('p') if p.text.strip() and len(p.text.strip()) > 15]

    # Prepare content to append
    new_content = f"\n\n=== {title} ===\n\n"
    for paragraph in paragraphs:
        new_content += paragraph + "\n\n"

    # Append to existing corpus file
    try:
        with open(corpus_file_path, 'a', encoding='utf-8') as f:
            f.write(new_content)

        print(f"Added {len(paragraphs)} paragraphs from '{title}'")
        print(f"Added {len(new_content)} characters to {corpus_file_path}")
    except Exception as e:
        print(f"Error appending to corpus file: {e}")

# Example usage
if __name__ == "__main__":
    url = input("Enter a URL to extract content from: ")
    corpus_file = "corpus.txt"  # Path to your existing corpus file
    extract_and_append_to_corpus(url, corpus_file)

Enter a URL to extract content from: https://en.wikipedia.org/wiki/History_of_Pittsburgh
Added 88 paragraphs from 'History of Pittsburgh - Wikipedia'
Added 50444 characters to corpus.txt


In [None]:
import requests
from bs4 import BeautifulSoup
import json

def extract_page_content(url):
    """
    Extract all text content from a web page using Beautiful Soup

    Args:
        url (str): The URL of the page to scrape

    Returns:
        dict: Dictionary containing the page title, paragraphs, and headings
    """
    # Send request to the URL
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    response = requests.get(url, headers=headers)
    response.raise_for_status()  # Raise exception for bad status codes

    # Parse HTML with Beautiful Soup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract page title
    title = soup.title.text.strip() if soup.title else "No title found"

    # Extract all paragraphs
    paragraphs = [p.text.strip() for p in soup.find_all('p') if p.text.strip()]

    # Extract all headings
    headings = []
    for tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
        for heading in soup.find_all(tag):
            if heading.text.strip():
                headings.append({
                    'level': tag,
                    'text': heading.text.strip()
                })

    # Extract all links
    links = []
    for link in soup.find_all('a'):
        href = link.get('href')
        text = link.text.strip()
        if href and text:
            links.append({
                'text': text,
                'url': href
            })

    # Compile the results
    result = {
        'url': url,
        'title': title,
        'headings': headings,
        'paragraphs': paragraphs,
        'links': links
    }

    return result

# Example usage
if __name__ == "__main__":
    url = input("Enter a URL to scrape: ")
    content = extract_page_content(url)

    # Print summary
    print(f"\nTitle: {content['title']}")
    print(f"Found {len(content['paragraphs'])} paragraphs")
    print(f"Found {len(content['headings'])} headings")
    print(f"Found {len(content['links'])} links")

    # Save the data to a JSON file
    with open('page_content.json', 'w', encoding='utf-8') as f:
        json.dump(content, f, indent=4, ensure_ascii=False)

    print("\nData saved to page_content.json")

Enter a URL to scrape: https://www.britannica.com/place/Pittsburgh

Title: Pittsburgh | Location, History, Teams, Attractions, & Facts | Britannica
Found 10 paragraphs
Found 4 headings
Found 168 links

Data saved to page_content.json


In [None]:
import requests
from bs4 import BeautifulSoup
import json

def extract_page_content(url):
    """
    Extract all text content from a web page using Beautiful Soup

    Args:
        url (str): The URL of the page to scrape

    Returns:
        dict: Dictionary containing the page title, paragraphs, and headings
    """
    # Send request to the URL
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    response = requests.get(url, headers=headers)
    response.raise_for_status()  # Raise exception for bad status codes

    # Parse HTML with Beautiful Soup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract page title
    title = soup.title.text.strip() if soup.title else "No title found"

    # Extract all paragraphs
    paragraphs = [p.text.strip() for p in soup.find_all('p') if p.text.strip()]

    # Extract all headings
    headings = []
    for tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
        for heading in soup.find_all(tag):
            if heading.text.strip():
                headings.append({
                    'level': tag,
                    'text': heading.text.strip()
                })

    # Extract all links
    links = []
    for link in soup.find_all('a'):
        href = link.get('href')
        text = link.text.strip()
        if href and text:
            links.append({
                'text': text,
                'url': href
            })

    # Compile the results
    result = {
        'url': url,
        'title': title,
        'headings': headings,
        'paragraphs': paragraphs,
        'links': links
    }

    return result

def preview_corpus_content(content, max_paragraphs=5):
    """
    Generate a preview of the content that would be added to the corpus

    Args:
        content (dict): The extracted content dictionary
        max_paragraphs (int): Maximum number of paragraphs to show in preview

    Returns:
        str: Formatted preview text
    """
    preview = f"\n=== {content['title']} ===\n\n"

    # Add first few paragraphs
    paragraphs_to_show = min(max_paragraphs, len(content['paragraphs']))
    for i in range(paragraphs_to_show):
        preview += content['paragraphs'][i] + "\n\n"

    # Add indication if there are more paragraphs
    if len(content['paragraphs']) > max_paragraphs:
        preview += f"[...and {len(content['paragraphs']) - max_paragraphs} more paragraphs...]\n"

    return preview

def add_to_corpus(content, corpus_file, append=True):
    """
    Add content to corpus file

    Args:
        content (dict): The extracted content dictionary
        corpus_file (str): Path to corpus file
        append (bool): Whether to append or overwrite
    """
    mode = 'a' if append else 'w'

    corpus_text = f"\n\n=== {content['title']} ===\n\n"
    for paragraph in content['paragraphs']:
        if paragraph and len(paragraph) > 15:  # Skip very short paragraphs
            corpus_text += paragraph + "\n\n"

    with open(corpus_file, mode, encoding='utf-8') as f:
        f.write(corpus_text)

    print(f"{'Added to' if append else 'Created'} corpus file: {corpus_file}")
    print(f"Added {len(content['paragraphs'])} paragraphs, {len(corpus_text)} characters")

# Example usage
if __name__ == "__main__":
    url = input("Enter a URL to scrape: ")
    content = extract_page_content(url)

    # Print summary
    print(f"\nTitle: {content['title']}")
    print(f"Found {len(content['paragraphs'])} paragraphs")
    print(f"Found {len(content['headings'])} headings")
    print(f"Found {len(content['links'])} links")

    # Preview the content
    preview = preview_corpus_content(content)
    print("\nPREVIEW OF CONTENT TO ADD:")
    print("-" * 50)
    print(preview)
    print("-" * 50)

    # Save the data to a JSON file
    with open('page_content.json', 'w', encoding='utf-8') as f:
        json.dump(content, f, indent=4, ensure_ascii=False)
    print("\nData saved to page_content.json")

    # Ask user if they want to add to corpus
    corpus_file = "corpus.txt"
    add_choice = input(f"\nDo you want to add this content to {corpus_file}? (y/n): ").lower()

    if add_choice == 'y':
        # Check if corpus file exists
        import os
        file_exists = os.path.isfile(corpus_file)

        if file_exists:
            append_choice = input(f"{corpus_file} already exists. Append to it? (y/n): ").lower()
            append = append_choice == 'y'
        else:
            append = False

        add_to_corpus(content, corpus_file, append)

KeyboardInterrupt: Interrupted by user

In [None]:
import requests
from bs4 import BeautifulSoup
import json
import os
import PyPDF2
import io  # For handling in-memory binary data

def extract_page_content(url):
    """Extract all text content from a web page using Beautiful Soup"""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    response = requests.get(url, headers=headers)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    title = soup.title.text.strip() if soup.title else "No title found"
    paragraphs = [p.text.strip() for p in soup.find_all('p') if p.text.strip()]

    # Extract all headings
    headings = []
    for tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
        for heading in soup.find_all(tag):
            if heading.text.strip():
                headings.append({
                    'level': tag,
                    'text': heading.text.strip()
                })

    # Extract all links
    links = []
    for link in soup.find_all('a'):
        href = link.get('href')
        text = link.text.strip()
        if href and text:
            links.append({
                'text': text,
                'url': href
            })

    result = {
        'source_type': 'web',
        'url': url,
        'title': title,
        'headings': headings,
        'paragraphs': paragraphs,
        'links': links
    }

    return result

def extract_pdf_content(pdf_path_or_url):
    """Extract text content from a PDF file using PyPDF2"""
    is_url = pdf_path_or_url.startswith(('http://', 'https://'))

    try:
        if is_url:
            print(f"Downloading PDF from URL: {pdf_path_or_url}")
            response = requests.get(pdf_path_or_url)
            response.raise_for_status()  # Ensure we got a successful response

            # Get the filename from the URL
            title = pdf_path_or_url.split('/')[-1]

            # Create an in-memory binary stream
            pdf_file = io.BytesIO(response.content)

        else:  # Local file
            if not os.path.exists(pdf_path_or_url):
                print(f"Error: PDF file not found at {pdf_path_or_url}")
                return None

            title = os.path.basename(pdf_path_or_url)
            pdf_file = open(pdf_path_or_url, 'rb')

        # Process the PDF
        reader = PyPDF2.PdfReader(pdf_file)
        print(f"PDF has {len(reader.pages)} pages")

        paragraphs = []
        for page_num in range(len(reader.pages)):
            text = reader.pages[page_num].extract_text()
            if text:
                # Split text into paragraphs based on newlines
                page_paragraphs = [p.strip() for p in text.split('\n') if p.strip()]
                paragraphs.extend(page_paragraphs)

        # Close the file if it's a local file
        if not is_url:
            pdf_file.close()

        print(f"Extracted {len(paragraphs)} text segments from PDF")

        # Clean up paragraphs - remove very short ones, merge related segments
        cleaned_paragraphs = []
        current_paragraph = ""

        for p in paragraphs:
            # Skip likely header/footer patterns, page numbers, etc.
            if len(p) < 20 or p.isdigit():
                continue

            # If it ends with a sentence-ending punctuation, it's a complete thought
            if p.endswith('.') or p.endswith('?') or p.endswith('!'):
                if current_paragraph:
                    current_paragraph += " " + p
                    cleaned_paragraphs.append(current_paragraph)
                    current_paragraph = ""
                else:
                    cleaned_paragraphs.append(p)
            else:
                # It's a partial paragraph or segment, accumulate it
                if current_paragraph:
                    current_paragraph += " " + p
                else:
                    current_paragraph = p

        # Don't forget any remaining text
        if current_paragraph:
            cleaned_paragraphs.append(current_paragraph)

    except Exception as e:
        print(f"Error extracting PDF content: {e}")
        return None

    result = {
        'source_type': 'pdf',
        'file_path': pdf_path_or_url,
        'title': title,
        'paragraphs': cleaned_paragraphs
    }

    return result

def preview_content(content, max_paragraphs=5):
    """Generate a preview of the content that would be added to the corpus"""
    if content['source_type'] == 'web':
        preview = f"\n=== {content['title']} (Web) ===\n\n"
    else:  # PDF
        preview = f"\n=== {content['title']} (PDF) ===\n\n"

    # Add first few paragraphs
    paragraphs_to_show = min(max_paragraphs, len(content['paragraphs']))
    for i in range(paragraphs_to_show):
        preview += content['paragraphs'][i] + "\n\n"

    # Add indication if there are more paragraphs
    if len(content['paragraphs']) > max_paragraphs:
        preview += f"[...and {len(content['paragraphs']) - max_paragraphs} more paragraphs...]\n"

    return preview

def add_to_corpus(content, corpus_file, append=True):
    """Add content to corpus file"""
    mode = 'a' if append else 'w'

    if content['source_type'] == 'web':
        corpus_text = f"\n\n=== {content['title']} (Web: {content['url']}) ===\n\n"
    else:  # PDF
        corpus_text = f"\n\n=== {content['title']} (PDF) ===\n\n"

    for paragraph in content['paragraphs']:
        if paragraph and len(paragraph) > 20:  # Skip very short paragraphs
            corpus_text += paragraph + "\n\n"

    with open(corpus_file, mode, encoding='utf-8') as f:
        f.write(corpus_text)

    print(f"{'Added to' if append else 'Created'} corpus file: {corpus_file}")
    print(f"Added {len(content['paragraphs'])} paragraphs, {len(corpus_text)} characters")

# Example usage
if __name__ == "__main__":
    corpus_file = "corpus.txt"

    # Ask user for source type
    source_type = input("Extract from [1] Website or [2] PDF? (Enter 1 or 2): ").strip()

    if source_type == '1':
        # Extract from website
        url = input("Enter a URL to scrape: ")
        content = extract_page_content(url)
    elif source_type == '2':
        # Extract from PDF
        pdf_path_or_url = input("Enter the path or URL to your PDF file: ")
        content = extract_pdf_content(pdf_path_or_url)
        if not content:
            print("Failed to extract content from PDF. Exiting.")
            exit()
    else:
        print("Invalid option. Exiting.")
        exit()

    # Print summary
    print(f"\nSource: {content['title']}")
    print(f"Found {len(content['paragraphs'])} paragraphs")

    # Preview the content
    preview = preview_content(content)
    print("\nPREVIEW OF CONTENT TO ADD:")
    print("-" * 50)
    print(preview)
    print("-" * 50)

    # Ask user if they want to add to corpus
    add_choice = input(f"\nDo you want to add this content to {corpus_file}? (y/n): ").lower()

    if add_choice == 'y':
        # Check if corpus file exists
        file_exists = os.path.isfile(corpus_file)

        if file_exists:
            append_choice = input(f"{corpus_file} already exists. Append to it? (y/n): ").lower()
            append = append_choice == 'y'
        else:
            append = False

        add_to_corpus(content, corpus_file, append)

    # Save the raw extraction data to a JSON file
    if content['source_type'] == 'web':
        output_file = 'page_content.json'
    else:
        output_file = 'pdf_content.json'

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(content, f, indent=4, ensure_ascii=False)
    print(f"\nRaw extraction data saved to {output_file}")

Extract from [1] Website or [2] PDF? (Enter 1 or 2): 2
Enter the path or URL to your PDF file: https://apps.pittsburghpa.gov/redtail/images/23255_2024_Operating_Budget.pdf
Downloading PDF from URL: https://apps.pittsburghpa.gov/redtail/images/23255_2024_Operating_Budget.pdf
Error extracting PDF content: HTTPSConnectionPool(host='apps.pittsburghpa.gov', port=443): Max retries exceeded with url: /redtail/images/23255_2024_Operating_Budget.pdf (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1006)')))
Failed to extract content from PDF. Exiting.


TypeError: 'NoneType' object is not subscriptable

In [None]:
import requests
from bs4 import BeautifulSoup
import json
import os
import PyPDF2
import io  # For handling in-memory binary data

def extract_page_content(url, verify_ssl=True):
    """Extract all text content from a web page using Beautiful Soup"""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    try:
        response = requests.get(url, headers=headers, verify=verify_ssl)
        response.raise_for_status()
    except requests.exceptions.SSLError as e:
        if verify_ssl:
            print("SSL verification failed. Trying again without verification...")
            return extract_page_content(url, verify_ssl=False)
        else:
            print(f"Error fetching URL: {e}")
            return None
    except Exception as e:
        print(f"Error fetching URL: {e}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    title = soup.title.text.strip() if soup.title else "No title found"
    paragraphs = [p.text.strip() for p in soup.find_all('p') if p.text.strip()]

    # Extract all headings
    headings = []
    for tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
        for heading in soup.find_all(tag):
            if heading.text.strip():
                headings.append({
                    'level': tag,
                    'text': heading.text.strip()
                })

    # Extract all links
    links = []
    for link in soup.find_all('a'):
        href = link.get('href')
        text = link.text.strip()
        if href and text:
            links.append({
                'text': text,
                'url': href
            })

    result = {
        'source_type': 'web',
        'url': url,
        'title': title,
        'headings': headings,
        'paragraphs': paragraphs,
        'links': links
    }

    return result

def extract_pdf_content(pdf_path_or_url, verify_ssl=True):
    """Extract text content from a PDF file using PyPDF2"""
    is_url = pdf_path_or_url.startswith(('http://', 'https://'))

    try:
        if is_url:
            print(f"Downloading PDF from URL: {pdf_path_or_url}")
            try:
                response = requests.get(pdf_path_or_url, verify=verify_ssl)
                response.raise_for_status()  # Ensure we got a successful response
            except requests.exceptions.SSLError as e:
                if verify_ssl:
                    print("SSL verification failed. Trying again without verification...")
                    return extract_pdf_content(pdf_path_or_url, verify_ssl=False)
                else:
                    raise e

            # Get the filename from the URL
            title = pdf_path_or_url.split('/')[-1]

            # Create an in-memory binary stream
            pdf_file = io.BytesIO(response.content)

        else:  # Local file
            if not os.path.exists(pdf_path_or_url):
                print(f"Error: PDF file not found at {pdf_path_or_url}")
                return None

            title = os.path.basename(pdf_path_or_url)
            pdf_file = open(pdf_path_or_url, 'rb')

        # Process the PDF
        reader = PyPDF2.PdfReader(pdf_file)
        print(f"PDF has {len(reader.pages)} pages")

        paragraphs = []
        for page_num in range(len(reader.pages)):
            text = reader.pages[page_num].extract_text()
            if text:
                # Split text into paragraphs based on newlines
                page_paragraphs = [p.strip() for p in text.split('\n') if p.strip()]
                paragraphs.extend(page_paragraphs)

        # Close the file if it's a local file
        if not is_url:
            pdf_file.close()

        print(f"Extracted {len(paragraphs)} text segments from PDF")

        # Clean up paragraphs - remove very short ones, merge related segments
        cleaned_paragraphs = []
        current_paragraph = ""

        for p in paragraphs:
            # Skip likely header/footer patterns, page numbers, etc.
            if len(p) < 20 or p.isdigit():
                continue

            # If it ends with a sentence-ending punctuation, it's a complete thought
            if p.endswith('.') or p.endswith('?') or p.endswith('!'):
                if current_paragraph:
                    current_paragraph += " " + p
                    cleaned_paragraphs.append(current_paragraph)
                    current_paragraph = ""
                else:
                    cleaned_paragraphs.append(p)
            else:
                # It's a partial paragraph or segment, accumulate it
                if current_paragraph:
                    current_paragraph += " " + p
                else:
                    current_paragraph = p

        # Don't forget any remaining text
        if current_paragraph:
            cleaned_paragraphs.append(current_paragraph)

    except Exception as e:
        print(f"Error extracting PDF content: {e}")
        return None

    result = {
        'source_type': 'pdf',
        'file_path': pdf_path_or_url,
        'title': title,
        'paragraphs': cleaned_paragraphs
    }

    return result

def preview_content(content, max_paragraphs=5):
    """Generate a preview of the content that would be added to the corpus"""
    if content['source_type'] == 'web':
        preview = f"\n=== {content['title']} (Web) ===\n\n"
    else:  # PDF
        preview = f"\n=== {content['title']} (PDF) ===\n\n"

    # Add first few paragraphs
    paragraphs_to_show = min(max_paragraphs, len(content['paragraphs']))
    for i in range(paragraphs_to_show):
        preview += content['paragraphs'][i] + "\n\n"

    # Add indication if there are more paragraphs
    if len(content['paragraphs']) > max_paragraphs:
        preview += f"[...and {len(content['paragraphs']) - max_paragraphs} more paragraphs...]\n"

    return preview

def add_to_corpus(content, corpus_file, append=True):
    """Add content to corpus file"""
    mode = 'a' if append else 'w'

    if content['source_type'] == 'web':
        corpus_text = f"\n\n=== {content['title']} (Web: {content['url']}) ===\n\n"
    else:  # PDF
        corpus_text = f"\n\n=== {content['title']} (PDF) ===\n\n"

    for paragraph in content['paragraphs']:
        if paragraph and len(paragraph) > 20:  # Skip very short paragraphs
            corpus_text += paragraph + "\n\n"

    with open(corpus_file, mode, encoding='utf-8') as f:
        f.write(corpus_text)

    print(f"{'Added to' if append else 'Created'} corpus file: {corpus_file}")
    print(f"Added {len(content['paragraphs'])} paragraphs, {len(corpus_text)} characters")

# Example usage
if __name__ == "__main__":
    corpus_file = "corpus.txt"

    # Ask user for source type
    source_type = input("Extract from [1] Website or [2] PDF? (Enter 1 or 2): ").strip()

    if source_type == '1':
        # Extract from website
        url = input("Enter a URL to scrape: ")
        content = extract_page_content(url)
    elif source_type == '2':
        # Extract from PDF
        pdf_path_or_url = input("Enter the path or URL to your PDF file: ")
        content = extract_pdf_content(pdf_path_or_url)
        if not content:
            print("Failed to extract content from PDF. Exiting.")
            exit()
    else:
        print("Invalid option. Exiting.")
        exit()

    # Check if content was successfully extracted
    if content is None:
        print("Failed to extract content. Exiting.")
        exit()

    # Print summary
    print(f"\nSource: {content['title']}")
    print(f"Found {len(content['paragraphs'])} paragraphs")

    # Preview the content
    preview = preview_content(content)
    print("\nPREVIEW OF CONTENT TO ADD:")
    print("-" * 50)
    print(preview)
    print("-" * 50)

    # Ask user if they want to add to corpus
    add_choice = input(f"\nDo you want to add this content to {corpus_file}? (y/n): ").lower()

    if add_choice == 'y':
        # Check if corpus file exists
        file_exists = os.path.isfile(corpus_file)

        if file_exists:
            append_choice = input(f"{corpus_file} already exists. Append to it? (y/n): ").lower()
            append = append_choice == 'y'
        else:
            append = False

        add_to_corpus(content, corpus_file, append)

    # Save the raw extraction data to a JSON file
    if content['source_type'] == 'web':
        output_file = 'page_content.json'
    else:
        output_file = 'pdf_content.json'

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(content, f, indent=4, ensure_ascii=False)
    print(f"\nRaw extraction data saved to {output_file}")

Extract from [1] Website or [2] PDF? (Enter 1 or 2): 1
Enter a URL to scrape: https://www.pittsburghpa.gov/Home

Source: Home - Pittsburgh, PA
Found 20 paragraphs

PREVIEW OF CONTENT TO ADD:
--------------------------------------------------

=== Home - Pittsburgh, PA (Web) ===

Published on March 07, 2025
FORBES AVENUE CLOSURE:  FRIDAY, MARCH 7 THROUGH MONDAY, MARCH 10
FORBES AVENUE CLOSURE:  FRIDAY, MARCH 7 THROUGH MONDAY, MARCH 10
Read Full Story









Published on March 07, 2025
Mayor Gainey proposes LOOP tax relief program to City Council
Mayor Ed Gainey submits legislation to Council creating Groundbreaking Longtime Owner Occupant Tax Exemption Program (LOOP)
Read Full Story









Published on March 06, 2025
Pittsburgh Food Justice Fund Grants Open March 10, 2025!
PITTSBURGH’S FOOD JUSTICE FUND GRASSROOTS GRANTS PROGRAM APPLICATIONS OPEN MARCH 10, 2025
Read Full Story









Published on March 04, 2025
Acting Police Chief Chris Ragland Announces Retirement
Acting Police C