In [3]:
import requests
from bs4 import BeautifulSoup
import os
import time
import random
import re

# Dry-run switch: when True the script only lists URLs and downloads nothing
TEST_MODE = False

# Category name -> BBC landing page that lists that category's articles
categories = dict(
    technology="https://www.bbc.com/innovation/technology",
    innovation="https://www.bbc.com/innovation",
    arts="https://www.bbc.com/arts",
    travel="https://www.bbc.com/travel",
    business="https://www.bbc.com/business",
)

# Function to get article links from a category page
def get_article_links(url, limit=15, timeout=10):
    """Collect unique BBC article URLs linked from a category page.

    Args:
        url: Address of the category page to scan.
        limit: Maximum number of links to return (``None`` returns all).
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        Absolute article URLs in page order, without duplicates, at most
        ``limit`` of them.

    Raises:
        requests.RequestException: if the page cannot be fetched, including
            when the request exceeds ``timeout``.
    """
    # Function-scope import so the cell's import list does not need to change
    from urllib.parse import urljoin, urlparse

    # Without a timeout, requests waits indefinitely on a stalled connection
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    links = []
    seen = set()  # constant-time duplicate check

    # Find all links on the page
    for anchor in soup.find_all('a', href=True):
        href = anchor['href']

        # Only links matching the pattern www.bbc.com/*/articles/* are of interest
        if '/articles/' not in href:
            continue

        try:
            # urljoin resolves relative and protocol-relative ("//host/path")
            # hrefs; plain string concatenation mangles the latter
            absolute = urljoin('https://www.bbc.com/', href)
            host = (urlparse(absolute).hostname or '').lower()
        except ValueError:
            # Malformed href (e.g. a broken IPv6 literal); skip it
            continue

        # Compare the real host rather than searching for the substring
        # "bbc.com", which would also match foreign URLs that mention it
        if host != 'bbc.com' and not host.endswith('.bbc.com'):
            continue

        if absolute not in seen:
            seen.add(absolute)
            links.append(absolute)

    return links[:limit]

# Extract the first segment from URL for filename
def extract_segment(url):
    """Return the path segment that sits directly before ``/articles/``.

    The URL is expected to look like ``bbc.com/<segment>/articles/...``;
    when it does not, the placeholder ``"unknown"`` is returned instead.
    """
    found = re.search(r'bbc\.com/([^/]+)/articles/', url)
    return found.group(1) if found else "unknown"

# Process each category: list its article links, then either print them
# (TEST_MODE) or save each article's HTML under bbc_articles/<category>/
MAX_ARTICLES_PER_CATEGORY = 15  # hard cap on pages handled per category
REQUEST_TIMEOUT = 10            # seconds; without it a stalled connection blocks forever

for category, url in categories.items():
    print(f"Category: {category} - {url}")

    # Create category folder if not in test mode
    if not TEST_MODE:
        os.makedirs(f"bbc_articles/{category}", exist_ok=True)

    # Get article links; a category page that cannot be fetched is reported
    # and skipped so the remaining categories are still processed
    try:
        article_links = get_article_links(url)[:MAX_ARTICLES_PER_CATEGORY]
    except Exception as e:
        print(f"Error fetching category page {url}: {e}")
        article_links = []

    # Print or save articles
    for count, link in enumerate(article_links):
        # Extract the first segment for the filename
        segment = extract_segment(link)

        if TEST_MODE:
            # Just print the URL and segment
            print(f"  Would download: {link}")
            print(f"  First segment: {segment}")
            continue

        try:
            # Request the article page
            response = requests.get(link, timeout=REQUEST_TIMEOUT)
            # Treat 4xx/5xx as failures instead of saving the error page as an article
            response.raise_for_status()

            # Create filename using category, segment and count
            filename = f"{category}_{segment}_{count}.html"

            # Save the complete HTML
            with open(f"bbc_articles/{category}/{filename}", "w", encoding="utf-8") as f:
                f.write(response.text)

            print(f"Saved {filename} - {link}")

        except Exception as e:
            print(f"Error with {link}: {e}")

        finally:
            # Be nice to the server, also after a failed request
            time.sleep(random.uniform(1, 3))

    # Number of URLs handled for this category, whether or not each download succeeded
    count = len(article_links)
    print(f"Completed {category}: {count} article URLs found\n")

Category: technology - https://www.bbc.com/innovation/technology
Saved technology_news_0.html - https://www.bbc.com/news/articles/c5ymvjjqzmeo
Saved technology_news_1.html - https://www.bbc.com/news/articles/c0q1w9q1qvyo
Saved technology_news_2.html - https://www.bbc.com/news/articles/cvgd9v3r69qo
Saved technology_news_3.html - https://www.bbc.com/news/articles/cn524lx9445o
Saved technology_news_4.html - https://www.bbc.com/news/articles/c1kjmm3n427o
Saved technology_news_5.html - https://www.bbc.com/news/articles/cgq90vv9zv5o
Saved technology_news_6.html - https://www.bbc.com/news/articles/clyjv8e49deo
Saved technology_news_7.html - https://www.bbc.com/news/articles/cy87076pdw3o
Saved technology_news_8.html - https://www.bbc.com/news/articles/cwygwnvdq23o
Saved technology_news_9.html - https://www.bbc.com/news/articles/c625z3xgyv1o
Saved technology_news_10.html - https://www.bbc.com/news/articles/c89yxk1egkgo
Saved technology_news_11.html - https://www.bbc.com/news/articles/cy05n9xn1n

In [8]:
import os
import subprocess
import concurrent.futures
import time

# Lock guarding the shared tallies that worker threads update
from threading import Lock
counter_lock = Lock()

# Root folder of the downloaded pages and the category sub-folders to scan
base_dir = "./bbc_articles"
categories = ["arts", "business", "innovation", "technology", "travel"]

# Tallies start at zero: files discovered, PDFs produced, failed conversions
total_files = pdf_converted = html_errors = 0

def convert_html_to_pdf(
    html_path,
    browser_path="/Applications/Brave Browser.app/Contents/MacOS/Brave Browser",
    timeout=120,
):
    """Convert an HTML file to PDF using a headless Brave Browser.

    The PDF is written next to the source file with the extension changed to
    ``.pdf``. The source HTML is deleted only when the browser exits with
    status 0 and a non-empty PDF exists afterwards. The module-level counters
    ``pdf_converted`` / ``html_errors`` are updated under ``counter_lock``.

    Args:
        html_path: Path of the HTML file to convert.
        browser_path: Executable to launch; it is invoked with ``--headless``,
            ``--disable-gpu`` and ``--print-to-pdf=<path>``.
        timeout: Seconds to wait for the browser before treating the
            conversion as failed (``None`` waits indefinitely).

    Returns:
        True when the PDF was produced and the HTML removed, otherwise False.
        Failures are printed and counted; they are not raised.
    """
    # Function-scope import so the cell's import list does not need to change
    from pathlib import Path

    global pdf_converted, html_errors

    html_file = os.path.basename(html_path)

    # Absolute paths keep the result independent of the browser's working directory
    source = Path(os.path.abspath(html_path))
    # with_suffix swaps only the final extension, so the PDF path can never equal
    # the HTML path (str.replace left non-".html" names unchanged)
    pdf_path = source.with_suffix('.pdf')
    # as_uri() percent-encodes spaces and characters such as '#' or '?'
    file_url = source.as_uri()

    try:
        # Use Brave to convert HTML to PDF
        cmd = [
            browser_path,
            "--headless",
            "--disable-gpu",
            f"--print-to-pdf={pdf_path}",
            file_url
        ]

        print(f"Converting {html_file} to PDF...")
        process = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)

        # An exit status of 0 alone is not proof that a PDF was written
        pdf_written = pdf_path.is_file() and pdf_path.stat().st_size > 0

        if process.returncode == 0 and pdf_written:
            # Remove the original HTML file only after a verified conversion
            os.remove(html_path)
            with counter_lock:
                pdf_converted += 1
            print(f"✅ Successfully converted {html_file} to PDF")
            return True

        if process.returncode != 0:
            print(f"❌ Error converting {html_file}: {process.stderr}")
        else:
            print(f"❌ Error converting {html_file}: browser exited successfully but wrote no PDF")
        with counter_lock:
            html_errors += 1
        return False

    except Exception as e:
        # Covers a missing executable, a timeout, and filesystem errors
        print(f"❌ Exception converting {html_file}: {str(e)}")
        with counter_lock:
            html_errors += 1
        return False

# Gather every .html file from each category folder that exists on disk
all_html_files = []
for category in categories:
    category_dir = os.path.join(base_dir, category)
    if os.path.exists(category_dir):
        html_files = [
            os.path.join(category_dir, entry)
            for entry in os.listdir(category_dir)
            if entry.endswith('.html')
        ]
        all_html_files += html_files

total_files = len(all_html_files)
print(f"Found {total_files} HTML files to convert")

# Fan the conversions out over a small thread pool
# (4 workers for an M1 MacBook Air; adjust as needed)
start_time = time.time()
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    # Queue one task per file, remembering which path each future belongs to
    future_to_path = {}
    for html_path in all_html_files:
        future_to_path[executor.submit(convert_html_to_pdf, html_path)] = html_path

    # Drain the futures in completion order
    for future in concurrent.futures.as_completed(future_to_path):
        html_path = future_to_path[future]
        try:
            future.result()  # re-raises anything the worker raised
        except Exception as e:
            print(f"Task exception: {e}")

end_time = time.time()
elapsed_time = end_time - start_time

# Final report: one line per figure, preceded by a blank line
print(
    "\nSummary:",
    f"Total HTML files: {total_files}",
    f"Successfully converted to PDF: {pdf_converted}",
    f"Errors: {html_errors}",
    f"Time taken: {elapsed_time:.2f} seconds",
    sep="\n",
)

Found 54 HTML files to convert
Converting arts_news_4.html to PDF...
Converting arts_news_8.html to PDF...
Converting arts_news_9.html to PDF...
Converting arts_news_5.html to PDF...
✅ Successfully converted arts_news_8.html to PDF
Converting arts_news_6.html to PDF...
✅ Successfully converted arts_news_4.html to PDF
✅ Successfully converted arts_news_9.html to PDF
Converting arts_news_7.html to PDF...
Converting arts_news_0.html to PDF...
✅ Successfully converted arts_news_5.html to PDF
Converting arts_news_1.html to PDF...
✅ Successfully converted arts_news_6.html to PDF
Converting business_news_0.html to PDF...
✅ Successfully converted arts_news_7.html to PDF
Converting business_news_11.html to PDF...
✅ Successfully converted arts_news_0.html to PDF
Converting business_news_10.html to PDF...
✅ Successfully converted arts_news_1.html to PDF
Converting business_news_1.html to PDF...
✅ Successfully converted business_news_0.html to PDF
Converting business_news_6.html to PDF...
✅ Succes