<a href="https://colab.research.google.com/github/maxbilling/AgentspaceEvaluation/blob/main/Wiki_scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install requests beautifulsoup4 WeasyPrint google-cloud-storage


In [None]:
import sys

# Authenticate against Google Cloud only when running inside Google Colab;
# outside Colab this cell is a no-op.
if "google.colab" in sys.modules:
    from google.colab import auth

    # `project` is not assigned anywhere in this notebook, so referencing it
    # directly raised NameError. Look it up instead: if an earlier cell or
    # session defined `project`, it is still used; otherwise None is passed
    # and no explicit project is requested.
    auth.authenticate_user(project_id=globals().get("project"))

In [None]:
import os
import requests
from bs4 import BeautifulSoup
from weasyprint import HTML
from google.cloud import storage
import uuid
import time

# --- Configuration ---
# Name of the Google Cloud Storage bucket that receives the generated PDFs.
# main() refuses to run only when this is still the placeholder
# "your-gcs-bucket-name"; any other value is used as-is.
BUCKET_NAME = "source_bucket1213092"  # @param ⚠️ CHANGE THIS to your bucket name

# Target number of random articles to scrape, convert and upload.
NUMBER_OF_ARTICLES = 40 # @param

# --- End of Configuration ---

def get_random_wiki_page():
    """Return the URL of a random English Wikipedia page.

    Requests ``Special:Random`` and reads the final URL of the response,
    i.e. the address the request ended up at after any redirects.

    Returns:
        The resolved page URL as a string, or ``None`` when the request
        fails (the error is printed rather than raised).
    """
    random_url = "https://en.wikipedia.org/wiki/Special:Random"
    # Wikimedia's User-Agent policy asks automated clients to identify
    # themselves; generic library user agents may be rejected (HTTP 403).
    headers = {
        "User-Agent": (
            "WikiScraperNotebook/1.0 "
            "(https://github.com/maxbilling/AgentspaceEvaluation)"
        )
    }
    try:
        with requests.get(random_url, headers=headers, timeout=10) as response:
            response.raise_for_status()
            return response.url
    except requests.exceptions.RequestException as e:
        print(f"❌ Error fetching random page URL: {e}")
        return None

def scrape_and_clean_content(url):
    """Scrape a Wikipedia page and build a standalone HTML document from it.

    Downloads ``url``, extracts the ``mw-content-text`` element and the
    ``firstHeading`` title, strips navigation boxes, edit links and reference
    markers, and wraps the result in a minimal styled HTML page suitable for
    PDF conversion.

    Args:
        url: Address of the Wikipedia page to scrape.

    Returns:
        A ``(clean_html, title)`` tuple, where ``title`` is the plain-text
        heading. Returns ``(None, None)`` when the request fails or the page
        lacks the expected content or heading element (a message is printed).
    """
    from html import escape

    # Wikimedia's User-Agent policy asks automated clients to identify
    # themselves; generic library user agents may be rejected (HTTP 403).
    headers = {
        "User-Agent": (
            "WikiScraperNotebook/1.0 "
            "(https://github.com/maxbilling/AgentspaceEvaluation)"
        )
    }
    try:
        with requests.get(url, headers=headers, timeout=10) as response:
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            # Get the main content div
            content_div = soup.find(id="mw-content-text")
            if content_div is None:
                print("Could not find main content div.")
                return None, None

            # Extract the title of the article. A missing heading used to
            # raise AttributeError, which the handler below does not catch.
            heading = soup.find(id="firstHeading")
            if heading is None:
                print("Could not find article heading.")
                return None, None
            title = heading.get_text()

            # Remove unwanted elements like navigation boxes, edit links, etc.
            unwanted = content_div.find_all(
                ['div', 'span', 'table', 'sup'],
                class_=['navbox', 'mw-editsection', 'reference', 'reflist'],
            )
            for element in unwanted:
                element.decompose()

            # get_text() yields decoded plain text, so characters such as
            # "&" or "<" must be re-escaped before going back into markup.
            # The unescaped title is still what gets returned to the caller.
            safe_title = escape(title)

            # Construct a clean HTML string for the PDF
            # Includes the title and a simple style for better readability
            clean_html = f"""
            <html>
                <head>
                    <meta charset="UTF-8">
                    <style>
                        body {{ font-family: sans-serif; line-height: 1.6; max-width: 800px; margin: auto; padding: 20px; }}
                        h1 {{ color: #333; }}
                        img {{ max-width: 100%; height: auto; }}
                        a {{ color: #0645ad; text-decoration: none; }}
                    </style>
                </head>
                <body>
                    <h1>{safe_title}</h1>
                    {str(content_div)}
                </body>
            </html>
            """

            return clean_html, title

    except requests.exceptions.RequestException as e:
        print(f"❌ Error scraping content from {url}: {e}")
        return None, None

def upload_to_gcs(bucket_name, source_file_object, destination_blob_name):
    """Upload an in-memory file object to a GCS bucket as a PDF.

    Args:
        bucket_name: Name of the destination bucket.
        source_file_object: Seekable file-like object holding the data; it is
            rewound to the start before being uploaded.
        destination_blob_name: Object name (path) to create inside the bucket.

    Returns:
        ``True`` when the upload completes, ``False`` when any exception is
        raised along the way (the error is printed, not propagated).
    """
    try:
        target_blob = (
            storage.Client()
            .bucket(bucket_name)
            .blob(destination_blob_name)
        )

        # The caller may have left the cursor anywhere; upload from byte 0.
        source_file_object.seek(0)
        target_blob.upload_from_file(
            source_file_object,
            content_type='application/pdf',
        )
    except Exception as err:
        print(f"❌ Failed to upload to GCS: {err}")
        return False
    else:
        return True

def main():
    """Scrape random Wikipedia articles, convert them to PDF and upload them.

    Repeats until ``NUMBER_OF_ARTICLES`` PDFs have been uploaded to
    ``BUCKET_NAME`` under the ``wikipedia-articles/`` prefix. Each attempt
    waits 10 seconds, picks a random page, scrapes it, renders it to PDF in
    memory and uploads it. Failed attempts are skipped and retried with a new
    random page; the run stops early after too many consecutive failures so a
    persistent problem (no network, bad bucket) cannot loop forever.
    """
    from io import BytesIO

    if BUCKET_NAME == "your-gcs-bucket-name":
        print("🚨 Please change the BUCKET_NAME variable in the script!")
        return

    print(f"🚀 Starting scrape of {NUMBER_OF_ARTICLES} articles for bucket '{BUCKET_NAME}'...")

    max_consecutive_failures = 10
    consecutive_failures = 0
    scraped_count = 0
    while scraped_count < NUMBER_OF_ARTICLES:
        if consecutive_failures >= max_consecutive_failures:
            print(f"🛑 Giving up after {consecutive_failures} consecutive failed attempts.")
            break

        print("-" * 50)
        time.sleep(10)

        # Count the attempt as failed up front; it is reset on a successful
        # upload. This keeps every early `continue` below accounted for.
        consecutive_failures += 1

        page_url = get_random_wiki_page()
        if not page_url:
            continue

        html_content, title = scrape_and_clean_content(page_url)
        if not html_content or not title:
            continue

        # Convert sanitized title into a valid filename; the random suffix
        # keeps names unique even when two titles sanitize to the same text.
        safe_filename = "".join(c for c in title if c.isalnum() or c in (' ', '_')).rstrip()
        pdf_filename = f"{safe_filename}_{uuid.uuid4().hex[:6]}.pdf"

        try:
            # Generate PDF in memory. base_url lets WeasyPrint resolve the
            # page's relative and protocol-relative URLs (images, links),
            # which cannot be resolved from a bare HTML string.
            pdf_bytes = HTML(string=html_content, base_url=page_url).write_pdf()

            # Create an in-memory file-like object
            pdf_file_object = BytesIO(pdf_bytes)

            # Upload to GCS
            if upload_to_gcs(BUCKET_NAME, pdf_file_object, f"wikipedia-articles/{pdf_filename}"):
                scraped_count += 1
                consecutive_failures = 0
                print(f"Progress: {scraped_count}/{NUMBER_OF_ARTICLES}")

        except Exception as e:
            print(f"❌ An error occurred during PDF conversion or upload for '{title}': {e}")

    print(f"\n🎉 Finished! Scraped and uploaded {scraped_count} articles.")

if __name__ == "__main__":
    main()