<a href="https://colab.research.google.com/github/maxbilling/AgentspaceEvaluation/blob/main/Wiki_scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install requests beautifulsoup4 WeasyPrint google-cloud-storage


In [None]:
import os
import requests
from bs4 import BeautifulSoup
from weasyprint import HTML
from google.cloud import storage
import uuid
import time

# --- Configuration ---
# Name of the GCS bucket that receives the generated PDFs.
# NOTE(review): the trailing "# @param" markers look like Colab form-field
# annotations; keep them on the same line as the assignment.
BUCKET_NAME = "wikipedia_source_bucket1213092"  # @param ⚠️ CHANGE THIS to your bucket name

# Number of random articles that must be uploaded successfully before the
# scraper stops.
NUMBER_OF_ARTICLES = 40 # @param

# --- End of Configuration ---

def get_random_wiki_page():
    """Ask Wikipedia for a random article and report where the request landed.

    Requests the ``Special:Random`` page with a 10 second timeout and returns
    the URL of the final response.

    Returns:
        The final ``response.url`` as a string, or ``None`` when the request
        fails or comes back with an HTTP error status.
    """
    special_random = "https://en.wikipedia.org/wiki/Special:Random"
    try:
        with requests.get(special_random, timeout=10) as resp:
            resp.raise_for_status()
            landed_on = resp.url
    except requests.exceptions.RequestException as err:
        print(f"❌ Error fetching random page URL: {err}")
        return None
    return landed_on

def scrape_and_clean_content(url):
    """Download a Wikipedia page and build a standalone HTML document from it.

    The element with id ``mw-content-text`` is taken as the article body and
    the element with id ``firstHeading`` as its title. Navigation boxes, edit
    links and reference markers are removed from the body before it is wrapped
    in a minimal styled HTML page.

    Args:
        url: Address of the Wikipedia article to fetch.

    Returns:
        A ``(clean_html, title)`` tuple. ``title`` is the raw (unescaped)
        heading text. ``(None, None)`` is returned when the request fails or
        when either the content div or the heading cannot be found.
    """
    # Local import keeps this function self-contained; the file's top-level
    # import block does not provide the stdlib ``html`` module.
    from html import escape

    try:
        with requests.get(url, timeout=10) as response:
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
    except requests.exceptions.RequestException as e:
        print(f"❌ Error scraping content from {url}: {e}")
        return None, None

    # Get the main content div
    content_div = soup.find(id="mw-content-text")
    if content_div is None:
        print("Could not find main content div.")
        return None, None

    # Extract the title of the article. A missing heading used to raise an
    # uncaught AttributeError here; treat it like any other unusable page.
    heading = soup.find(id="firstHeading")
    if heading is None:
        print("Could not find article title heading.")
        return None, None
    title = heading.get_text()

    # Remove unwanted elements like navigation boxes, edit links, etc.
    unwanted = content_div.find_all(
        ['div', 'span', 'table', 'sup'],
        class_=['navbox', 'mw-editsection', 'reference', 'reflist'],
    )
    for element in unwanted:
        element.decompose()

    # get_text() yields plain text, so characters such as "&" or "<" must be
    # escaped again before the title is embedded in markup.
    safe_title = escape(title)

    # Construct a clean HTML string for the PDF
    # Includes the title and a simple style for better readability
    clean_html = f"""
            <html>
                <head>
                    <meta charset="UTF-8">
                    <style>
                        body {{ font-family: sans-serif; line-height: 1.6; max-width: 800px; margin: auto; padding: 20px; }}
                        h1 {{ color: #333; }}
                        img {{ max-width: 100%; height: auto; }}
                        a {{ color: #0645ad; text-decoration: none; }}
                    </style>
                </head>
                <body>
                    <h1>{safe_title}</h1>
                    {str(content_div)}
                </body>
            </html>
            """

    return clean_html, title

def upload_to_gcs(bucket_name, source_file_object, destination_blob_name):
    """Send a file-like object to Google Cloud Storage as a PDF.

    Args:
        bucket_name: Name of the destination bucket.
        source_file_object: Seekable file-like object holding the PDF bytes.
        destination_blob_name: Object name to write inside the bucket.

    Returns:
        ``True`` when the upload call completes, ``False`` when any exception
        is raised along the way (the error is printed, not re-raised).
    """
    try:
        target = storage.Client().bucket(bucket_name).blob(destination_blob_name)

        # Rewind so the upload starts at the first byte of the buffer.
        source_file_object.seek(0)
        target.upload_from_file(source_file_object, content_type='application/pdf')
    except Exception as exc:
        print(f"❌ Failed to upload to GCS: {exc}")
        return False
    return True

def main(max_consecutive_failures=10):
    """Scrape random Wikipedia articles, render them to PDF and upload them.

    Repeats the fetch -> clean -> render -> upload cycle until
    ``NUMBER_OF_ARTICLES`` PDFs have been uploaded under the
    ``wikipedia-articles/`` prefix of ``BUCKET_NAME``. Every attempt is
    preceded by a 10 second pause.

    Args:
        max_consecutive_failures: Stop early after this many failed attempts
            in a row, so a persistent problem (for example missing
            credentials or a blocked network) cannot keep the loop running
            forever. Pass ``None`` to retry without limit.
    """
    # Imported once here instead of on every loop iteration.
    from io import BytesIO

    if BUCKET_NAME == "your-gcs-bucket-name":
        print("🚨 Please change the BUCKET_NAME variable in the script!")
        return

    print(f"🚀 Starting scrape of {NUMBER_OF_ARTICLES} articles for bucket '{BUCKET_NAME}'...")

    scraped_count = 0
    consecutive_failures = 0
    while scraped_count < NUMBER_OF_ARTICLES:
        if (max_consecutive_failures is not None
                and consecutive_failures >= max_consecutive_failures):
            print(f"🛑 Aborting after {consecutive_failures} consecutive failed attempts.")
            break

        print("-" * 50)
        time.sleep(10)

        # Count the attempt as failed up front; it is reset on success below.
        consecutive_failures += 1

        page_url = get_random_wiki_page()
        if not page_url:
            continue

        html_content, title = scrape_and_clean_content(page_url)
        if not html_content or not title:
            continue

        # Convert sanitized title into a valid filename
        safe_filename = "".join(c for c in title if c.isalnum() or c in (' ', '_')).rstrip()
        pdf_filename = f"{safe_filename}_{uuid.uuid4().hex[:6]}.pdf"

        try:
            # Generate PDF in memory. base_url lets WeasyPrint resolve the
            # relative and protocol-relative image/link URLs in the article
            # markup against the page they came from.
            pdf_bytes = HTML(string=html_content, base_url=page_url).write_pdf()

            # Create an in-memory file-like object
            pdf_file_object = BytesIO(pdf_bytes)

            # Upload to GCS
            if upload_to_gcs(BUCKET_NAME, pdf_file_object, f"wikipedia-articles/{pdf_filename}"):
                scraped_count += 1
                consecutive_failures = 0
                print(f"Progress: {scraped_count}/{NUMBER_OF_ARTICLES}")

        except Exception as e:
            print(f"❌ An error occurred during PDF conversion or upload for '{title}': {e}")

    print(f"\n🎉 Finished! Scraped and uploaded {scraped_count} articles.")

# Entry point: start the scraper only when this code is executed directly
# (as a script or notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

In [None]:
!pip install google-cloud-storage google-generativeai




In [None]:
import os
import io
import csv
import datetime
import google.auth

# Import the necessary Google Cloud libraries
from google.cloud import storage
import vertexai
from vertexai.generative_models import GenerativeModel, Part


# --- Configuration ---
# ⚠️ CHANGE these to match your GCP environment and bucket
GCP_PROJECT_ID = "maxbproject"
GCP_REGION = "europe-west1" # Or your specific region
BUCKET_NAME = "wikipedia_source_bucket1213092"

# The folder where the PDFs are stored
PDF_FOLDER = "wikipedia-articles/"

# The name for the output CSV file
CSV_OUTPUT_FILENAME = "input_queries.csv"

# ⚠️ PASTE THE SERVICE ACCOUNT EMAIL you found in Step 1 here
SERVICE_ACCOUNT_EMAIL = "694598770214-compute@developer.gserviceaccount.com"


# Prompt sent to Gemini together with each PDF. The response is parsed line by
# line on the first ';', so the requested format must stay
# "search_query;expected_answer". Spelling of the instruction and of the
# fallback sentinel is kept consistent so the sentinel can be recognised
# reliably in the resulting CSV.
GEMINI_PROMPT = """
Based on the Wikipedia article attached, generate two questions that can be answered with the information in the PDF.
The questions should not be generic but detailed enough that they can be answered only with the information in the PDF.
Respond ONLY in the following format, with each Q&A pair on a new line:
search_query;expected_answer
Ensure that question and answer are always filled. If you cannot generate a question and answer pair respond with "could not generate question"; "could not generate answer".
"""
# --- End of Configuration ---

def list_pdf_blobs(storage_client, bucket_name, folder_name):
    """Collect every blob under a prefix whose name ends in ``.pdf``.

    Args:
        storage_client: Client object exposing ``list_blobs(bucket, prefix=...)``.
        bucket_name: Bucket to search.
        folder_name: Object-name prefix to restrict the listing to.

    Returns:
        A list of the matching blob objects, in listing order. The extension
        check is case-insensitive.
    """
    print(f"🔍 Searching for PDF files in gs://{bucket_name}/{folder_name}...")
    matches = []
    for candidate in storage_client.list_blobs(bucket_name, prefix=folder_name):
        if candidate.name.lower().endswith(".pdf"):
            matches.append(candidate)
    print(f"✅ Found {len(matches)} PDF files.")
    return matches

def _parse_qna_response(response_text):
    """Turn ``question;answer`` lines of a model response into Q&A dicts."""
    header = ("search_query", "expected_answer")
    pairs = []
    for line in response_text.strip().split('\n'):
        # Split on the first ';' only, so answers may themselves contain ';'.
        question, separator, answer = line.partition(';')
        question, answer = question.strip(), answer.strip()
        # Skip lines without a separator or with an empty side: the prompt
        # requires both fields to be filled.
        if not separator or not question or not answer:
            continue
        # Skip the format line if the model echoes it back as a header.
        if (question.lower(), answer.lower()) == header:
            continue
        pairs.append({"search_query": question, "expected_answer": answer})
    return pairs


def generate_qna_from_pdf(model, pdf_blob):
    """Ask Gemini for question/answer pairs about one PDF stored in GCS.

    The blob is downloaded, attached to ``GEMINI_PROMPT`` as an
    ``application/pdf`` part and sent through ``model.generate_content``.
    The text of the response is then parsed line by line.

    Args:
        model: Object exposing ``generate_content(payload)`` whose result has
            a ``text`` attribute.
        pdf_blob: Blob exposing ``name`` and ``download_as_bytes()``.

    Returns:
        A list of ``{"search_query": ..., "expected_answer": ...}`` dicts.
        The list is empty when nothing could be parsed or when any exception
        occurred (the error is printed, not re-raised).
    """
    print(f"🧠 Processing '{pdf_blob.name}' with Vertex AI Gemini...")
    try:
        pdf_content = pdf_blob.download_as_bytes()
        pdf_file_for_api = Part.from_data(data=pdf_content, mime_type="application/pdf")
        request_payload = [GEMINI_PROMPT, pdf_file_for_api]
        response = model.generate_content(request_payload)
        qna_pairs = _parse_qna_response(response.text)
        if not qna_pairs:
            print(f"⚠️ Warning: Could not parse Q&A from response for {pdf_blob.name}.")
        else:
            print(f"👍 Successfully generated {len(qna_pairs)} Q&A pairs.")
        return qna_pairs
    except Exception as e:
        print(f"❌ An error occurred while processing {pdf_blob.name}: {e}")
        return []

# MODIFIED FUNCTION
import urllib.parse

def generate_mtls_url(blob):
    """Return the ``gs://`` URI that identifies a GCS blob.

    Despite the function name, no mTLS/HTTPS URL is built: the result is
    ``gs://<bucket name>/<object name>`` with the object name used verbatim
    (spaces and other characters are not percent-encoded).

    Args:
        blob: Object exposing ``bucket.name`` and ``name``.

    Returns:
        The ``gs://`` URI as a string.
    """
    bucket_name = blob.bucket.name
    object_name = blob.name

    return f"gs://{bucket_name}/{object_name}"

def upload_csv_to_gcs(storage_client, bucket_name, destination_path, data):
    """Serialise Q&A rows to CSV in memory and store the result in GCS.

    Args:
        storage_client: Client object exposing ``bucket(name)``.
        bucket_name: Destination bucket.
        destination_path: Object name for the CSV inside the bucket.
        data: Rows as dicts with the keys ``search_query``,
            ``expected_answer`` and ``golden_url``.

    Returns:
        ``None``. When ``data`` is empty nothing is uploaded.
    """
    if not data:
        print("No data to upload. Skipping CSV creation.")
        return

    columns = ['search_query', 'expected_answer', 'golden_url']
    with io.StringIO() as buffer:
        csv_writer = csv.DictWriter(buffer, fieldnames=columns)
        csv_writer.writeheader()
        for row in data:
            csv_writer.writerow(row)
        payload = buffer.getvalue()

    target = storage_client.bucket(bucket_name).blob(destination_path)
    target.upload_from_string(payload, content_type='text/csv')
    print(f"\n✅ Successfully uploaded results to gs://{bucket_name}/{destination_path}")


def main():
    """Generate Q&A pairs for every PDF in the bucket and upload them as CSV.

    Initialises Vertex AI and a storage client, lists the PDFs under
    ``PDF_FOLDER``, asks the model for Q&A pairs per PDF, tags each pair with
    the PDF's ``golden_url`` and writes all rows to ``CSV_OUTPUT_FILENAME``
    inside the same folder. Any exception raised after the credential lookup
    is printed rather than propagated.
    """
    placeholders_present = (
        "your-gcp-project-id" in GCP_PROJECT_ID
        or "your-notebook-service-account" in SERVICE_ACCOUNT_EMAIL
    )
    if placeholders_present:
        print("🚨 Please update the GCP_PROJECT_ID and SERVICE_ACCOUNT_EMAIL variables in the Configuration section!")
        return

    # The returned values are not used further in this function; the lookup
    # itself still runs before anything else is initialised.
    _credentials, _project = google.auth.default()

    try:
        vertexai.init(project=GCP_PROJECT_ID, location=GCP_REGION)
        gemini = GenerativeModel("gemini-2.5-flash")
        gcs = storage.Client()

        pdfs = list_pdf_blobs(gcs, BUCKET_NAME, PDF_FOLDER)
        if not pdfs:
            return

        total = len(pdfs)
        rows = []
        for position, pdf in enumerate(pdfs, start=1):
            print("-" * 50)
            print(f"Processing file {position} of {total}")
            pairs = generate_qna_from_pdf(gemini, pdf)
            if not pairs:
                continue
            # Every pair from the same PDF shares that PDF's location.
            source_uri = generate_mtls_url(pdf)
            rows.extend({**pair, "golden_url": source_uri} for pair in pairs)

        csv_target = f"{PDF_FOLDER.strip('/')}/{CSV_OUTPUT_FILENAME}"
        upload_csv_to_gcs(gcs, BUCKET_NAME, csv_target, rows)

    except Exception as err:
        print(f"\nAn unexpected error occurred during initialization or execution: {err}")

# --- Run the main function ---
# NOTE(review): this call is unconditional (there is no
# `if __name__ == "__main__"` guard here), so the Q&A generation also runs if
# this code is ever imported as a module.
main()





🔍 Searching for PDF files in gs://wikipedia_source_bucket1213092/wikipedia-articles/...
✅ Found 96 PDF files.
--------------------------------------------------
Processing file 1 of 96
🧠 Processing 'wikipedia-articles/2002 US Open  Womens singles qualifying_a96b60.pdf' with Vertex AI Gemini...
👍 Successfully generated 2 Q&A pairs.
--------------------------------------------------
Processing file 2 of 96
🧠 Processing 'wikipedia-articles/2022 European Speed Skating Championships  Womens team sprint_abd5c1.pdf' with Vertex AI Gemini...
👍 Successfully generated 2 Q&A pairs.
--------------------------------------------------
Processing file 3 of 96
🧠 Processing 'wikipedia-articles/40th Army Soviet Union_4b5e96.pdf' with Vertex AI Gemini...
👍 Successfully generated 2 Q&A pairs.
--------------------------------------------------
Processing file 4 of 96
🧠 Processing 'wikipedia-articles/A Fighting Chance memoir_18155c.pdf' with Vertex AI Gemini...
👍 Successfully generated 2 Q&A pairs.
--------