<a href="https://colab.research.google.com/github/maxbilling/AgentspaceEvaluation/blob/main/QuestionGenerator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Automatic Question Answer Pair Generation from GCS source files

This notebook automates the creation of a question-and-answer dataset from PDF files stored in Google Cloud Storage (GCS). It iterates through each PDF under a configured folder of a GCS bucket, using a Vertex AI Gemini model (`gemini-2.5-flash`) to read the document's content and generate two context-specific questions and their corresponding answers. It then compiles the generated pairs, along with the `gs://` URI of the source PDF, into a single CSV file. Finally, it uploads this CSV back to the same folder of the GCS bucket, providing a ready-to-use dataset for evaluating search or question-answering systems.

In [None]:
!pip install google-cloud-storage google-generativeai

In [None]:
import sys

# Only relevant inside Google Colab: sign the notebook user in so the Google
# Cloud client libraries used later can pick up the user's credentials.
if "google.colab" in sys.modules:
    from google.colab import auth

    # No project is passed here: the project is configured further down
    # (GCP_PROJECT_ID) and no project variable exists yet when this cell runs,
    # so referencing one would raise NameError.
    auth.authenticate_user()

In [None]:
import os
import io
import csv
import datetime
import google.auth

# Import the necessary Google Cloud libraries
from google.cloud import storage
import vertexai
from vertexai.generative_models import GenerativeModel, Part


# --- Configuration ---
# ⚠️ CHANGE these to match your GCP environment and bucket
# Project passed to vertexai.init() in main().
GCP_PROJECT_ID = ""  # @param
# Location passed to vertexai.init() in main().
GCP_REGION = "europe-west1"  # Or your specific region
# Bucket that holds the source PDFs and receives the generated CSV.
BUCKET_NAME = ""  # @param

# The folder where the PDFs are stored (used as the object-name prefix when
# listing, and as the destination folder for the CSV).
PDF_FOLDER = "wikipedia-articles/"

# The name for the output CSV file
CSV_OUTPUT_FILENAME = "input_queries.csv"


# Instruction sent to the model together with each PDF. The response is parsed
# line by line, splitting each line on the first ';' into question and answer,
# so the "search_query;expected_answer" format line must stay in sync with
# that parsing.
GEMINI_PROMPT = """
Based on the Wikipedia article attached, generate two questions that can be answered with the information in the PDF.
The questions should not be generic but detailed enough that they can be answered only with the information in the PDF.
Respond ONLY in the following format, with each Q&A pair on a new line:
search_query;expected_answer
Ensure that question and answer are always filled. If you cannot generate a question and answer pair respond with "could not generate question"; "could not generate answer".
"""
# --- End of Configuration ---

def list_pdf_blobs(storage_client, bucket_name, folder_name):
    """Collect every PDF object stored under a folder prefix of a bucket.

    Args:
        storage_client: Client exposing ``list_blobs(bucket_name, prefix=...)``.
        bucket_name: Name of the bucket to scan.
        folder_name: Object-name prefix that limits the listing.

    Returns:
        A list of the listed blobs whose name ends in ``.pdf`` (compared
        case-insensitively), in listing order.
    """
    print(f"🔍 Searching for PDF files in gs://{bucket_name}/{folder_name}...")

    matches = []
    for candidate in storage_client.list_blobs(bucket_name, prefix=folder_name):
        # Lower-case the name so that ".PDF" and ".pdf" are both accepted.
        if candidate.name.lower().endswith(".pdf"):
            matches.append(candidate)

    print(f"✅ Found {len(matches)} PDF files.")
    return matches

def _parse_qna_response(text):
    """Turn the model's ``question;answer`` lines into a list of Q&A dicts."""
    pairs = []
    for line in text.strip().split('\n'):
        # Split on the first ';' only, so an answer may itself contain ';'.
        question, separator, answer = line.partition(';')
        question, answer = question.strip(), answer.strip()
        # Ignore lines without a separator and pairs with a blank side: an
        # empty question or answer is useless as an evaluation row.
        if not separator or not question or not answer:
            continue
        # Skip the format line in case the model echoes it back from the prompt.
        if (question.lower(), answer.lower()) == ("search_query", "expected_answer"):
            continue
        pairs.append({"search_query": question, "expected_answer": answer})
    return pairs


def generate_qna_from_pdf(model, pdf_blob):
    """Ask the model for question/answer pairs about a single PDF blob.

    The blob is downloaded, sent to the model together with GEMINI_PROMPT, and
    the text response is parsed line by line as ``question;answer``.

    Args:
        model: Object exposing ``generate_content(payload)`` whose result has
            a ``text`` attribute.
        pdf_blob: Blob exposing ``name`` and ``download_as_bytes()``.

    Returns:
        A list of ``{"search_query": ..., "expected_answer": ...}`` dicts. The
        list is empty when nothing could be parsed or when any step raised;
        errors are printed, not propagated.
    """
    print(f"🧠 Processing '{pdf_blob.name}' with Vertex AI Gemini...")
    try:
        pdf_content = pdf_blob.download_as_bytes()
        pdf_file_for_api = Part.from_data(data=pdf_content, mime_type="application/pdf")
        response = model.generate_content([GEMINI_PROMPT, pdf_file_for_api])
        qna_pairs = _parse_qna_response(response.text)
    except Exception as e:
        # Best effort: one failing document must not abort the whole batch.
        print(f"❌ An error occurred while processing {pdf_blob.name}: {e}")
        return []

    if not qna_pairs:
        print(f"⚠️ Warning: Could not parse Q&A from response for {pdf_blob.name}.")
    else:
        print(f"👍 Successfully generated {len(qna_pairs)} Q&A pairs.")
    return qna_pairs

# MODIFIED FUNCTION
import urllib.parse

def generate_mtls_url(blob):
    """Return the ``gs://`` URI of a GCS blob.

    Despite its name, this function does not build an mTLS/HTTPS URL. It
    returns ``gs://<bucket>/<object>`` with the object name left exactly as
    stored: no percent-encoding is applied, so spaces and other special
    characters appear verbatim in the result.

    Args:
        blob: Object exposing ``name`` and ``bucket.name``.

    Returns:
        The string ``gs://{bucket name}/{object name}``.
    """
    return f"gs://{blob.bucket.name}/{blob.name}"

def upload_csv_to_gcs(storage_client, bucket_name, destination_path, data):
    """Serialize result rows to CSV in memory and upload them to a bucket.

    Args:
        storage_client: Client exposing ``bucket(name)``; the returned bucket
            must expose ``blob(path)``.
        bucket_name: Name of the destination bucket.
        destination_path: Object name to write inside the bucket.
        data: Rows as dicts with the keys ``search_query``,
            ``expected_answer`` and ``golden_url``.

    Returns:
        None. When ``data`` is empty nothing is uploaded.
    """
    if not data:
        print("No data to upload. Skipping CSV creation.")
        return

    columns = ['search_query', 'expected_answer', 'golden_url']
    with io.StringIO() as buffer:
        csv_writer = csv.DictWriter(buffer, fieldnames=columns)
        csv_writer.writeheader()
        csv_writer.writerows(data)
        # Read the text out before the buffer is closed by the with-block.
        payload = buffer.getvalue()

    target = storage_client.bucket(bucket_name).blob(destination_path)
    target.upload_from_string(payload, content_type='text/csv')
    print(f"\n✅ Successfully uploaded results to gs://{bucket_name}/{destination_path}")


def main():
    """Generate Q&A pairs for every PDF in the configured folder and upload a CSV.

    Steps: initialise Vertex AI and the storage client, list the PDFs under
    PDF_FOLDER in BUCKET_NAME, ask the model for Q&A pairs per PDF, attach the
    source URI to each pair as ``golden_url``, and upload all rows as
    CSV_OUTPUT_FILENAME into the same folder.

    Returns:
        None. Any exception raised along the way is printed, not propagated.
    """
    if not BUCKET_NAME:
        # Fail early with a clear message instead of an opaque API error.
        print("⚠️ BUCKET_NAME is empty. Set it in the configuration section before running.")
        return

    try:
        # Use the project of the default credentials only as a fallback when
        # no project has been configured explicitly.
        _, detected_project = google.auth.default()
        project_id = GCP_PROJECT_ID or detected_project

        vertexai.init(project=project_id, location=GCP_REGION)
        model = GenerativeModel("gemini-2.5-flash")
        # Keep the client's own project detection when no project is known.
        storage_client = storage.Client(project=project_id) if project_id else storage.Client()

        pdf_blobs = list_pdf_blobs(storage_client, BUCKET_NAME, PDF_FOLDER)
        if not pdf_blobs:
            return

        all_results = []
        for position, blob in enumerate(pdf_blobs, start=1):
            print("-" * 50)
            print(f"Processing file {position} of {len(pdf_blobs)}")
            qna_list = generate_qna_from_pdf(model, blob)
            if not qna_list:
                continue
            golden_url = generate_mtls_url(blob)
            all_results.extend({**item, "golden_url": golden_url} for item in qna_list)

        # Avoid a leading '/' in the object name when no folder is configured.
        folder = PDF_FOLDER.strip('/')
        csv_destination_path = f"{folder}/{CSV_OUTPUT_FILENAME}" if folder else CSV_OUTPUT_FILENAME
        upload_csv_to_gcs(storage_client, BUCKET_NAME, csv_destination_path, all_results)

    except Exception as e:
        # Top-level boundary of the notebook: report the failure, do not crash the cell.
        print(f"\nAn unexpected error occurred during initialization or execution: {e}")

# --- Run the main function ---
# Guarded so the pipeline runs when this is executed as a notebook cell or as
# a script, but not when the file is imported as a module.
if __name__ == "__main__":
    main()

