# Generate Evaluation Questions

Sample random Wikipedia article titles from an Elasticsearch index and generate evaluation questions for them using a Cohere LLM.

In [None]:
import os
import random
import yaml
import cohere
from elasticsearch import Elasticsearch
from dotenv import load_dotenv

# Load settings from a .env file (if one is present) before reading them below.
load_dotenv()

ELASTICSEARCH_ENDPOINT = os.getenv("ELASTICSEARCH_ENDPOINT")
ELASTICSEARCH_API_KEY = os.getenv("ELASTICSEARCH_API_KEY")

# INDEX_NAME is the only setting that is transformed here, so validate it
# explicitly: calling .lower() on a missing value would otherwise fail with
# an unhelpful "'NoneType' object has no attribute 'lower'".
_raw_index_name = os.getenv("INDEX_NAME")
if _raw_index_name is None:
    raise RuntimeError(
        "INDEX_NAME environment variable is not set (check your .env file)"
    )
INDEX_NAME = _raw_index_name.lower()

COHERE_API_KEY = os.getenv("COHERE_API_KEY")

In [None]:
# Build the Elasticsearch and Cohere clients from the loaded settings
es_client = Elasticsearch(ELASTICSEARCH_ENDPOINT, api_key=ELASTICSEARCH_API_KEY)
co = cohere.Client(COHERE_API_KEY)

# Asking the cluster for its version doubles as a connectivity check
es_version = es_client.info()['version']['number']
print(f"Connected to Elasticsearch: {es_version}")
print(f"Index: {INDEX_NAME}")

In [None]:
def get_random_titles(n: int = 1000) -> list[str]:
    """Retrieve up to n random unique article titles from the index.

    Args:
        n: Maximum number of titles to return. Values <= 0 yield an empty
            list without querying Elasticsearch.

    Returns:
        A shuffled list of at most ``n`` distinct, non-empty string titles.
        Fewer are returned when the oversampled hits contain fewer than
        ``n`` unique titles.
    """
    if n <= 0:
        return []

    # Score every document randomly; a fresh seed on each call changes the sample.
    response = es_client.search(
        index=INDEX_NAME,
        body={
            "size": n * 3,  # Oversample so enough titles survive deduplication
            "query": {
                "function_score": {
                    "query": {"match_all": {}},
                    "random_score": {"seed": random.randint(1, 100000), "field": "_seq_no"}
                }
            },
            "_source": ["title"]
        }
    )

    # Deduplicate, skipping hits whose title is missing, empty, or not a string
    # so one malformed document cannot abort the whole run.
    unique_titles = set()
    for hit in response["hits"]["hits"]:
        title = hit.get("_source", {}).get("title")
        if isinstance(title, str) and title:
            unique_titles.add(title)

    titles = list(unique_titles)
    random.shuffle(titles)

    return titles[:n]

# Fetch the working set of titles, then show the first few as a sanity check
titles = get_random_titles(1000)
print(f"Retrieved {len(titles)} unique titles")

preview_titles = titles[:10]
print(f"\nSample titles:")
for sample_title in preview_titles:
    print(f"  - {sample_title}")

In [None]:
# Prompt sent to the LLM by generate_questions_batch. It is filled in with
# str.format, so `{titles}` is the only placeholder and any other literal
# brace added to this text would have to be doubled.
# NOTE(review): the example output uses unquoted YAML values; a title or
# question containing ": " copied verbatim into that format would not parse
# as YAML - consider asking the model to quote values. TODO confirm in practice.
PROMPT_TEMPLATE = """You are given a list of Wikipedia article titles. For each title, generate exactly 3 questions that someone might ask to find information from that article.

Guidelines:
- Questions should be specific and factual (e.g., "Where was X born?", "When did X start?", "What is the population of X?")
- Questions can also be broad but concrete (e.g., "List popular football players from the 90s", "What are the main exports of X?")
- Do NOT use vague questions like "What's so good about X?" or "Why is X important?"
- Questions should be answerable from a Wikipedia article

Return ONLY a YAML list in this exact format (no other text):

- title: Article Title 1
  question: First question about Article 1?
- title: Article Title 1
  question: Second question about Article 1?
- title: Article Title 1
  question: Third question about Article 1?
- title: Article Title 2
  question: First question about Article 2?
...

Article titles:
{titles}
"""

def generate_questions_batch(titles_batch: list[str]) -> list[dict]:
    """Generate questions for a batch of titles using Cohere.

    Args:
        titles_batch: Article titles to include in a single prompt.

    Returns:
        A list of mappings that each contain a ``title`` and a ``question``
        key, parsed from the model's YAML reply. Entries of any other shape
        are dropped. An empty list is returned when the batch is empty, the
        reply is not valid YAML, or the reply is not a YAML list.
    """
    # Nothing to ask about: skip the API call entirely.
    if not titles_batch:
        return []

    titles_str = "\n".join(f"- {t}" for t in titles_batch)
    prompt = PROMPT_TEMPLATE.format(titles=titles_str)

    response = co.chat(
        model="command-r-plus",
        message=prompt,
        temperature=0.3
    )

    response_text = response.text.strip()

    # Remove a markdown code fence if present, along with its language tag
    if response_text.startswith("```"):
        response_text = response_text.split("```")[1]
        for fence_tag in ("yaml", "yml"):
            if response_text.startswith(fence_tag):
                response_text = response_text[len(fence_tag):]
                break

    try:
        parsed = yaml.safe_load(response_text)
    except yaml.YAMLError as e:
        print(f"YAML parse error: {e}")
        print(f"Response: {response_text[:500]}")
        return []

    # safe_load returns whatever the document holds (str, dict, None, ...).
    # Only a list is usable here; returning anything else would make the
    # caller's list.extend() add characters or keys instead of questions.
    if not isinstance(parsed, list):
        if parsed:
            print(f"Unexpected response structure: {type(parsed).__name__}")
            print(f"Response: {response_text[:500]}")
        return []

    # Keep only entries shaped the way downstream code indexes them.
    questions = [
        entry
        for entry in parsed
        if isinstance(entry, dict) and "title" in entry and "question" in entry
    ]
    dropped = len(parsed) - len(questions)
    if dropped:
        print(f"Skipped {dropped} malformed entries")

    return questions

In [None]:
# Smoke-test question generation on the first three titles before the full run
test_batch = titles[:3]
print(f"Testing with: {test_batch}")

test_questions = generate_questions_batch(test_batch)
print(f"\nGenerated {len(test_questions)} questions:")

for generated in test_questions:
    print(f"  [{generated['title']}] {generated['question']}")

In [None]:
# Generate questions for all titles, BATCH_SIZE titles per LLM call
BATCH_SIZE = 10
all_questions = []

# Ceiling division. It depends only on len(titles), so compute it once
# instead of on every iteration.
total_batches = (len(titles) + BATCH_SIZE - 1) // BATCH_SIZE

for batch_num, i in enumerate(range(0, len(titles), BATCH_SIZE), start=1):
    batch = titles[i:i + BATCH_SIZE]

    print(f"Processing batch {batch_num}/{total_batches}...")

    questions = generate_questions_batch(batch)
    all_questions.extend(questions)

    print(f"  Generated {len(questions)} questions (total: {len(all_questions)})")

print(f"\nTotal questions generated: {len(all_questions)}")

In [None]:
# Save questions to YAML file
output_path = "../data/evaluation_questions.yaml"

# allow_unicode=True writes non-ASCII characters verbatim, so pin the file
# encoding to UTF-8 instead of relying on the platform's locale default,
# which may be unable to encode them.
with open(output_path, "w", encoding="utf-8") as f:
    yaml.dump(all_questions, f, default_flow_style=False, allow_unicode=True)

print(f"Saved {len(all_questions)} questions to {output_path}")

In [None]:
# Show the first 15 generated questions as a quick visual check
print("Sample questions:")
preview_questions = all_questions[:15]
for entry in preview_questions:
    print(f"  [{entry['title']}] {entry['question']}")