# Generate Q+A pairs

For each chunk in the database, use an LLM (GPT-4o-mini) to generate a question that would plausibly be answered using this chunk. To speed this up, make the API calls in parallel.

Stores these in `generated_questions.json`, since this hits the OpenAI API.

In [51]:
from sqlalchemy import select
import asyncio
import json
from openai import AsyncOpenAI

from src.app_config import app_config
from src.db.models.document import Chunk

client = AsyncOpenAI()

MODEL="gpt-4o-mini"
PROMPT = """Generate a question that can be derived from information in this content:

<content>
{content}
</content>

Do not generate a question that cannot be answered by the content."""

# Crop chunk.content to this length -- 128 is the max_seq_length of distiluse-base-multilingual-cased-v2
# and 5 is about the average word length in our corpus
max_content_length = 128 * 5

# Or, essentially no limit
#max_content_length = 1_000_000

async def generate_qa(
    chunk: Chunk,
    semaphore: asyncio.Semaphore,
) -> str:
    async with semaphore:
        completion =  await client.chat.completions.create(
                model=MODEL,
                messages=[{"role": "user", "content": PROMPT.format(content=chunk.content[:max_content_length])}]
            )
        return completion.choices[0].message.content

async def create_synthetic_questions(
    chunks: list[Chunk],
    max_concurrency: int = 10,
) -> list[str]:
    semaphore = asyncio.Semaphore(max_concurrency)
    tasks = [
        generate_qa(chunk, semaphore)
        for chunk in chunks
    ]
    return await asyncio.gather(*tasks, return_exceptions=True)

with app_config.db_session() as db_session:
    chunks = db_session.execute(select(Chunk).limit(500)).scalars().all()
    questions = await create_synthetic_questions(chunks)
    content_and_questions = list(zip([c.content[:max_content_length] for c in chunks], questions))
    
    with open(f"translation/generated_questions_{max_content_length}.json", "w") as file:
        json.dump(content_and_questions, file, indent=2)

database connection is not using SSL


# Evaluate retrieval from stored texts and questions

Load the chunk texts and questions from the file, create embeddings for each, and calculate recall.

In [44]:
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import semantic_search

def generate_embeddings(model_name, chunk_contents, questions):
    model = SentenceTransformer(model_name)
    return model.encode(chunk_contents), model.encode(questions)

def compute_recall(question_embeddings, content_embeddings, top_k=5):

    # Search Top K for each question
    search_results = semantic_search(question_embeddings, content_embeddings, top_k=top_k)
    
    # Was the chunk_index found in the results?
    # Note that chunk_index is the same as question_index because chunk_contents[i] is chunk corresponding with questions[i]
    # search_result is a list of the top k most similar entries in chunk_contents for each question in questions
    # in other words, it has dimensions [len(questions)][k]
    found_content = [
        chunk_index in [result['corpus_id'] for result in search_result]
    for chunk_index, search_result in enumerate(search_results)]

    return sum(found_content)/len(found_content)

In [53]:
# Slow!

with open("translation/generated_questions_640.json", "r") as file:
    content_and_questions = json.load(file)

chunk_contents = [cq_pair[0] for cq_pair in content_and_questions]
questions = [cq_pair[1] for cq_pair in content_and_questions]

content_embeddings, question_embeddings = generate_embeddings("multi-qa-mpnet-base-cos-v1", chunk_contents, questions)
r = compute_recall(content_embeddings, question_embeddings)
print("MPNet recall: ", r)

content_embeddings, question_embeddings = generate_embeddings("distiluse-base-multilingual-cased-v2", chunk_contents, questions)
r = compute_recall(content_embeddings, question_embeddings)
print("Distiluse recall: ", r)

MPNet recall:  0.864
Distiluse recall:  0.826


In [50]:
with open("translation/generated_questions_1000000.json", "r") as file:
    content_and_questions = json.load(file)

chunk_contents = [cq_pair[0] for cq_pair in content_and_questions]
questions = [cq_pair[1] for cq_pair in content_and_questions]

content_embeddings, question_embeddings = generate_embeddings("multi-qa-mpnet-base-cos-v1", chunk_contents, questions)
r = compute_recall(content_embeddings, question_embeddings)
print("MPNet recall: ", r)

content_embeddings, question_embeddings = generate_embeddings("distiluse-base-multilingual-cased-v2", chunk_contents, questions)
r = compute_recall(content_embeddings, question_embeddings)
print("Distiluse recall: ", r)

MPNet recall:  0.772
Distiluse recall:  0.706
