# Structured Q&A

Source code: https://github.com/mozilla-ai/structured-qa

Docs: https://mozilla-ai.github.io/structured-qa

## GPU Check

First, you'll need to enable GPUs for the notebook:

- Navigate to `Edit`→`Notebook Settings`
- Select T4 GPU from the Hardware Accelerator section
- Click `Save` and accept.

Next, we'll confirm that we can connect to the GPU:

In [None]:
import torch

# Fail fast when no CUDA device is visible, so later cells are not run
# against a CPU-only runtime by accident.
if torch.cuda.is_available():
    print("GPU is available!")
else:
    raise RuntimeError("GPU not available")

## Installing dependencies

In [None]:
%pip install ragatouille

In [None]:
%pip install git+https://github.com/mozilla-ai/structured-qa.git@5-add-benchmark

In [None]:
!wget https://raw.githubusercontent.com/mozilla-ai/structured-qa/refs/heads/5-add-benchmark/benchmark/structured_qa.csv

# Setup

In [None]:
import os
import google.generativeai as genai

# Replace None with your Gemini API key before running this cell.
GEMINI_API_KEY = None

if not GEMINI_API_KEY:
    missing_key_message = "Please set the GEMINI_API_KEY variable to your API key"
    raise ValueError(missing_key_message)

# Set the log level through the environment; presumably this must happen
# before loguru is imported in a later cell -- TODO confirm.
os.environ.update({"LOGURU_LEVEL": "INFO"})
genai.configure(api_key=GEMINI_API_KEY)

In [None]:
from loguru import logger

In [None]:
import PyPDF2

def load_pdf(pdf_file: str) -> str | None:
    """Extract the text of every page of a PDF, joined by newlines.

    Args:
        pdf_file: Location of the PDF, handed to ``PyPDF2.PdfReader``.

    Returns:
        The concatenated page texts, or ``None`` when reading or text
        extraction raised (the exception is logged, not propagated).
    """
    try:
        reader = PyPDF2.PdfReader(pdf_file)
        page_texts = [page.extract_text() for page in reader.pages]
        return "\n".join(page_texts)
    except Exception as e:
        # Best effort: report the failure and let the caller decide.
        logger.exception(e)
        return None

## Function to Process a Single Document

In [None]:
from ragatouille import RAGPretrainedModel
from ragatouille.data import CorpusProcessor


def process_document(
    document_file,
    document_data,
    model,
):
    """Answer every question about one PDF using retrieved context.

    The PDF text is split into chunks by ``CorpusProcessor`` and encoded
    with the ``colbert-ir/colbertv2.0`` checkpoint. For each question the
    3 best-matching chunks are retrieved and sent to the model together
    with the question.

    Args:
        document_file: Location of the PDF, passed to ``load_pdf``.
        document_data: DataFrame whose rows each carry a ``"question"``
            value; its index labels key the returned dicts.
        model: Object exposing ``model.generate_content`` (presumably the
            wrapper returned by ``load_gemini_model`` -- confirm). Replies
            are expected to be JSON text containing an ``"answer"`` key.

    Returns:
        A tuple ``(answers, sections)`` of dicts keyed by row index.
        ``answers`` maps to the parsed ``"answer"`` value, or ``None`` when
        the document could not be read or the reply could not be parsed.
        ``sections`` maps every index to ``None``.
    """
    import json  # local import: only needed to decode the model replies

    answers = {}
    sections = {}

    document_text = load_pdf(document_file)
    if document_text is None:
        # load_pdf is best-effort and has already logged the cause. Record
        # empty predictions so one bad PDF does not abort the whole run.
        logger.error(f"Skipping {document_file}: no text could be extracted")
        for index in document_data.index:
            answers[index] = None
            sections[index] = None
        return answers, sections

    logger.info("Setting up RAG")
    RAG = RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0")
    corpus_processor = CorpusProcessor()
    documents = corpus_processor.process_corpus([document_text])
    RAG.encode([x["content"] for x in documents])

    logger.info("Predicting")
    for index, row in document_data.iterrows():
        question = row["question"]

        logger.info(f"Question: {question}")
        logger.info("RAG search")
        results = RAG.search_encoded_docs(query=question, k=3)
        logger.info("RESULTS")
        logger.info(results)
        current_info = "\n".join(result["content"] for result in results)
        logger.info(current_info)

        # The question belongs inside the contents list; passing it as a
        # second positional argument never reaches the prompt.
        response = model.model.generate_content(
            [f"This is the document: {current_info}", question]
        )
        logger.info(response)

        # Store the "answer" field of the JSON reply rather than the raw
        # response object, so it can be compared with the ground truth.
        try:
            parsed = json.loads(response.text)
        except ValueError:
            # Covers malformed JSON and a reply with no text to read.
            parsed = None
        if not isinstance(parsed, dict):
            logger.warning(f"Could not parse the reply for question: {question}")
            parsed = {}

        answers[index] = parsed.get("answer")
        # Section predictions are not recorded by this function.
        sections[index] = None

    return answers, sections

## Load Model

In [None]:
from structured_qa.model_loaders import load_gemini_model

In [None]:
# System prompt handed to `load_gemini_model` below. It instructs the model
# to answer only from the supplied document and to reply with a JSON holding
# "section" and "answer", and gives one worked example for each answer
# format: Yes/No, single number, single letter.
# NOTE(review): "a JSON name" presumably means "a JSON object"; the wording
# is left untouched because editing the prompt would change model behavior.
SYSTEM_PROMPT = """
You are given an input document and a question.
You can only answer the question based on the information in the document.
You will return a JSON name with two keys: "section" and "answer".
In `"section"`, you will return the name of the section where you found the answer.
In `"answer"`, you will return the answer one of the following JSON:
- Yes/No (for boolean questions)
Is the model an LLM?
{
  "section": "1. Introduction",
  "answer": "No"
}
- Single number (for numeric questions)
How many layers does the model have?
{
  "section": "2. Architecture",
  "answer": 12
}
- Single letter (for multiple-choice questions)
What is the activation function used in the model?
-A: ReLU
-B: Sigmoid
-C: Tanh
{
  "section": "2. Architecture",
  "answer": "C"
}
"""

In [None]:
# Build the Gemini model wrapper, requesting JSON-formatted responses.
model = load_gemini_model(
    "gemini-2.0-flash-exp",
    system_prompt=SYSTEM_PROMPT,
    generation_config=dict(response_mime_type="application/json"),
)

# Run Benchmark

In [None]:
from pathlib import Path
from urllib.request import urlretrieve

import pandas as pd

logger.info("Loading input data")
data = pd.read_csv("structured_qa.csv")
# Prediction columns start empty and are filled one document at a time.
data["pred_answer"] = [None] * len(data)
data["pred_section"] = [None] * len(data)

# One pass per source document: fetch it once, then answer all its questions.
for document_link, document_data in data.groupby("document"):
    logger.info(f"Downloading document {document_link}")
    downloaded_document = Path(f"{Path(document_link).name}.pdf")
    if downloaded_document.exists():
        # Reuse the local copy left by an earlier run.
        logger.info(f"File {downloaded_document} already exists")
    else:
        urlretrieve(document_link, downloaded_document)
        logger.info(f"Downloaded {document_link} to {downloaded_document}")

    answers, sections = process_document(downloaded_document, document_data, model)

    # Copy the predictions back onto the matching rows of the full table.
    for index in document_data.index:
        data.loc[index, "pred_answer"] = str(answers[index]).upper()
        data.loc[index, "pred_section"] = sections[index]

data.to_csv("results.csv")

In [None]:
# Reload the saved predictions and show the rows where the prediction
# differs from the expected answer.
results = pd.read_csv("results.csv")
results[results["answer"].ne(results["pred_answer"])]

In [None]:
# Fraction of rows whose predicted answer equals the expected answer.
accuracy = sum(results["answer"].eq(results["pred_answer"])) / len(results)
accuracy