# Structured Q&A

Source code: https://github.com/mozilla-ai/structured-qa

Docs: https://mozilla-ai.github.io/structured-qa

## GPU Check

First, you'll need to enable GPUs for the notebook:

- Navigate to `Edit`→`Notebook Settings`
- Select T4 GPU from the Hardware Accelerator section
- Click `Save` and accept.

Next, we'll confirm that we can connect to the GPU:

In [None]:
import torch

# Stop right away unless PyTorch reports a usable CUDA device.
if torch.cuda.is_available():
    print("GPU is available!")
else:
    raise RuntimeError("GPU not available")

## Installing Dependencies

In [None]:
%pip install --quiet https://github.com/abetlen/llama-cpp-python/releases/download/v0.3.4-cu122/llama_cpp_python-0.3.4-cp311-cp311-linux_x86_64.whl

In [None]:
%pip install ragatouille

In [None]:
%pip install git+https://github.com/mozilla-ai/structured-qa.git@5-add-benchmark

In [None]:
!wget https://raw.githubusercontent.com/mozilla-ai/structured-qa/refs/heads/5-add-benchmark/benchmark/structured_qa.csv

# Setup

In [None]:
import os

# Log level for loguru, set via its environment variable
# (presumably read when loguru is imported in a later cell — verify).
os.environ.update(LOGURU_LEVEL="INFO")

## Function to Download Document

In [None]:
from pathlib import Path
from urllib.request import urlretrieve

from loguru import logger


def download_document(url, output_file):
    """Fetch *url* into *output_file* unless that path already exists.

    Args:
        url: Location passed to ``urlretrieve``.
        output_file: Destination path; an existing file is left untouched.
    """
    # Guard clause: nothing to do when the target is already on disk.
    if Path(output_file).exists():
        logger.info(f"File {output_file} already exists")
        return

    urlretrieve(url, output_file)
    logger.info(f"Downloaded {url} to {output_file}")

## Function to Process a Single Document

In [None]:
from ragatouille import RAGPretrainedModel


# System prompt template used by `process_document`.
# It carries two placeholders that are filled in different ways:
#   - ANSWER_TYPE is a plain-text marker substituted with `str.replace`.
#   - {CURRENT_INFO} is a `str.format` field that receives the retrieved text.
# Because the template goes through `str.format`, any literal curly braces
# added to it would have to be doubled.
ANSWER_WITH_TYPE_PROMPT = """
You are a rigorous assistant answering questions.
You only answer based on the current information available.
You should only answer with ANSWER_TYPE.

The current information available is:

{CURRENT_INFO}

If the current information available not enough to answer the question,
you must return the following message and nothing else:

```
I need more info.
```
"""


def process_document(
    document_file,
    document_data,
    model,
    answer_prompt=ANSWER_WITH_TYPE_PROMPT,
):
    """Answer every question in *document_data* using retrieval over one document.

    For each row, the expected answer in the ``"answer"`` column decides which
    answer format the model is asked for (a number, YES/NO, or a single
    letter). The three best search results for the question are then placed in
    the system prompt, and the model's response is recorded.

    Args:
        document_file: Document handed to ``RAG.index``.
        document_data: Object whose ``iterrows()`` yields ``(index, row)``
            pairs, each row providing ``"question"`` and ``"answer"``.
        model: Object exposing ``get_response(messages)``.
        answer_prompt: System prompt template containing the literal marker
            ``ANSWER_TYPE`` and a ``{CURRENT_INFO}`` format field.

    Returns:
        A tuple ``(answers, sections)`` of dicts keyed by row index.
        ``sections`` maps every index to ``None``.
    """
    logger.info("Setting up RAG")
    RAG = RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0")
    # NOTE(review): the file path itself is passed as the collection entry —
    # confirm the index is built from the file's contents, not the path string.
    RAG.index([document_file])

    logger.info("Predicting")
    answers = {}
    sections = {}
    for index, row in document_data.iterrows():
        question = row["question"]
        try:
            float(row["answer"])
        except ValueError:
            if row["answer"] in ("YES", "NO"):
                answer_type = "YES or NO"
            else:
                answer_type = "a single letter"
        else:
            answer_type = "a number"

        # Fill the marker on a per-question copy, for every answer type.
        # The template itself must stay untouched: overwriting it would remove
        # the marker after the first question and freeze the answer type.
        question_prompt = answer_prompt.replace("ANSWER_TYPE", answer_type)

        logger.info(f"Question: {question}")
        logger.info("RAG search")
        results = RAG.search(query=question, k=3)

        current_info = "\n".join(result["content"] for result in results)
        messages = [
            {
                "role": "system",
                "content": question_prompt.format(CURRENT_INFO=current_info),
            },
            {"role": "user", "content": question},
        ]
        answer = model.get_response(messages)

        answers[index] = answer
        # No section prediction is produced in this flow.
        sections[index] = None

    return answers, sections

## Load Model

In [None]:
from structured_qa.model_loaders import load_llama_cpp_model

In [None]:
# Load the Q8_0 GGUF build of Qwen2.5-7B-Instruct through llama.cpp.
# NOTE(review): the identifier looks like `<org>/<repo>/<filename>` —
# confirm against `load_llama_cpp_model`.
model = load_llama_cpp_model(
    "bartowski/Qwen2.5-7B-Instruct-GGUF/Qwen2.5-7B-Instruct-Q8_0.gguf"
)

# Run Benchmark

In [None]:
import pandas as pd

# Read the benchmark questions and add empty columns for the predictions.
logger.info("Loading input data")
data = pd.read_csv("structured_qa.csv")
for column in ("pred_answer", "pred_section"):
    data[column] = [None] * len(data)

# Questions are grouped by source document so each document is handled once.
for document_link, document_data in data.groupby("document"):
    logger.info(f"Downloading document {document_link}")
    downloaded_document = Path(Path(document_link).name + ".pdf")
    download_document(document_link, downloaded_document)

    answers, sections = process_document(downloaded_document, document_data, model)

    # Copy the per-question results back into the full table.
    for index in document_data.index:
        prediction = str(answers[index]).upper()
        data.loc[index, "pred_answer"] = prediction
        data.loc[index, "pred_section"] = sections[index]

data.to_csv("results.csv")

In [None]:
# Reload the saved predictions and display the rows where they differ
# from the expected answer.
results = pd.read_csv("results.csv")
mismatched = results["answer"] != results["pred_answer"]
results.loc[mismatched]

In [None]:
# Fraction of rows whose predicted answer equals the expected answer.
matches = results["answer"] == results["pred_answer"]
accuracy = sum(matches) / len(results)
accuracy