# Structured Q&A

Source code: https://github.com/mozilla-ai/structured-qa

Docs: https://mozilla-ai.github.io/structured-qa

## Installing dependencies

In [1]:
!git clone --single-branch --branch 5-add-benchmark https://github.com/mozilla-ai/structured-qa

Cloning into 'structured-qa'...
remote: Enumerating objects: 893, done.[K
remote: Counting objects: 100% (331/331), done.[K
remote: Compressing objects: 100% (192/192), done.[K
remote: Total 893 (delta 211), reused 195 (delta 126), pack-reused 562 (from 1)[K
Receiving objects: 100% (893/893), 2.42 MiB | 12.01 MiB/s, done.
Resolving deltas: 100% (493/493), done.


In [2]:
%pip install ./structured-qa

Processing ./structured-qa
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting fire (from structured-qa==0.3.3.dev104+g20f9e3f)
  Downloading fire-0.7.0.tar.gz (87 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.2/87.2 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting loguru (from structured-qa==0.3.3.dev104+g20f9e3f)
  Downloading loguru-0.7.3-py3-none-any.whl.metadata (22 kB)
Collecting pymupdf4llm (from structured-qa==0.3.3.dev104+g20f9e3f)
  Downloading pymupdf4llm-0.0.17-py3-none-any.whl.metadata (4.1 kB)
Collecting rapidfuzz (from structured-qa==0.3.3.dev104+g20f9e3f)
  Downloading rapidfuzz-3.12.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting streamlit (from structured-qa==0.3.3.dev104+g20f9e3f)
  Downloading streamlit-1.41

# Setup

In [3]:
import os
import google.generativeai as genai
from google.colab.userdata import get, SecretNotFoundError

try:
    genai.configure(api_key=get("GOOGLE_API_KEY"))
except SecretNotFoundError as e:
    raise RuntimeError("Please set the GOOGLE_API_KEY secret to your API key") from e
os.environ["LOGURU_LEVEL"] = "INFO"

In [4]:
from loguru import logger

## Function to Process all questions for a single Section

In [5]:
import json
import time


def process_section_questions(
    section_file,
    section_data,
    model,
):
    logger.info("Predicting")
    answers = {}
    sections = {}
    for index, row in section_data.iterrows():
        if model.n > 0 and model.n % 10 == 0:
            logger.info("Waiting for 60 seconds")
            time.sleep(60)
        question = row["question"]
        logger.info(f"Question: {question}")
        response = model.model.generate_content([section_file.read_text(), question])
        logger.info(response.text)
        response_json = json.loads(response.text)
        answers[index] = response_json["answer"]
        sections[index] = None
        model.n += 1
    return answers, sections

## Load Model

In [6]:
from structured_qa.model_loaders import load_gemini_model

In [7]:
SYSTEM_PROMPT = """
You are a rigorous assistant answering questions.
You must only answer based on the current information available which is:

```
{CURRENT_INFO}
```

If the current information available not enough to answer the question,
you must return "I need more info" srting and nothing else:

If the current information is enough to answer, you must return one of the following formats:
- YES/NO (for boolean questions)
- Number (for numeric questions)
- Single letter (for multiple-choice questions)
"""

In [8]:
model = load_gemini_model("gemini-2.0-flash-exp", system_prompt=SYSTEM_PROMPT)
model.n = 0

# Run Benchmark

In [9]:
from pathlib import Path

import pandas as pd


logger.info("Loading input data")
data = pd.read_csv("structured-qa/benchmark/structured_qa.csv")
data["pred_answer"] = [None] * len(data)
data["pred_section"] = [None] * len(data)

for section_name, section_data in data.groupby("section"):
    section_file = Path(f"structured-qa/benchmark/perfect_context/{section_name}.txt")

    answers, sections = process_section_questions(section_file, section_data, model)

    for index in section_data.index:
        data.loc[index, "pred_answer"] = str(answers[index]).upper()
        data.loc[index, "pred_section"] = sections[index]

data.to_csv("results.csv")

[32m2025-02-03 13:58:31.416[0m | [1mINFO    [0m | [36m__main__[0m:[36m<cell line: 0>[0m:[36m6[0m - [1mLoading input data[0m
[32m2025-02-03 13:58:31.445[0m | [1mINFO    [0m | [36m__main__[0m:[36mprocess_section_questions[0m:[36m10[0m - [1mPredicting[0m
[32m2025-02-03 13:58:31.449[0m | [1mINFO    [0m | [36m__main__[0m:[36mprocess_section_questions[0m:[36m18[0m - [1mQuestion: In billions, how many trainable parameters does GPT-3 have?[0m
[32m2025-02-03 13:58:33.038[0m | [1mINFO    [0m | [36m__main__[0m:[36mprocess_section_questions[0m:[36m20[0m - [1m{
  "answer": 175
}[0m
[32m2025-02-03 13:58:33.045[0m | [1mINFO    [0m | [36m__main__[0m:[36mprocess_section_questions[0m:[36m18[0m - [1mQuestion: Does LoRA introduce additional inference latency compared to full fine-tuning?[0m
[32m2025-02-03 13:58:34.914[0m | [1mINFO    [0m | [36m__main__[0m:[36mprocess_section_questions[0m:[36m20[0m - [1m{
  "answer": "No"
}[0m
[32m202

# Results

In [10]:
results = pd.read_csv("results.csv")
results.loc[results["answer"] != results["pred_answer"]]

Unnamed: 0.1,Unnamed: 0,document,section,question,answer,pred_answer,pred_section
33,33,https://arxiv.org/pdf/2201.11903,3.4 Robustness of Chain of Thought,How many annotators provided independent chain...,3,2,
42,42,https://github.com/mozilla-ai/structured-qa/re...,CARD AND TILE COSTS,Can a player pay coins to compensate for missi...,YES,NO,


In [11]:
accuracy = sum(results["answer"] == results["pred_answer"]) / len(results)
accuracy

0.9805825242718447