# QA dataset generation

This notebook generates a question-answering dataset from a given text corpus. For each article, one question is derived from the text and stored together with its corresponding short answer.

The dataset is created using the OpenAI API.

In [1]:
import os
import json
import time
import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI

In [2]:
# --------------------------------------------------
# Configuration
# --------------------------------------------------
# Source corpus: CSV file that is loaded into a DataFrame further below.
INPUT_CSV = "../data/raw/bmw_press_releases.csv"
# Destination of the combined table containing every generated QA pair.
OUTPUT_CSV = "../data/QA/article_qa.csv"
# Directory that receives one JSON file per processed article.
OUTPUT_JSON_DIR = "../data/QA/qa_s/"
# Name of the INPUT_CSV column whose value is sent to the model as article text.
TEXT_COLUMN = "content"
MODEL = "gpt-4.1-mini"  # cost-efficient and reliable for structured generation

In [3]:
# --------------------------------------------------
# Load environment variables
# --------------------------------------------------
# Load variables from the .env file one level above the current working directory.
load_dotenv(dotenv_path='../.env')
# os.getenv returns None when OPENAI_API_KEY is not set.
# NOTE(review): the client presumably rejects a missing key at construction or
# on the first request — confirm against the installed openai version.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [4]:
# --------------------------------------------------
# Prompt template
# --------------------------------------------------
# System message: defines the task (exactly one question/answer pair per
# article) and the answer length limit.
# Adjacent string literals are concatenated verbatim, so every fragment that is
# followed by another one must end with a space; otherwise two sentences are
# glued together in the prompt the model receives.
SYSTEM_PROMPT = (
    "You are a dataset generation assistant. "
    "Given an article, generate exactly one high-quality, "
    "answerable question and answer based solely on the content. "
    "The answer should consist of maximum 3 words."
)

# User message template, filled via str.format(article_text=...).
# The doubled braces are escapes that render as literal { } in the JSON example;
# the escaped quotes render as a triple-quote fence around the article text.
USER_PROMPT_TEMPLATE = """
Article:
\"\"\"
{article_text}
\"\"\"

Return the output strictly as JSON in the following format:

{{
    "question": "Your generated question here?",
    "answer": "The corresponding answer here."
}}
    
"""

In [5]:
# --------------------------------------------------
# Helper function
# --------------------------------------------------
def generate_questions(article_text: str) -> dict[str, str]:
    """Ask the chat model for one question/answer pair about an article.

    Args:
        article_text: Text of the article; it is inserted into
            USER_PROMPT_TEMPLATE and sent as the user message.

    Returns:
        The model's reply parsed from JSON into a dict. The prompt requests
        the keys "question" and "answer"; their presence is not enforced here.

    Raises:
        ValueError: If the reply is empty, is not valid JSON
            (json.JSONDecodeError is a ValueError subclass), or is valid JSON
            but not an object.
    """
    response = client.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {
                "role": "user",
                "content": USER_PROMPT_TEMPLATE.format(article_text=article_text),
            },
        ],
        temperature=0.3,
        # JSON mode: without it the model may wrap the object in markdown
        # fences or add prose, which makes json.loads fail. The user prompt
        # already mentions "JSON", which JSON mode requires.
        response_format={"type": "json_object"},
    )

    content = response.choices[0].message.content
    # message.content can be None/empty (e.g. a refusal); fail with a clear
    # message instead of an opaque TypeError from json.loads(None).
    if not content:
        raise ValueError("Model returned an empty response")

    parsed = json.loads(content)
    # Callers merge the result into a dict, so anything but an object is unusable.
    if not isinstance(parsed, dict):
        raise ValueError(
            f"Expected a JSON object from the model, got {type(parsed).__name__}"
        )

    return parsed

In [6]:
# Load dataset
# Later cells read the 'url' and 'title' columns and the column named by
# TEXT_COLUMN from this frame.
df = pd.read_csv(INPUT_CSV)

In [7]:
# For each article, generate a QA pair and store it as its own JSON file.
# Articles whose JSON file already exists are skipped, so this cell can be
# re-run to continue where a previous run stopped.
MAX_NEW_ARTICLES = 5  # upper bound on model calls per run of this cell

os.makedirs(OUTPUT_JSON_DIR, exist_ok=True)
no_articles = 0
for _, row in df.iterrows():
    article_url = row['url']
    article_text = row[TEXT_COLUMN]

    # pandas represents empty CSV cells as float NaN. Without this guard a
    # missing url crashes on .split and a missing text is sent to the model
    # as the literal string "nan".
    if (
        not isinstance(article_url, str)
        or not isinstance(article_text, str)
        or not article_text.strip()
    ):
        print(f"Skipping row with missing url or text: {article_url!r}")
        continue

    # Strip a trailing slash first; otherwise the last path segment is empty
    # and every such article would map to the same file name.
    article_name = article_url.rstrip("/").split("/")[-1]

    filename = os.path.join(OUTPUT_JSON_DIR, f"qa_article_{article_name}.json")
    if os.path.exists(filename):
        print(f"Skipping existing file: {filename}")
        continue

    generated = generate_questions(article_text)
    # Copy only the expected keys so an unexpected key in the model reply can
    # never overwrite the article metadata.
    qa = {
        "article_title": row['title'],
        "article_url": article_url,
        "question": generated.get("question", ""),
        "answer": generated.get("answer", ""),
    }

    # Write to a temporary file and rename it afterwards: an interrupted run
    # must not leave a truncated .json behind, because it would be skipped as
    # "existing" on the next run and fail to parse when the files are merged.
    tmp_filename = filename + ".tmp"
    with open(tmp_filename, "w", encoding="utf-8") as f:
        json.dump(qa, f, indent=2)
    os.replace(tmp_filename, filename)

    print(f"Saved QA pairs to {filename}")
    no_articles += 1
    if no_articles >= MAX_NEW_ARTICLES:
        break

Skipping existing file: ../data/QA/qa_s/qa_article_iconic-bmw-art-cars-by-andy-warhol-and-julie-mehretu-are-coming-to-north-america-bmw-art-car-world-tour-stops-at-pebble-beach-concours-d%E2%80%99elegance-and-the-bridge.json
Skipping existing file: ../data/QA/qa_s/qa_article_bmw-group-foerdert-junge-vordenker.json
Skipping existing file: ../data/QA/qa_s/qa_article_turning-old-into-new:-recycling-as-next-step-towards-greater-circular-economy-for-bmw-group-3d-printing.json
Skipping existing file: ../data/QA/qa_s/qa_article_bmw-group-plant-regensburg-pilots-thermal-oil-system-for-heat-generation-in-paint-shop.json
Skipping existing file: ../data/QA/qa_s/qa_article_nelson-piquet-tribute:-bmw-group-classic-brought-legendary-bmw-race-cars-to-the-%E2%80%9Ccircuito-estoril%E2%80%9D-in-honour-of-the-three-time-formula-1-world-champion.json
Skipping existing file: ../data/QA/qa_s/qa_article_indianapolis-8-hour:-kelvin-van-der-linde-wins-the-igtc-drivers%E2%80%99-title-%E2%80%93-bmw-m-motorsport-

In [8]:
# Load all generated QA pairs and save to a single CSV.
# os.listdir returns entries in arbitrary order, so the names are sorted to
# make the row order of the CSV reproducible across runs and machines.
qa_data = []
for json_name in sorted(os.listdir(OUTPUT_JSON_DIR)):
    # Only the per-article result files are collected; anything else in the
    # directory is ignored.
    if not json_name.endswith(".json"):
        continue
    with open(os.path.join(OUTPUT_JSON_DIR, json_name), "r", encoding="utf-8") as f:
        qa_data.append(json.load(f))

qa_df = pd.DataFrame(qa_data)
qa_df.to_csv(OUTPUT_CSV, index=False)