In [None]:
"""
q_a_generator.ipynb

This script automates the creation of a question-answer dataset from a collection
of processed documents. It is designed to generate evaluation data for
retrieval-augmented generation (RAG) systems.

The script iterates through JSON files containing pre-chunked text, groups these chunks
to form coherent contexts, and then uses a locally-run LLM via Ollama to generate
question-answer pairs based on the text. The final output is a single, consolidated .txt
file containing all the generated pairs.
"""

In [1]:
import os
import ollama
import json
from pathlib import Path

In [10]:
# --- Configuration ---
# Identifier of the Ollama model that is asked to write the Q&A pairs.
OLLAMA_MODEL = "gemma3:12b"

# Project root, taken as two levels above the current working directory.
# NOTE(review): this presumes the notebook is launched from a folder two levels
# below the project root -- confirm before running from elsewhere.
BASE_DIR = Path.cwd().parent.parent

# Folder holding the processed, chunked JSON files used as input.
INPUT_DIRECTORY = BASE_DIR.joinpath("data", "processed_output")

# Text file that receives the generated Q&A dataset.
OUTPUT_FILE = BASE_DIR.joinpath("evaluation_data", "generated_qa_dataset.txt")

In [3]:
def extract_chunks_from_json(file_path):
    """
    Extracts the text of every entry under the 'chunks' key of a JSON file.

    The returned list always has one string per entry of the 'chunks' list, in
    the same order. Entries that are not dictionaries, that have no 'text'
    key, or whose 'text' value is not a string (for example JSON null) are
    represented by an empty string, so callers can safely join the result.

    Args:
        file_path (str or Path): The full path to the input JSON file.

    Returns:
        list[str] | None: One text string per chunk. Returns None if the file
                          cannot be read, is not valid JSON, or its top-level
                          value is not an object holding a 'chunks' list.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as json_file:
            data = json.load(json_file)
    except json.JSONDecodeError:
        # Handle cases where the file is not valid JSON.
        print(f"Error: Could not decode JSON from {file_path}. Is it a valid JSON file?")
        return None
    except Exception as e:
        # Catch other failures while opening or reading the file.
        print(f"Error reading JSON file {file_path}: {e}")
        return None

    # The top-level value must be an object; a list or scalar has no 'chunks' key.
    chunks = data.get('chunks') if isinstance(data, dict) else None
    if not isinstance(chunks, list):
        print(f"Warning: JSON file {file_path} lacks the expected 'chunks' list. Skipping.")
        return None

    texts = []
    for chunk in chunks:
        text = chunk.get('text', '') if isinstance(chunk, dict) else ''
        # Keep the position of malformed entries but never hand back a non-string.
        texts.append(text if isinstance(text, str) else '')
    return texts


In [6]:
def generate_qa_pairs(document_text, num_questions, ollama_model):
    """
    Generates question-answer pairs for a given document text by prompting a
    model through the Ollama library.

    The prompt wording adapts to `num_questions`: the singular form is used
    for exactly one pair and the plural form otherwise.

    Args:
        document_text (str): The source text from which to generate Q&A pairs.
        num_questions (int): The number of Q&A pairs requested in the prompt.
        ollama_model (str): The identifier for the Ollama model to use.

    Returns:
        str: The model's response text, unmodified. Returns an empty string if
             the input text is empty or whitespace, if the model returns an
             empty response, or if the call raises an exception.
    """
    # Pre-condition check: Do not query the model with empty text.
    if not document_text or not document_text.strip():
        return ""

    # Pick wording that agrees with the requested count; for a single pair the
    # prompt is identical to the singular phrasing.
    plural = num_questions != 1
    pair_word = "pairs" if plural else "pair"
    question_subject = "Each question" if plural else "The question"
    answer_subject = "Each answer" if plural else "The answer"

    # --- Prompt Engineering ---
    prompt = f"""
    Based on the following document, please generate exactly {num_questions} question-answer {pair_word}.
    {question_subject} should be answerable directly from the text.
    {answer_subject} should be concise and extracted from the document.
    Provide the output in a simple text format.

    Example format:
    Q: What is the capital of France?
    A: Paris.

    Document:
    ---
    {document_text}
    ---
    """

    try:
        # Single, non-streaming call so the full answer arrives in one response.
        response = ollama.generate(
            model=ollama_model,
            prompt=prompt,
            stream=False
        )

        # Treat a missing or null response body the same as an empty one.
        qa_text = response.get("response") or ""

        # Validate the response from the model.
        if qa_text.strip():
            print(f"Successfully generated Q&A {pair_word}.")
            return qa_text

        print("Warning: LLM returned an empty response.")
        return ""

    except ollama.ResponseError as e:
        # Error reported by the Ollama API itself.
        print(f"Error calling Ollama API: {e.error}")
        print(f"Please ensure Ollama is running and the model '{ollama_model}' is available.")
        return ""
    except Exception as e:
        # Any other failure: report it and fall back to an empty result.
        print(f"An unexpected error occurred: {e}")
        return ""


In [7]:
def process_documents(input_dir, output_file, ollama_model, chunk_group_size=10):
    """
    Generates Q&A pairs for every JSON document in a directory and saves the
    results to a single text file.

    Files are visited in sorted name order. For each file a header line is
    written, its chunks are grouped `chunk_group_size` at a time, each group is
    joined with blank lines, and one Q&A pair is requested per non-empty group.
    The output file is overwritten on every run and its parent directory is
    created if missing. Errors raised while processing are reported with
    print() rather than propagated.

    Args:
        input_dir (str or Path): The directory containing source JSON files.
        output_file (str or Path): The path to the output text file.
        ollama_model (str): The identifier for the Ollama model.
        chunk_group_size (int): Number of consecutive chunks combined into one
            context per Q&A pair. A larger number provides more context; a
            smaller number yields more granular Q&A. Must be at least 1.
    """
    # isdir (not exists) so a regular file is rejected before the output is truncated.
    if not os.path.isdir(input_dir):
        print(f"Error: Input directory '{input_dir}' not found.")
        return

    if chunk_group_size < 1:
        print(f"Error: chunk_group_size must be at least 1 (got {chunk_group_size}).")
        return

    try:
        # Make sure the destination folder exists; open() does not create it.
        output_path = Path(output_file)
        output_path.parent.mkdir(parents=True, exist_ok=True)

        # Open the output file in 'write' mode to create a fresh dataset each run.
        with open(output_path, 'w', encoding='utf-8') as f:
            # os.listdir order is arbitrary; sort so the dataset order is reproducible.
            for filename in sorted(os.listdir(input_dir)):
                if not filename.lower().endswith(".json"):
                    continue  # Ignore non-JSON files.

                file_path = os.path.join(input_dir, filename)
                print(f"\n--- Processing: {filename} ---")
                f.write(f"--- Q&A for: {filename} ---\n")

                chunks = extract_chunks_from_json(file_path)
                if not chunks:
                    continue  # Skip if file was empty or invalid.

                # --- Batching Logic ---
                for i in range(0, len(chunks), chunk_group_size):
                    chunk_group = chunks[i:i + chunk_group_size]
                    # Separate chunks with a blank line; ignore anything that is
                    # not text so a malformed entry cannot abort the whole run.
                    combined_text = "\n\n".join(
                        chunk for chunk in chunk_group if isinstance(chunk, str)
                    )

                    if not combined_text.strip():
                        continue

                    print(f"Generating 1 Q&A pair for chunks {i+1}-{i+len(chunk_group)}...")

                    # Generate one Q&A pair for the combined text block.
                    qa_text = generate_qa_pairs(combined_text, num_questions=1, ollama_model=ollama_model)

                    if qa_text:
                        # Write the valid Q&A pair followed by a blank separator line.
                        f.write(qa_text.strip() + "\n\n")
                        print("--- Generated Q&A Pair ---")
                        print(qa_text)

        print(f"\nSuccessfully saved all Q&A pairs to {output_file}")

    except Exception as e:
        print(f"An error occurred during processing or saving the output file: {e}")


In [None]:
if __name__ == "__main__":
    # Main execution block: announces the configured input directory, output
    # file and model, then runs the full Q&A generation over the input folder.
    print("--- Starting Q&A Generation Script ---")
    print(f"Input directory: {INPUT_DIRECTORY}")
    print(f"Output file: {OUTPUT_FILE}")
    print(f"Using model: {OLLAMA_MODEL}")

    # Run the main processing function with the configured parameters.
    process_documents(
        input_dir=INPUT_DIRECTORY,
        output_file=OUTPUT_FILE,
        ollama_model=OLLAMA_MODEL
    )

    # Leading newline separates the closing banner from the processing log.
    print("\n--- Script finished ---")