# Testset Generation
This notebook generates a test set from a single document or a set of documents. The test set can then be used to benchmark the performance of the SLMs locally.

In [3]:
!pip install PyMuPDF
!pip install pymupdf



In [1]:
# import libraries required
from __future__ import annotations

import argparse
import json
import os
import fitz
from typing import List

from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from ragas.llms import LangchainLLMWrapper, BaseRagasLLM
from ragas.embeddings import LangchainEmbeddingsWrapper, BaseRagasEmbeddings
from ragas.testset import TestsetGenerator
from ragas.testset.graph import KnowledgeGraph, Node
from ragas.testset.synthesizers import QueryDistribution
from ragas.testset.synthesizers.single_hop.specific import SingleHopSpecificQuerySynthesizer
from ragas.testset.synthesizers.multi_hop import (
    MultiHopAbstractQuerySynthesizer,
    MultiHopSpecificQuerySynthesizer,
)
from ragas.testset.persona import Persona


# Import graph construction components
from ragas.testset.transforms import (
    apply_transforms,
    Parallel,
    KeyphrasesExtractor,
    SummaryExtractor,
    EmbeddingExtractor,
)
from ragas.testset.transforms.extractors import NERExtractor
from ragas.testset.transforms.relationship_builders.cosine import (
    SummaryCosineSimilarityBuilder,
)

ModuleNotFoundError: No module named 'fitz'

In [None]:
# load the documents
def load_documents(path: str) -> List:
    """Return the documents produced by the loader that matches ``path``.

    Args:
        path: A directory, a ``.pdf`` file (extension matched
            case-insensitively), or any other file.

    Returns:
        Whatever the selected loader's ``load()`` returns:
        ``DirectoryLoader`` with the glob ``**/*.*`` for a directory,
        ``PyPDFLoader`` for a PDF file, and ``TextLoader`` otherwise.
    """
    # Directories are walked recursively; every file with an extension matches.
    if os.path.isdir(path):
        return DirectoryLoader(path, glob="**/*.*").load()

    # Single PDF file.
    if path.lower().endswith(".pdf"):
        return PyPDFLoader(path).load()

    # Anything else is read as plain text.
    return TextLoader(path).load()

In [None]:
# Load the source handbook into `raw_docs`.
file_path = "data/sts-student-handbook.pdf"
print(f"Loading documents from {file_path} …")
raw_docs = load_documents(file_path)
print(f"Loaded {len(raw_docs)} document(s).")

# Preview only the first few items; displaying the whole list would dump the
# entire handbook into the cell output.
raw_docs[:3]

### Semantic Chunking Strategy (used instead of fixed-size chunks)

In [None]:
import re
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from typing import List

def chunk_documents(
    docs: List[Document],
    chunk_size: int = 1000,
    chunk_overlap: int = 100,
) -> List[Document]:
    """
    Chunk documents on numbered section headings rather than at a fixed size.

    The page contents of all documents are joined into one string, so that
    sections spanning several pages stay together. Lines that look like
    numbered headings (e.g. "1. Introduction", "2.1 Key Policies",
    "3.4.1 Scope") are used as chunk boundaries. Text that appears before the
    first heading is kept as its own leading chunk instead of being dropped.

    If no numbered heading is found, the function falls back to fixed-size
    chunking with ``RecursiveCharacterTextSplitter``.

    Args:
        docs: Documents to chunk, in reading order.
        chunk_size: Chunk size used by the fixed-size fallback only.
        chunk_overlap: Chunk overlap used by the fixed-size fallback only.

    Returns:
        A list of new ``Document`` objects, one per non-empty section. Each
        carries only a ``source`` metadata entry, copied from the first input
        document ("Unknown" if absent); page numbers are not preserved. On
        the fallback path, the splitter's output is returned unchanged.
    """
    print("Attempting to chunk documents by semantic headings...")

    # 1. Combine all page content into a single string to handle sections
    # that span across pages.
    full_text = "\n\n".join(doc.page_content for doc in docs)

    # 2. A heading is a line that starts with a dotted number ("1", "1.1",
    # "1.1.1"), optionally followed by a trailing dot ("1."), then spaces/tabs
    # and a title. Requiring horizontal whitespace plus a non-space character
    # keeps a bare page number on its own line from counting as a heading.
    heading_pattern = re.compile(r"^\d+(?:\.\d+)*\.?[ \t]+\S.*$", re.MULTILINE)

    # Find the start index of all headings
    heading_indices = [match.start() for match in heading_pattern.finditer(full_text)]

    # 3. If no headings are found, fall back to fixed-size chunking.
    if not heading_indices:
        print("WARN: No numbered headings found. Falling back to fixed-size chunking.")
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )
        return splitter.split_documents(docs)

    # 4. Cut the text at every heading. Starting the boundaries at 0 keeps any
    # preamble before the first heading; if the text starts with a heading the
    # first slice is empty and is skipped below.
    source = docs[0].metadata.get("source", "Unknown")
    boundaries = [0, *heading_indices, len(full_text)]

    semantic_chunks = []
    for start_index, end_index in zip(boundaries, boundaries[1:]):
        chunk_text = full_text[start_index:end_index].strip()
        if chunk_text:
            # We lose the specific page number but keep each section intact.
            semantic_chunks.append(Document(
                page_content=chunk_text,
                metadata={"source": source}
            ))

    print(f"Successfully created {len(semantic_chunks)} semantic chunks.")
    return semantic_chunks


In [None]:
# Split the loaded documents into passages. Only the count is reported:
# heading-based chunks vary in length, so there is no single chunk size to show.
docs = chunk_documents(raw_docs)
print(f"Chunked to {len(docs)} passages")

In [None]:
# Preview the first few chunks only; displaying the full list floods the cell output.
docs[:5]

### New Chunking Strategy
Here we try a different approach: the goal is to convert the PDF into clean Markdown
in order to get much better retrieval.

In [None]:
# Better PDF text extraction: read the text of every page with PyMuPDF (`fitz`).
pdf_path = 'data/sts-student-handbook.pdf'

# Open the PDF and join the page texts in one pass. Each page's text is
# followed by a blank line so page breaks stay visible in the result.
with fitz.open(pdf_path) as doc:
    full_text = "".join(page.get_text() + "\n\n" for page in doc)

print("Successfully extracted text from PDF.")
print(f"First 500 characters:\n---\n{full_text[:500]}")

In [None]:
# Model used to synthesise the test set.
# Alternatives tried earlier: "gpt-3.5-turbo-16k", "gpt-4.1-mini".
generator_model = "gpt-4o"

# Ragas works with its own wrapper types, so wrap the LangChain OpenAI clients.
generator_llm = LangchainLLMWrapper(
    ChatOpenAI(model=generator_model, temperature=0)
)
embedding_model = LangchainEmbeddingsWrapper(OpenAIEmbeddings())

# --- Define Knowledge Graph Transformations ---
print("Defining Knowledge Graph transformations…")
transforms = [
    # Step 1: LLM-backed extractors (keyphrases, summaries, named entities),
    # grouped under Parallel.
    Parallel(
        KeyphrasesExtractor(llm=generator_llm),
        SummaryExtractor(llm=generator_llm),
        NERExtractor(llm=generator_llm),
    ),
    # Step 2: embed each node's "summary" property (not the full content) and
    # store the vector under "summary_embedding".
    EmbeddingExtractor(
        embedding_model=embedding_model,
        embed_property_name="summary",
        property_name="summary_embedding",
    ),
    # Step 3: build relationships from summary cosine similarity. The author
    # chose this threshold to encourage relationship formation.
    SummaryCosineSimilarityBuilder(threshold=0.7),
]

In [None]:
# --- Testset Generation ---

# UCL student personas, built from (name, role description) pairs. Each
# persona is bound to its own variable and all three are collected, in the
# same order, in `personas`.
persona_new_student, persona_international, persona_graduate = personas = [
    Persona(name=persona_name, role_description=persona_role)
    for persona_name, persona_role in (
        (
            "New Student",
            "A first-year student looking for information about university policies and procedures.",
        ),
        (
            "International Student",
            "An international student seeking information about visa requirements, accommodation, and support services.",
        ),
        (
            "Graduate Student",
            "A graduate student interested in research opportunities, funding, and academic progression.",
        ),
    )
]

In [None]:
# Test-set generator wired to the wrapped LLM, the wrapped embeddings and the
# persona list.
generator = TestsetGenerator(
    persona_list=personas,
    embedding_model=embedding_model,
    llm=generator_llm,
)

In [None]:
def custom_query_distribution(
    llm: BaseRagasLLM, force_single_hop: bool = False
) -> QueryDistribution:
    """Build the weighted list of query synthesizers used for generation.

    Args:
        llm: LLM passed to every synthesizer that is created.
        force_single_hop: When true, only the single-hop specific synthesizer
            is used.

    Returns:
        A list of ``(synthesizer, weight)`` pairs. With ``force_single_hop``
        it is the single-hop specific synthesizer at weight 1.0; otherwise it
        is single-hop specific (0.5), multi-hop specific (0.25) and multi-hop
        abstract (0.25), in that order.
    """
    single_hop = SingleHopSpecificQuerySynthesizer(llm=llm)

    # Single-hop only: the whole weight goes to one synthesizer.
    if force_single_hop:
        return [(single_hop, 1.0)]

    # Mixed distribution: half single-hop, the rest split evenly between the
    # two multi-hop synthesizers.
    weighted = [(single_hop, 0.5)]
    for synthesizer_cls in (
        MultiHopSpecificQuerySynthesizer,
        MultiHopAbstractQuerySynthesizer,
    ):
        weighted.append((synthesizer_cls(llm=llm), 0.25))
    return weighted

# Define ideal and fallback distributions
ideal_distribution = custom_query_distribution(generator_llm)
fallback_distribution = custom_query_distribution(generator_llm, force_single_hop=True)

# Arguments shared by both generation attempts, so the two calls cannot drift apart.
generation_kwargs = dict(
    documents=docs,
    testset_size=10,
    transforms=transforms,
)

print("Generating test-set… this may take a few minutes.")
try:
    print("Attempting to generate test set with multi-hop questions...")
    dataset = generator.generate_with_langchain_docs(
        query_distribution=ideal_distribution,
        **generation_kwargs,
    )
except ValueError as e:
    # Only the "No clusters found" failure is retried with single-hop
    # questions; any other ValueError is re-raised unchanged.
    if "No clusters found" not in str(e):
        raise
    print("WARN: Could not generate multi-hop questions. Falling back to single-hop only.")
    dataset = generator.generate_with_langchain_docs(
        query_distribution=fallback_distribution,
        **generation_kwargs,
    )

In [None]:
# Save the generated samples as JSON. The file is named after
# `generator_model`, so the name always matches the model that was used.
out_dir = "testset"
os.makedirs(out_dir, exist_ok=True)
out_path = os.path.join(out_dir, f"testset_{generator_model}.json")

# Convert once and reuse the list for both the dump and the reported count.
samples = dataset.to_list()
with open(out_path, "w", encoding="utf-8") as f:
    json.dump(samples, f, indent=2)
print(f"Saved {len(samples)} samples to {out_path}")