# Create a golden dataset using RAGAS

In [1]:
import os
import getpass
from dotenv import load_dotenv
import getpass


In [2]:

load_dotenv()
os.environ["LANGCHAIN_TRACING_V2"] = "true"

def set_api_key_if_not_present(key_name, prompt_message):
    if key_name not in os.environ or not os.environ[key_name]:
        os.environ[key_name] = getpass.getpass(prompt_message)

set_api_key_if_not_present("OPENAI_API_KEY", "OpenAI API Key:")
set_api_key_if_not_present("TAVILY_API_KEY", "TAVILY_API_KEY:")
set_api_key_if_not_present("LANGCHAIN_API_KEY", "LANGCHAIN_API_KEY:")

## Data loading

First, we're going to load all of our transcripts in.

In [3]:
from langchain_core.documents import Document


In [4]:
from ast import Dict
import loader
import requests
from typing import List


In [22]:

from typing import Tuple


url = "https://huggingface.co/datasets/mbudisic/PsTuts-VQA/raw/main/train.json"

def load_VQA_file_from_url(url:str) -> Tuple[List[Document], str, List[Dict]]:
    """
    Loads a VQA dataset file from a URL and processes it into documents.
    
    Args:
        url (str): URL pointing to a JSON file containing VQA dataset
        
    Returns:
        tuple: A tuple containing:
            - list[Document]: Processed documents from the VideoTranscriptBulkLoader
            - str: Group name extracted from the URL filename
            - List[Dict]: The raw JSON payload loaded from the URL
            
    Note:
        This function needs to be updated as it currently has a type mismatch.
        The return type annotation indicates List[Document] but it returns a tuple.
    """
    resp = requests.get(url)
    resp.raise_for_status()
    group = url.split('/')[-1].split('.')[0]
    json_payload = loader.load_json_string(resp.content.decode('utf-8'), group)
    docs = loader.VideoTranscriptBulkLoader(json_payload=json_payload).load()
    return docs, group, json_payload


train,group_name,json_payload = load_VQA_file_from_url(url)


The following are all data keys. `group` indicates the filename the transcript was loaded from.


In [23]:
print(f"Number of files: {len(json_payload)}")
print(f"File data fields: {json_payload[0].keys()}" )


Number of files: 54
File data fields: dict_keys(['video_id', 'title', 'desc', 'length', 'url', 'transcripts', 'qa', 'group'])


In each file, `transcripts` field is a list of transcript chunks

In [24]:
print(f"Transcripts in first file: {len(json_payload[0]['transcripts'])}")
print(f"Transcript keys: {json_payload[0]['transcripts'][0].keys()}")

Transcripts in first file: 111
Transcript keys: dict_keys(['sent_id', 'sent', 'begin', 'end'])


Above, we used the loader for 1 document-per-video (as opposed to 1 document
per phrase, which is the granular form of transcripts
)


In [25]:
print(f"# of documents: {len(train)}. # of videos: {len(json_payload)}")

# of documents: 54. # of videos: 54


## Building the knowledge graph

For the sake of the notebook, we'll demo here how to create a knowledge graph.

At the end, we will call all of this from a function in `golden_dataset.py` to automate
creation of the knowledge graph and queries from an arbitrary collection of documents.


First, the 54 documents is substantial for our needs. We can trim it down to about half.

In [26]:
docs = train
len(docs)

54

In [27]:
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_core.embeddings import Embeddings
from ragas.testset.graph import KnowledgeGraph
from ragas.testset.graph import Node, NodeType
from ragas.testset.transforms import default_transforms, apply_transforms

from ragas.testset import TestsetGenerator
from ragas.testset.persona import Persona
from ragas.testset.synthesizers import default_query_distribution, SingleHopSpecificQuerySynthesizer, MultiHopAbstractQuerySynthesizer, MultiHopSpecificQuerySynthesizer


We're going to generate the knowledge graph and queries using `gpt-4.1` as the
generating model, and `text-embedding-3-small` as the embedding model`.
This is a rare enough call that we can afford calling more powerful models.

In [28]:
generator_llm = ChatOpenAI(model="gpt-4.1")
embedding_model:Embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

print(embedding_model.model)

wrapped_generator_llm = LangchainLLMWrapper(generator_llm)
wrapped_embedding_model = LangchainEmbeddingsWrapper(embedding_model)

text-embedding-3-small


In [29]:
from pathlib import Path
root = Path(".")
kg_filename = Path(f"kg_{group_name}_text_embed.json")
kg_path = root.joinpath(kg_filename)
print(kg_path)

kg_train_text_embed.json


In [30]:
from email import generator


kg = KnowledgeGraph()

try:
    kg = kg.load(kg_path)
    print(f"Loaded from {kg_path}.")
except FileNotFoundError:
    print(f"{kg_path} does not contain a knowledge graph. Generating.")
    for doc in docs:
        kg.nodes.append(
            Node(
                type=NodeType.DOCUMENT,
                properties={"page_content": doc.page_content, 
                            "document_metadata": doc.metadata}
            )
        )
    print(f"Initial size {str(kg)}")
    transforms = default_transforms(documents=docs, 
                                            llm=wrapped_generator_llm, 
                                            embedding_model=wrapped_embedding_model)
    apply_transforms(kg, transforms)
    print(f"After transformations size {str(kg)}")
    kg.save(kg_path)
    print(f"Saved to {kg_path}.")
    
kg

kg_train_text_embed.json does not contain a knowledge graph. Generating.
Initial size KnowledgeGraph(nodes: 54, relationships: 0)


Applying HeadlinesExtractor:   0%|          | 0/51 [00:00<?, ?it/s]

Applying HeadlineSplitter:   0%|          | 0/54 [00:00<?, ?it/s]

unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node


Applying SummaryExtractor:   0%|          | 0/96 [00:00<?, ?it/s]

Property 'summary' already exists in node '15cabb'. Skipping!
Property 'summary' already exists in node '29021d'. Skipping!
Property 'summary' already exists in node 'e6751c'. Skipping!
Property 'summary' already exists in node '249d07'. Skipping!
Property 'summary' already exists in node '109999'. Skipping!
Property 'summary' already exists in node '94fbc3'. Skipping!
Property 'summary' already exists in node '7f504f'. Skipping!
Property 'summary' already exists in node '90f441'. Skipping!
Property 'summary' already exists in node '2c0f75'. Skipping!
Property 'summary' already exists in node 'f96757'. Skipping!
Property 'summary' already exists in node '72482d'. Skipping!
Property 'summary' already exists in node '16476a'. Skipping!
Property 'summary' already exists in node 'b79fa9'. Skipping!
unable to apply transformation: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1 in organization org-4OzneGxZmntU4Fq9wJeB1LE6 on tokens per min (TPM): Limit 30000, Used 29

Applying CustomNodeFilter:   0%|          | 0/12 [00:00<?, ?it/s]

Node 54211285-5248-40b9-93f9-997ad2ea0592 does not have a summary. Skipping filtering.
Node 08a19371-c407-479e-946d-40a1a87fbc4e does not have a summary. Skipping filtering.
Node bd0631e5-c78c-4bcb-80e7-d778e3fdf325 does not have a summary. Skipping filtering.
Node 5c82fe5c-472b-4dfd-a944-56f69e938b57 does not have a summary. Skipping filtering.


Applying [EmbeddingExtractor, ThemesExtractor, NERExtractor]:   0%|          | 0/118 [00:00<?, ?it/s]

Property 'summary_embedding' already exists in node '15cabb'. Skipping!
Property 'summary_embedding' already exists in node '29021d'. Skipping!
Property 'summary_embedding' already exists in node '0c4fa2'. Skipping!
Property 'summary_embedding' already exists in node '249d07'. Skipping!
Property 'summary_embedding' already exists in node 'e6751c'. Skipping!
Property 'summary_embedding' already exists in node 'f96757'. Skipping!
Property 'summary_embedding' already exists in node '109999'. Skipping!
Property 'summary_embedding' already exists in node '94fbc3'. Skipping!
Property 'summary_embedding' already exists in node '2c0f75'. Skipping!
Property 'summary_embedding' already exists in node '7f504f'. Skipping!
Property 'summary_embedding' already exists in node '90f441'. Skipping!
Property 'summary_embedding' already exists in node '12e4ea'. Skipping!
Property 'summary_embedding' already exists in node '8125fc'. Skipping!
Property 'summary_embedding' already exists in node '72482d'. Sk

Applying [CosineSimilarityBuilder, OverlapScoreBuilder]:   0%|          | 0/2 [00:00<?, ?it/s]

unable to apply transformation: Node c4c71ba9-552c-48c4-8f58-5317779830cc has no summary_embedding


After transformations size KnowledgeGraph(nodes: 110, relationships: 27)
Saved to kg_train_text_embed.json.


KnowledgeGraph(nodes: 110, relationships: 27)

## Test set generator

In [31]:
personas = [
    Persona(
    name="Beginner Photoshop User",
    role_description=("Beginner Photoshop user, learning to complete "
                      "simple tasks, use the tools in Photoshop "
                      "and navigate the graphical user interface"),
),
    Persona(
    name="Photoshop trainer",
    role_description=("Experienced trainer in Photoshop. Looking to develop"
                      "step-by-step guides for Photoshop beginners"),
)
]

generator = TestsetGenerator(llm=wrapped_generator_llm, 
                             embedding_model=wrapped_embedding_model, 
                             persona_list=personas,
                             knowledge_graph=kg)

In [32]:

query_distribution = [
        (SingleHopSpecificQuerySynthesizer(llm=wrapped_generator_llm), 0.5),
        (MultiHopAbstractQuerySynthesizer(llm=wrapped_generator_llm), 0.25),
        (MultiHopSpecificQuerySynthesizer(llm=wrapped_generator_llm), 0.25),
]


In [33]:

# testset = generator.generate(
#     testset_size=100, 
#     batch_size=8,
#     num_personas=len(personas),
#     query_distribution=query_distribution)


In [34]:
import create_golden_dataset

create_golden_dataset.create_golden_dataset


<function create_golden_dataset.create_golden_dataset(docs: List[langchain_core.documents.base.Document], testset_size, group_name: str = '', filename: str = '', generator_llm=ChatOpenAI(client=<openai.resources.chat.completions.completions.Completions object at 0x7af5b5524dd0>, async_client=<openai.resources.chat.completions.completions.AsyncCompletions object at 0x7af5b552a250>, root_client=<openai.OpenAI object at 0x7af5b5526690>, root_async_client=<openai.AsyncOpenAI object at 0x7af5b5525090>, model_name='gpt-4.1', model_kwargs={}, openai_api_key=SecretStr('**********')), embedding_model: langchain_core.embeddings.embeddings.Embeddings = OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x7af5b552d6d0>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x7af5b552e250>, model='text-embedding-3-small', dimensions=None, deployment='text-embedding-ada-002', openai_api_version=None, openai_api_base=None, openai_api_type=None, openai_proxy=None, 

In [35]:
test_docs,_,_ = load_VQA_file_from_url("https://huggingface.co/datasets/mbudisic/PsTuts-VQA/raw/main/test.json")

test_dataset= create_golden_dataset.create_golden_dataset(
    test_docs,
    20,
    group_name = "test",
    generator_llm=wrapped_generator_llm,
    embedding_model=wrapped_embedding_model
)


kg_test.json
Initial size KnowledgeGraph(nodes: 11, relationships: 0)


Applying HeadlinesExtractor:   0%|          | 0/10 [00:00<?, ?it/s]

unable to apply transformation: 'LangchainLLMWrapper' object has no attribute 'agenerate_prompt'
unable to apply transformation: 'LangchainLLMWrapper' object has no attribute 'agenerate_prompt'
unable to apply transformation: 'LangchainLLMWrapper' object has no attribute 'agenerate_prompt'
unable to apply transformation: 'LangchainLLMWrapper' object has no attribute 'agenerate_prompt'
unable to apply transformation: 'LangchainLLMWrapper' object has no attribute 'agenerate_prompt'
unable to apply transformation: 'LangchainLLMWrapper' object has no attribute 'agenerate_prompt'
unable to apply transformation: 'LangchainLLMWrapper' object has no attribute 'agenerate_prompt'
unable to apply transformation: 'LangchainLLMWrapper' object has no attribute 'agenerate_prompt'
unable to apply transformation: 'LangchainLLMWrapper' object has no attribute 'agenerate_prompt'
unable to apply transformation: 'LangchainLLMWrapper' object has no attribute 'agenerate_prompt'


Applying HeadlineSplitter:   0%|          | 0/11 [00:00<?, ?it/s]

unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node


Applying SummaryExtractor:   0%|          | 0/10 [00:00<?, ?it/s]

unable to apply transformation: 'LangchainLLMWrapper' object has no attribute 'agenerate_prompt'
unable to apply transformation: 'LangchainLLMWrapper' object has no attribute 'agenerate_prompt'
unable to apply transformation: 'LangchainLLMWrapper' object has no attribute 'agenerate_prompt'
unable to apply transformation: 'LangchainLLMWrapper' object has no attribute 'agenerate_prompt'
unable to apply transformation: 'LangchainLLMWrapper' object has no attribute 'agenerate_prompt'
unable to apply transformation: 'LangchainLLMWrapper' object has no attribute 'agenerate_prompt'
unable to apply transformation: 'LangchainLLMWrapper' object has no attribute 'agenerate_prompt'
unable to apply transformation: 'LangchainLLMWrapper' object has no attribute 'agenerate_prompt'
unable to apply transformation: 'LangchainLLMWrapper' object has no attribute 'agenerate_prompt'
unable to apply transformation: 'LangchainLLMWrapper' object has no attribute 'agenerate_prompt'


Applying CustomNodeFilter: 0it [00:00, ?it/s]

Applying [EmbeddingExtractor, ThemesExtractor, NERExtractor]:   0%|          | 0/10 [00:00<?, ?it/s]

unable to apply transformation: node.property('summary') must be a string, found '<class 'NoneType'>'
unable to apply transformation: node.property('summary') must be a string, found '<class 'NoneType'>'
unable to apply transformation: node.property('summary') must be a string, found '<class 'NoneType'>'
unable to apply transformation: node.property('summary') must be a string, found '<class 'NoneType'>'
unable to apply transformation: node.property('summary') must be a string, found '<class 'NoneType'>'
unable to apply transformation: node.property('summary') must be a string, found '<class 'NoneType'>'
unable to apply transformation: node.property('summary') must be a string, found '<class 'NoneType'>'
unable to apply transformation: node.property('summary') must be a string, found '<class 'NoneType'>'
unable to apply transformation: node.property('summary') must be a string, found '<class 'NoneType'>'
unable to apply transformation: node.property('summary') must be a string, found '

Applying [CosineSimilarityBuilder, OverlapScoreBuilder]:   0%|          | 0/2 [00:00<?, ?it/s]

unable to apply transformation: Node ecdd3922-3b08-4652-aafd-46e56ed225af has no summary_embedding


After transformations size KnowledgeGraph(nodes: 11, relationships: 0)
TestsetGenerator(llm=LangchainLLMWrapper(langchain_llm=LangchainLLMWrapper(...)), embedding_model=LangchainEmbeddingsWrapper(embeddings=LangchainEmbeddingsWrapper(...)), knowledge_graph=KnowledgeGraph(nodes: 11, relationships: 0), persona_list=[Persona(name='Beginner Photoshop User', role_description='Beginner Photoshop user, learning to complete simple tasks, use the tools in Photoshop and navigate the graphical user interface'), Persona(name='Photoshop trainer', role_description='Experienced trainer in Photoshop. Looking to developstep-by-step guides for Photoshop beginners')])


Generating Scenarios:   0%|          | 0/3 [00:00<?, ?it/s]

Batch 1/1:   0%|          | 0/3 [00:00<?, ?it/s]

ValueError: No clusters found in the knowledge graph. Try changing the relationship condition.

In [None]:
# from huggingface_hub import login
# login()
# ragas_dataset = testset.to_hf_dataset()
# ragas_dataset.push_to_hub("mbudisic/pstuts_rag_qa")