# Create a golden dataset using RAGAS

In [127]:
import os
import getpass
from dotenv import load_dotenv
import getpass


In [128]:

# Pull configuration from a local .env file (python-dotenv) before anything
# below inspects os.environ.
load_dotenv()
# Set unconditionally, after load_dotenv(), so this value wins over anything
# loaded from .env. Presumably switches on LangSmith tracing — TODO confirm.
os.environ["LANGCHAIN_TRACING_V2"] = "true"

def set_api_key_if_not_present(key_name, prompt_message):
    """Ensure the environment variable `key_name` holds a non-empty value.

    If the variable is missing or empty, the user is prompted (without echo)
    using `prompt_message` and the typed value is stored in `os.environ`.
    An existing non-empty value is left untouched.
    """
    if os.environ.get(key_name):
        # Already configured (e.g. via .env) — nothing to ask for.
        return
    os.environ[key_name] = getpass.getpass(prompt_message)

# Credentials required by the notebook; each is requested interactively only
# when it is not already available in the environment.
for key_name, prompt_message in (
    ("OPENAI_API_KEY", "OpenAI API Key:"),
    ("TAVILY_API_KEY", "TAVILY_API_KEY:"),
    ("LANGCHAIN_API_KEY", "LANGCHAIN_API_KEY:"),
):
    set_api_key_if_not_present(key_name, prompt_message)

## Data loading

First, we're going to load all of our transcripts in.

In [145]:
from ast import Dict
import json

from pstuts_rag.loader import load_json_files
filenames = ["../data/test.json","../data/dev.json", "../data/train.json"]
filenames = ["../data/test.json","../data/dev.json"]

from typing import List, Dict, Any
data:List[Dict[str,Any]] = await load_json_files(filenames)

from pathlib import Path

data_name = "pstuts_transcripts_"+"_".join(Path(path).stem for path in filenames )
    
print(data_name)



pstuts_transcripts_test_dev


The keys available in each record are listed below. `group` indicates the filename the transcript was loaded from.


In [146]:
# Quick sanity check of the payload: how many records were loaded and which
# fields the first record carries.
print("Number of files:", len(data))
print("File data fields:", data[0].keys())


Number of files: 22
File data fields: dict_keys(['video_id', 'title', 'desc', 'length', 'url', 'transcripts', 'qa', 'group'])


In each file, the `transcripts` field is a list of transcript chunks.

In [147]:
# Inspect the transcript chunks of the first record.
# Single quotes are used for the dictionary keys inside the f-strings:
# reusing the enclosing double quote is only legal from Python 3.12 onwards
# and is a SyntaxError on earlier interpreters.
print(f"Transcripts in first file: {len(data[0]['transcripts'])}")
print(f"Transcript keys: {data[0]['transcripts'][0].keys()}")

Transcripts in first file: 58
Transcript keys: dict_keys(['sent_id', 'sent', 'begin', 'end'])


Now, we will load the documents from transcripts.

In this case, we are going to use the loader that loads one document per video
(as opposed to the `VideoTranscriptChunkLoader`, which loads one document per chunk).

In [148]:
from pstuts_rag.loader import VideoTranscriptBulkLoader


# Turn the raw JSON payload into documents using the bulk (per-video) loader.
loader = VideoTranscriptBulkLoader(json_payload=data)
docs = loader.load()

# Report both counts side by side so a mismatch is easy to spot.
print("# of documents: {}. # of videos: {}".format(len(docs), len(data)))

# of documents: 22. # of videos: 22


## Building the knowledge graph

In [149]:
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from ragas.testset.graph import KnowledgeGraph
from ragas.testset.graph import Node, NodeType
from ragas.testset.transforms import default_transforms, apply_transforms


In [150]:
# A single wrapped chat model is shared by test-set generation
# (`generator_llm`) and the knowledge-graph transforms (`transformer_llm`).
generator_llm = transformer_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4.1-mini"))

# Likewise, one wrapped embedding model is exposed under both names.
generator_embeddings = embedding_model = LangchainEmbeddingsWrapper(OpenAIEmbeddings())


In [151]:
# Location of the cached knowledge graph: ../data/kg_<data_name>.json
root = Path("../data")
kg_filename = Path(f"kg_{data_name}.json")
kg_path = root / kg_filename

In [152]:
# Reuse a previously saved knowledge graph when one can be read from
# `kg_path`; otherwise build it from the loaded documents and save it so the
# expensive transformations are not repeated on the next run.
try:
    kg = KnowledgeGraph.load(kg_path)
    print(f"Loaded from {kg_path}.")
except Exception:
    # `Exception` instead of a bare `except:` so that KeyboardInterrupt and
    # SystemExit still stop the notebook rather than silently triggering a
    # full regeneration.
    print(f"{kg_path} does not contain a knowledge graph. Generating.")
    kg = KnowledgeGraph()

    # Seed the graph with one DOCUMENT node per loaded document.
    for doc in docs:
        kg.nodes.append(
            Node(
                type=NodeType.DOCUMENT,
                properties={
                    "page_content": doc.page_content,
                    "document_metadata": doc.metadata,
                },
            )
        )
    print(f"Initial size {str(kg)}")

    # Bound to a new name on purpose: assigning the result back to
    # `default_transforms` would shadow the imported factory function and make
    # this cell fail when it is executed a second time.
    kg_transforms = default_transforms(
        documents=docs,
        llm=transformer_llm,
        embedding_model=embedding_model,
    )
    apply_transforms(kg, kg_transforms)
    print(f"After transformations size {str(kg)}")

    kg.save(kg_path)
    print(f"Saved to {kg_path}.")

kg

../data/kg_pstuts_transcripts_test_dev.json does not contain a knowledge graph. Generating.
Initial size KnowledgeGraph(nodes: 22, relationships: 0)


Applying HeadlinesExtractor:   0%|          | 0/21 [00:00<?, ?it/s]

Applying HeadlineSplitter:   0%|          | 0/22 [00:00<?, ?it/s]

unable to apply transformation: 'headlines' property not found in this node


Applying SummaryExtractor:   0%|          | 0/37 [00:00<?, ?it/s]

Property 'summary' already exists in node '9c97f4'. Skipping!
Property 'summary' already exists in node '8a8e28'. Skipping!
Property 'summary' already exists in node 'dcd67c'. Skipping!
Property 'summary' already exists in node '876b13'. Skipping!
Property 'summary' already exists in node '8898c4'. Skipping!
Property 'summary' already exists in node '3f46e3'. Skipping!
Property 'summary' already exists in node '118e6c'. Skipping!
Property 'summary' already exists in node '9c28dd'. Skipping!
Property 'summary' already exists in node 'd9cbc0'. Skipping!
Property 'summary' already exists in node 'a0f2e0'. Skipping!
Property 'summary' already exists in node '8679a1'. Skipping!
Property 'summary' already exists in node '5403e1'. Skipping!
Property 'summary' already exists in node 'a71e28'. Skipping!
Property 'summary' already exists in node '767485'. Skipping!
Property 'summary' already exists in node '2b676d'. Skipping!
Property 'summary' already exists in node '4dd9a1'. Skipping!


Applying CustomNodeFilter:   0%|          | 0/12 [00:00<?, ?it/s]

Applying [EmbeddingExtractor, ThemesExtractor, NERExtractor]:   0%|          | 0/57 [00:00<?, ?it/s]

Property 'summary_embedding' already exists in node '9c97f4'. Skipping!
Property 'summary_embedding' already exists in node 'dcd67c'. Skipping!
Property 'summary_embedding' already exists in node '876b13'. Skipping!
Property 'summary_embedding' already exists in node 'a0f2e0'. Skipping!
Property 'summary_embedding' already exists in node 'd9cbc0'. Skipping!
Property 'summary_embedding' already exists in node '118e6c'. Skipping!
Property 'summary_embedding' already exists in node '8898c4'. Skipping!
Property 'summary_embedding' already exists in node '8679a1'. Skipping!
Property 'summary_embedding' already exists in node '8a8e28'. Skipping!
Property 'summary_embedding' already exists in node '3f46e3'. Skipping!
Property 'summary_embedding' already exists in node '9c28dd'. Skipping!
Property 'summary_embedding' already exists in node '5403e1'. Skipping!
Property 'summary_embedding' already exists in node 'a71e28'. Skipping!
Property 'summary_embedding' already exists in node '767485'. Sk

Applying [CosineSimilarityBuilder, OverlapScoreBuilder]:   0%|          | 0/2 [00:00<?, ?it/s]

After transformations size KnowledgeGraph(nodes: 48, relationships: 695)
Saved to ../data/kg_pstuts_transcripts_test_dev.json.


KnowledgeGraph(nodes: 48, relationships: 695)

## Test set generator

In [153]:
from ragas.testset import TestsetGenerator
# Imported in this cell because `Persona` is used right below; relying on an
# import from a later cell raises NameError on a fresh kernel.
from ragas.testset.persona import Persona

# Personas on whose behalf the synthetic questions are written.
personas = [
    Persona(
        name="Beginner Photoshop User",
        role_description=(
            "Beginner Photoshop user, learning to complete "
            "simple tasks, use the tools in Photoshop "
            "and navigate the graphical user interface"
        ),
    ),
    Persona(
        name="Photoshop trainer",
        # Trailing space after "develop" keeps the two adjacent string
        # literals from fusing into "developstep-by-step".
        role_description=(
            "Experienced trainer in Photoshop. Looking to develop "
            "step-by-step guides for Photoshop beginners"
        ),
    ),
]

# Generator operating on the knowledge graph built above.
generator = TestsetGenerator(
    llm=generator_llm,
    embedding_model=embedding_model,
    persona_list=personas,
    knowledge_graph=kg,
)

In [154]:
from ragas.testset.synthesizers import default_query_distribution, SingleHopSpecificQuerySynthesizer, MultiHopAbstractQuerySynthesizer, MultiHopSpecificQuerySynthesizer
from ragas.testset.persona import Persona
# (synthesizer, weight) pairs: 80% single-hop specific questions, with the
# remaining 20% split evenly between the two multi-hop synthesizers.
query_distribution = [
    (synthesizer_cls(llm=generator_llm), weight)
    for synthesizer_cls, weight in (
        (SingleHopSpecificQuerySynthesizer, 0.8),
        (MultiHopAbstractQuerySynthesizer, 0.1),
        (MultiHopSpecificQuerySynthesizer, 0.1),
    )
]


In [159]:

# Request 100 synthetic samples, generated in batches of 8, using every
# persona defined above and the custom query distribution.
testset = generator.generate(
    query_distribution=query_distribution,
    num_personas=len(personas),
    batch_size=8,
    testset_size=100,
)


Generating Scenarios:   0%|          | 0/3 [00:00<?, ?it/s]

Batch 1/1:   0%|          | 0/3 [00:00<?, ?it/s]

Generating Samples:   0%|          | 0/100 [00:00<?, ?it/s]

Batch 1/13:   0%|          | 0/8 [00:00<?, ?it/s]

In [160]:
# Preview only the generated question and its reference answer.
testset.to_pandas().loc[:, ["user_input", "reference"]]

Unnamed: 0,user_input,reference
0,How I can use Move tool to move many layers at...,If you have the Move tool selected in Photosho...
1,How I can use Windows key for select layers in...,"In Photoshop, when selecting layers to put int..."
2,How I select layers in Windows for group in Ph...,"In Windows, to select layers for grouping in P..."
3,how i make group in adobe photoshop to reduce ...,"In Adobe Photoshop, you can reduce clutter in ..."
4,Wut is Group 1 in Photoshop and how do I use it?,Group 1 in Photoshop is a folder created by cl...
...,...,...
95,How do you use the Rectangular Marquee tool to...,To use the Rectangular Marquee tool to make a ...
96,How do you use the Select menu to deselect a s...,After making a selection with the Rectangular ...
97,"In Photoshop CC, how can a beginner add extra ...","In Photoshop CC, to add extra canvas space to ..."
98,How can I add extra pixels to just one side of...,To add extra pixels to just one side of an ima...


In [161]:
# Upload the generated testset; the call returns the URL at which the
# uploaded testset can be viewed (shown as the cell output).
testset.upload()

Testset uploaded! View at https://app.ragas.io/dashboard/alignment/testset/94d276d0-95aa-4305-839c-c846af514e2e


'https://app.ragas.io/dashboard/alignment/testset/94d276d0-95aa-4305-839c-c846af514e2e'

In [162]:
# Display the knowledge graph summary (node and relationship counts).
kg

KnowledgeGraph(nodes: 48, relationships: 695)

In [165]:
from huggingface_hub import login
# Interactive Hugging Face Hub authentication (renders a login widget).
login()
# Convert the generated testset into a Hugging Face dataset object.
ragas_dataset = testset.to_hf_dataset()
# Publishing is disabled: uncomment the next line to push the dataset to the
# Hub repository named below.
# ragas_dataset.push_to_hub("mbudisic/pstuts_rag_qa")

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/mbudisic/pstuts_rag_qa/commit/6b0d6bb3d96c1d26e5678fd1b781401047ea5d92', commit_message='Upload dataset', commit_description='', oid='6b0d6bb3d96c1d26e5678fd1b781401047ea5d92', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/mbudisic/pstuts_rag_qa', endpoint='https://huggingface.co', repo_type='dataset', repo_id='mbudisic/pstuts_rag_qa'), pr_revision=None, pr_num=None)