# Synthetic RAG eval data

In [3]:
!pip install -qU GitPython langchain langchain-openai llama-index ragas ratelimit tiktoken


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [6]:
import tempfile
from getpass import getpass

from git import Repo
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from ragas.llms import LangchainLLMWrapper
from ragas.testset import TestsetGenerator
from ratelimit import limits, sleep_and_retry

from lib.rag.parse_notebooks import read_notebooks

In [8]:
# https://platform.openai.com/api-keys
OPENAI_API_KEY = getpass("OpenAI API key: ")

In [9]:
class ThrottlingLangchainLLM(LangchainLLMWrapper):
    @sleep_and_retry
    @limits(calls=3, period=60)
    def generate_text(
        self,
        prompts,
        n=1,
        temperature=1e-8,
        callbacks=None,
    ):
        print(f"generate_text with {len(prompts)} prompts, n={n}")
        return super().generate(prompts, n, temperature, callbacks)

In [10]:
# Add custom llms and embeddings
chatgpt = ChatOpenAI(model="gpt-3.5-turbo", openai_api_key=OPENAI_API_KEY)
generator_llm = ThrottlingLangchainLLM(llm=chatgpt)
critic_llm = ThrottlingLangchainLLM(llm=chatgpt)
embeddings_model = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

# Change resulting question type distribution
testset_distribution = {
    "simple": 0.25,
    "reasoning": 0.5,
    "multi_context": 0.0,
    "conditional": 0.25,
}

# percentage of conversational question
chat_qa = 0.2

test_generator = TestsetGenerator(
    generator_llm=generator_llm,
    critic_llm=critic_llm,
    embeddings_model=embeddings_model,
    testset_distribution=testset_distribution,
    chat_qa=chat_qa,
)


def generate_test_dataset(documents, num_test_samples):
    return test_generator.generate(documents, num_test_samples)

TypeError: LangchainLLMWrapper.__init__() got an unexpected keyword argument 'llm'

In [12]:
REPO_URL = "https://github.com/elastic/elasticsearch-labs.git"

repo_dir = tempfile.mkdtemp()
repo = Repo.clone_from(REPO_URL, repo_dir, depth=1)

In [14]:
notebooks = read_notebooks(REPO_URL, repo_dir)
notebooks[:3]

[Document(page_content='\n**Blog: Plagiarism detection with Elasticsearch**\n\n```python\n!pip install elasticsearch==8.11 #Elasticsearch\n```\n\n```python\npip -q install eland elasticsearch sentence_transformers transformers torch==2.1.0\n```\n\n```python\nfrom elasticsearch import Elasticsearch, helpers\nfrom elasticsearch.client import MlClient\nfrom eland.ml.pytorch import PyTorchModel\nfrom eland.ml.pytorch.transformers import TransformerModel\nfrom urllib.request import urlopen\nimport json\nfrom pathlib import Path\nimport getpass\n```\n\n```python\n# Found in the \'Manage Deployment\' page\nCLOUD_ID = getpass.getpass("Enter Elastic Cloud ID:  ")\n\n# Password for the \'elastic\' user generated by Elasticsearch\nELASTIC_PASSWORD = getpass.getpass("Enter Elastic password:  ")\n\n# Create the client instance\nclient = Elasticsearch(\n    cloud_id=CLOUD_ID, basic_auth=("elastic", ELASTIC_PASSWORD), request_timeout=3600\n)\n```\n\n```python\n# Set the model name from Hugging Face a

In [31]:
nb_testset = generate_test_dataset(notebooks[:10], 20)
nb_testset















[A

TestDataset(test_data=[DataRow(question='What does the `reindex` function in the code serve? If successful, what fields will the `dest` index have?', ground_truth_context=['Now we can reindex data from the `source` index `ecommerce` to the `dest` index `ecommerce-search` with the ingest pipeline `ecommerce-pipeline` we created.\nAfter this step our `dest` index will have the fields we need to perform Semantic Search.'], ground_truth=['The `reindex` function in the code serves to transfer data from the `source` index `ecommerce` to the `dest` index `ecommerce-search` using the ingest pipeline `ecommerce-pipeline`. If successful, the `dest` index will have the fields required to perform Semantic Search.'], question_type='conditional', episode_done=True), DataRow(question='What are the model ID and task type for loading the Hugging Face model into Elasticsearch?', ground_truth_context=['When calling `TransformerModel` you specify the HF model id and the task type.\nYou can try specifying 

In [34]:
for data in nb_testset.test_data:
    print(data.question)
    print(data.ground_truth)
    print(data.ground_truth_context)
    print()

What does the `reindex` function in the code serve? If successful, what fields will the `dest` index have?
['The `reindex` function in the code serves to transfer data from the `source` index `ecommerce` to the `dest` index `ecommerce-search` using the ingest pipeline `ecommerce-pipeline`. If successful, the `dest` index will have the fields required to perform Semantic Search.']
['Now we can reindex data from the `source` index `ecommerce` to the `dest` index `ecommerce-search` with the ingest pipeline `ecommerce-pipeline` we created.\nAfter this step our `dest` index will have the fields we need to perform Semantic Search.']

What are the model ID and task type for loading the Hugging Face model into Elasticsearch?
['The model ID for loading the Hugging Face model into Elasticsearch is \'sentence-transformers/all-distilroberta-v1\' and the task type is "text_embedding".']
['When calling `TransformerModel` you specify the HF model id and the task type.\nYou can try specifying `auto` a