In [1]:
# Uncomment to install the packages this notebook imports.
# !pip install -U -q langchain openai ragas arxiv pymupdf chromadb wandb tiktoken

Evaluation with RAGAS and Advanced Retrieval Methods Using LangChain
API connection

In [2]:
import os
import openai
from getpass import getpass

# Never commit a literal API key to a notebook. Use the key already present in
# the environment, and only prompt for it (without echoing) when it is missing.
# SECURITY: the key that used to be hardcoded here has been exposed and must be
# revoked and rotated.
if not os.environ.get("OPENAI_API_KEY"):
  os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

Data Collection
We use papers from arXiv, loaded with LangChain's ArxivLoader document loader.

Let's grab and load 5 documents.

In [3]:
from langchain.document_loaders import ArxivLoader

# Configure the loader first, then load; load_max_docs caps the number of papers at 5.
arxiv_loader = ArxivLoader(query="Retrieval Augmented Generation", load_max_docs=5)
base_docs = arxiv_loader.load()
len(base_docs)

5

In [4]:
# NOTE(review): echoing the whole list prints every paper's full text into the
# cell output; consider previewing a single document or a slice instead.
base_docs

[Document(page_content='A Survey on Retrieval-Augmented Text Generation\nHuayang Li♥,∗\nYixuan Su♠,∗\nDeng Cai♦,∗\nYan Wang♣,∗\nLemao Liu♣,∗\n♥Nara Institute of Science and Technology\n♠University of Cambridge\n♦The Chinese University of Hong Kong\n♣Tencent AI Lab\nli.huayang.lh6@is.naist.jp, ys484@cam.ac.uk\nthisisjcykcd@gmail.com, brandenwang@tencent.com\nlemaoliu@gmail.com\nAbstract\nRecently, retrieval-augmented text generation\nattracted increasing attention of the compu-\ntational linguistics community.\nCompared\nwith conventional generation models, retrieval-\naugmented text generation has remarkable ad-\nvantages and particularly has achieved state-of-\nthe-art performance in many NLP tasks. This\npaper aims to conduct a survey about retrieval-\naugmented text generation. It ﬁrstly highlights\nthe generic paradigm of retrieval-augmented\ngeneration, and then it reviews notable ap-\nproaches according to different tasks including\ndialogue response generation, machine trans-\nl

Creating an Index
We use a naive index-creation strategy: split the documents with RecursiveCharacterTextSplitter and embed each chunk into the vector store using OpenAIEmbeddings().

In [5]:


from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Instantiate OpenAIEmbeddings. The key is read from the environment instead of
# being written into the notebook; a missing key fails loudly with a KeyError.
openai_embeddings = OpenAIEmbeddings(openai_api_key=os.environ["OPENAI_API_KEY"])

# Initialize the text splitter.
# NOTE(review): chunk_overlap is left at the library default. In the LangChain
# versions I am aware of that default is 200, i.e. most of each 250-character
# chunk overlaps its neighbour -- confirm this is intended.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=250)

# Split the documents into chunks
docs = text_splitter.split_documents(base_docs)

# Create the Chroma vectorstore with the chunks and the OpenAI embeddings instance
vectorstore = Chroma.from_documents(docs, openai_embeddings)
vectorstore

  warn_deprecated(


<langchain_community.vectorstores.chroma.Chroma at 0x7fe140bbfc10>

In [6]:
# Number of chunks produced by the splitter.
len(docs)

4756

In [7]:
# Length of the longest chunk; a generator avoids building a throwaway list.
print(max(len(piece.page_content) for piece in docs))

249


Next, we convert our Chroma vector store into a retriever with the .as_retriever() method.

In [8]:
# Expose the vector store as a retriever; search_kwargs requests k=2 results per query.
base_retriever = vectorstore.as_retriever(search_kwargs={"k" : 2})

Let's give it a test!

In [9]:
# Run a sample query through the retriever to check that it returns documents.
relevant_docs = base_retriever.get_relevant_documents("What is Retrieval Augmented Generation?")

In [10]:
# Should match the k requested when the retriever was created.
len(relevant_docs)

2

Creating a Retrieval Augmented Generation Prompt
Now we can set up a prompt template that provides the LLM with the retrieved contexts, the user query, and instructions.

In [11]:
from langchain.prompts import ChatPromptTemplate

# RAG prompt: {context} and {question} are filled in when the prompt is formatted.
# The model is told to answer only from the context and to reply 'I don't know' otherwise.
template = """Answer the question based only on the following context. If you cannot answer the question with the context, please respond with 'I don't know':

### CONTEXT
{context}

### QUESTION
Question: {question}
"""

# Build the chat prompt from the template string and display it.
prompt = ChatPromptTemplate.from_template(template)
prompt

ChatPromptTemplate(input_variables=['context', 'question'], messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template="Answer the question based only on the following context. If you cannot answer the question with the context, please respond with 'I don't know':\n\n### CONTEXT\n{context}\n\n### QUESTION\nQuestion: {question}\n"))])

Setting Up the Basic QA Chain
Now we instantiate the basic RAG chain.

In [12]:

from operator import itemgetter

from langchain.chat_models import ChatOpenAI
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnableLambda, RunnablePassthrough

# Read the key from the environment; never paste a literal key into the notebook.
api_key = os.environ["OPENAI_API_KEY"]

primary_qa_llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0, openai_api_key=api_key)


def _format_context(documents):
  """Join the retrieved chunks' text so the prompt sees chunk content only.

  Formatting the raw Document list into the prompt would also inject each
  document's metadata (including the paper summary), letting the model answer
  from text that is not part of the retrieved context.
  """
  return "\n\n".join(document.page_content for document in documents)


retrieval_augmented_qa_chain = (
    # INVOKE CHAIN WITH: {"question" : "<<SOME USER QUESTION>>"}
    # "question" : populated by getting the value of the "question" key
    # "context"  : populated by getting the value of the "question" key and chaining it into the base_retriever
    {"context": itemgetter("question") | base_retriever, "question": itemgetter("question")}
    # "response" : the retrieved documents are flattened to plain text, used together with the
    #              "question" value to format the prompt, and piped into the LLM
    # "context"  : the retrieved Document objects from the previous step, passed through unchanged
    | {
        "response": RunnablePassthrough.assign(
            context=itemgetter("context") | RunnableLambda(_format_context)
        )
        | prompt
        | primary_qa_llm,
        "context": itemgetter("context"),
    }
)

  warn_deprecated(


Let's test it out!

In [13]:
# Smoke-test the chain with a single question.
question = "What is RAG?"

# The chain returns a dict with the model's "response" and the retrieved "context" documents.
result = retrieval_augmented_qa_chain.invoke({"question" : question})

print(result)

{'response': AIMessage(content='RAG stands for Retrieval Augmented Generation.'), 'context': [Document(page_content='2020). RAG consists of three primary components:\nTool Retrieval, Plan Generation, and Execution.1\nIn this study, we focus on enhancing tool retrieval,\nwith the goal of achieving subsequent improve-\nments in plan generation.', metadata={'Authors': 'Raviteja Anantha, Tharun Bethi, Danil Vodianik, Srinivas Chappidi', 'Published': '2023-12-09', 'Summary': "Large language models (LLMs) have the remarkable ability to solve new tasks\nwith just a few examples, but they need access to the right tools. Retrieval\nAugmented Generation (RAG) addresses this problem by retrieving a list of\nrelevant tools for a given task. However, RAG's tool retrieval step requires\nall the required information to be explicitly present in the query. This is a\nlimitation, as semantic search, the widely adopted tool retrieval method, can\nfail when the query is incomplete or lacks context. To add

Ground Truth Dataset Creation Using GPT-3.5-turbo and GPT-4
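A saved copy of the evaluation dataset can be reloaded from groundtruth_eval_dataset.csv (see the commented-out cell further down) instead of regenerating it.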

The basic idea is that we can use LangChain to create questions based on our contexts, and then answer those questions.

Let's look at how that works in the code!

In [14]:
from langchain.output_parsers import ResponseSchema
from langchain.output_parsers import StructuredOutputParser

# Single-field schema: the model must return one "question" entry.
question_schema = ResponseSchema(name="question", description="a question about the context.")

question_response_schemas = [question_schema]

In [15]:
# Parser for the model's reply, built from the question schema.
question_output_parser = StructuredOutputParser.from_response_schemas(question_response_schemas)
# Formatting instructions that match the parser.
# NOTE: `format_instructions` is rebound later in the notebook for the answer parser.
format_instructions = question_output_parser.get_format_instructions()

In [16]:
from langchain.chat_models import ChatOpenAI

# Read the key from the environment; never paste a literal key into the notebook.
api_key = os.environ["OPENAI_API_KEY"]

question_generation_llm = ChatOpenAI(model="gpt-3.5-turbo-16k", openai_api_key=api_key)
# Pass-through template: whatever is supplied as "content" becomes the whole prompt.
bare_prompt_template = "{content}"
bare_template = ChatPromptTemplate.from_template(template=bare_prompt_template)

In [17]:
from langchain.prompts import ChatPromptTemplate

# The template now contains a {format_instructions} placeholder. Previously the
# value was passed to format_messages but had nowhere to go, so the parser's
# formatting instructions never reached the model.
qa_template = """\
You are a University Professor creating a test for advanced students. For each context, create a question that is specific to the context. Avoid creating generic or general questions.

question: a question about the context.

{format_instructions}

context: {context}
"""

prompt_template = ChatPromptTemplate.from_template(template=qa_template)

# Pass only the chunk text. Formatting the Document object itself would also
# inject its metadata (including the paper summary), producing questions that
# cannot be answered from the chunk stored as the context.
messages = prompt_template.format_messages(
    context=docs[0].page_content,
    format_instructions=format_instructions
)

# The chain takes the already-formatted messages under the "content" key.
question_generation_chain = bare_template | question_generation_llm

response = question_generation_chain.invoke({"content" : messages})
output_dict = question_output_parser.parse(response.content)

In [18]:
# Show each parsed field name followed by its value, one per line.
for field_name, field_value in output_dict.items():
  print(field_name, field_value, sep="\n")

question
What is the main focus of this paper?


In [19]:
!pip install -q -U tqdm

In [20]:
from tqdm import tqdm

# Generate one question per chunk for the first 10 chunks.
# NOTE: this cell relies on `prompt_template` and `format_instructions` as bound
# by the question-generation cells; both names are rebound later for answers.
qac_triples = []

for text in tqdm(docs[:10]):
  # Pass only the chunk text: formatting the Document itself would leak its
  # metadata (including the paper summary) into the prompt, so the generated
  # question could not be answered from the chunk stored as its context.
  messages = prompt_template.format_messages(
      context=text.page_content,
      format_instructions=format_instructions
  )
  response = question_generation_chain.invoke({"content" : messages})
  try:
    output_dict = question_output_parser.parse(response.content)
  except Exception as e:
    # Best effort: skip replies that cannot be parsed, but say so instead of
    # dropping the chunk silently.
    tqdm.write(f"Skipping chunk: could not parse generated question ({e})")
    continue
  # Keep the full Document so later cells can still read page_content and metadata.
  output_dict["context"] = text
  qac_triples.append(output_dict)

100%|██████████| 10/10 [00:14<00:00,  1.41s/it]


In [21]:
# Inspect one generated question together with the Document it was generated from.
qac_triples[5]

{'question': 'What is the focus of this paper?',
 'context': Document(page_content='thisisjcykcd@gmail.com, brandenwang@tencent.com\nlemaoliu@gmail.com\nAbstract\nRecently, retrieval-augmented text generation\nattracted increasing attention of the compu-\ntational linguistics community.\nCompared', metadata={'Published': '2022-02-13', 'Title': 'A Survey on Retrieval-Augmented Text Generation', 'Authors': 'Huayang Li, Yixuan Su, Deng Cai, Yan Wang, Lemao Liu', 'Summary': 'Recently, retrieval-augmented text generation attracted increasing attention\nof the computational linguistics community. Compared with conventional\ngeneration models, retrieval-augmented text generation has remarkable\nadvantages and particularly has achieved state-of-the-art performance in many\nNLP tasks. This paper aims to conduct a survey about retrieval-augmented text\ngeneration. It firstly highlights the generic paradigm of retrieval-augmented\ngeneration, and then it reviews notable approaches according to di

In [22]:
from langchain.chat_models import ChatOpenAI
# Other necessary imports...

# Read the key from the environment; never paste a literal key into the notebook.
api_key = os.environ["OPENAI_API_KEY"]

answer_generation_llm = ChatOpenAI(model="gpt-4-1106-preview", temperature=0, openai_api_key=api_key)

# Single-field schema: the model must return one "answer" entry.
answer_schema = ResponseSchema(
    name="answer",
    description="an answer to the question"
)

answer_response_schemas = [
    answer_schema,
]

answer_output_parser = StructuredOutputParser.from_response_schemas(answer_response_schemas)
# NOTE: from here on `format_instructions`, `qa_template` and `prompt_template`
# refer to answer generation; the question-generation cells above must not be
# re-run without re-running their own setup first.
format_instructions = answer_output_parser.get_format_instructions()

# The template now contains a {format_instructions} placeholder. Previously the
# value was passed to format_messages but had nowhere to go, so the parser's
# formatting instructions never reached the model.
qa_template = """\
You are a University Professor creating a test for advanced students. For each question and context, create an answer.

answer: a answer about the context.

{format_instructions}

question: {question}
context: {context}
"""

prompt_template = ChatPromptTemplate.from_template(template=qa_template)

# Pass only the chunk text so the answer is grounded in the same text that is
# later stored as the ground-truth context (not in the Document's metadata).
messages = prompt_template.format_messages(
    context=qac_triples[0]["context"].page_content,
    question=qac_triples[0]["question"],
    format_instructions=format_instructions
)

# The chain takes the already-formatted messages under the "content" key.
answer_generation_chain = bare_template | answer_generation_llm

response = answer_generation_chain.invoke({"content" : messages})
output_dict = answer_output_parser.parse(response.content)

In [23]:
# Show each parsed field name followed by its value, one per line.
for field_name, field_value in output_dict.items():
  print(field_name, field_value, sep="\n")

answer
The advantages of retrieval-augmented text generation compared to conventional generation models include the ability to incorporate external knowledge, which can lead to more informative and contextually relevant outputs. This approach has achieved state-of-the-art performance in various NLP tasks by leveraging retrieved information to enhance the generation process. Additionally, it can improve the diversity and factual accuracy of the generated text, as it can draw upon a wide range of sources for content enrichment.
question
What are the advantages of retrieval-augmented text generation compared to conventional generation models?


In [24]:
# Generate a ground-truth answer for every question/context pair.
for triple in tqdm(qac_triples):
  # Pass only the chunk text so the answer is grounded in the text that is
  # later stored as the context, not in the Document's metadata.
  messages = prompt_template.format_messages(
      context=triple["context"].page_content,
      question=triple["question"],
      format_instructions=format_instructions
  )
  response = answer_generation_chain.invoke({"content" : messages})
  try:
    output_dict = answer_output_parser.parse(response.content)
  except Exception as e:
    # Best effort: skip replies that cannot be parsed, but say so instead of
    # failing silently.
    tqdm.write(f"No answer for question {triple['question']!r}: could not parse reply ({e})")
    continue
  triple["answer"] = output_dict["answer"]

# Drop triples that never received an answer; otherwise they would surface as
# missing ground-truth values when the evaluation dataset is built.
qac_triples = [triple for triple in qac_triples if "answer" in triple]

100%|██████████| 10/10 [00:55<00:00,  5.59s/it]


In [25]:
!pip install -q -U datasets

In [26]:
import pandas as pd
from datasets import Dataset

# Build the frame in one chain: unwrap each Document to its text, then rename
# the generated answers to the "ground_truth" column.
ground_truth_qac_set = (
    pd.DataFrame(qac_triples)
    .assign(context=lambda frame: frame["context"].map(lambda doc: str(doc.page_content)))
    .rename(columns={"answer" : "ground_truth"})
)

# Convert the frame into a Dataset for the evaluation steps.
eval_dataset = Dataset.from_pandas(ground_truth_qac_set)

  from .autonotebook import tqdm as notebook_tqdm


In [27]:
# Show the dataset's columns and row count.
eval_dataset

Dataset({
    features: ['question', 'context', 'ground_truth'],
    num_rows: 10
})

In [28]:
# Inspect the first question / context / ground_truth record.
eval_dataset[0]

{'question': 'What are the advantages of retrieval-augmented text generation compared to conventional generation models?',
 'context': 'A Survey on Retrieval-Augmented Text Generation\nHuayang Li♥,∗\nYixuan Su♠,∗\nDeng Cai♦,∗\nYan Wang♣,∗\nLemao Liu♣,∗\n♥Nara Institute of Science and Technology\n♠University of Cambridge\n♦The Chinese University of Hong Kong\n♣Tencent AI Lab',
 'ground_truth': 'The advantages of retrieval-augmented text generation compared to conventional generation models include the ability to incorporate external knowledge, which can lead to more informative and contextually relevant outputs. This approach has shown state-of-the-art performance in various NLP tasks by leveraging retrieved content to enhance the generation process. Additionally, it can improve the diversity and factual accuracy of the generated text, as it can draw upon a wide range of sources for information.'}

In [29]:
# Persist the ground-truth set to CSV so it can be reloaded without regenerating it.
eval_dataset.to_csv("groundtruth_eval_dataset.csv")

Creating CSV from Arrow format: 100%|██████████| 1/1 [00:00<00:00,  9.55ba/s]


7836

In [30]:
# Optional: uncomment to reload the saved ground-truth set instead of regenerating it.
# from datasets import Dataset
# eval_dataset = Dataset.from_csv("groundtruth_eval_dataset.csv")

In [31]:
# Confirm the dataset is in scope (freshly built, or reloaded if the optional reload cell was used).
eval_dataset

Dataset({
    features: ['question', 'context', 'ground_truth'],
    num_rows: 10
})

Evaluation Using RAGAS

The set-up is fairly straightforward - we simply need to create a dataset with our generated answers and our contexts, and then evaluate using the framework.

In [33]:


# Reuse the key that is already in the environment instead of embedding a
# literal key in the notebook; a missing key fails loudly with a KeyError.
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
openai.api_key = OPENAI_API_KEY


# openai_embeddings = OpenAIEmbeddings(api_key=api_key)


from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
    context_relevancy,
    answer_correctness,
    answer_similarity
)

from ragas.metrics.critique import harmfulness
from ragas import evaluate

def create_ragas_dataset(rag_pipeline, eval_dataset):
  """Run `rag_pipeline` on every question in `eval_dataset` and collect the results.

  Each row of `eval_dataset` must provide "question" and "ground_truth". The
  pipeline is invoked with {"question": ...} and must return a mapping with a
  "response" (exposing `.content`) and a "context" (an iterable of items
  exposing `.page_content`).

  Returns a Dataset with the columns "question", "answer", "contexts" and
  "ground_truths".
  """
  def _to_record(row):
    # One pipeline call per row; keep the answer text and the retrieved chunk texts.
    output = rag_pipeline.invoke({"question" : row["question"]})
    return {
        "question" : row["question"],
        "answer" : output["response"].content,
        "contexts" : [doc.page_content for doc in output["context"]],
        "ground_truths" : [row["ground_truth"]],
    }

  records = [_to_record(row) for row in tqdm(eval_dataset)]
  return Dataset.from_pandas(pd.DataFrame(records))

def evaluate_ragas_dataset(ragas_dataset):
  """Score `ragas_dataset` with the notebook's fixed set of ragas metrics.

  Returns whatever `evaluate` returns for the given dataset and metrics.
  """
  selected_metrics = [
      context_precision,
      faithfulness,
      answer_relevancy,
      context_recall,
      context_relevancy,
      answer_correctness,
      answer_similarity,
  ]
  return evaluate(ragas_dataset, metrics=selected_metrics)