## RAG Evaluation

In [4]:
# Uncomment the following line if you need to initialize FAISS with no AVX2 optimization
# os.environ['FAISS_NO_AVX2'] = '1'

from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings

from langchain_text_splitters import CharacterTextSplitter

loader = TextLoader("state_of_the_union.txt")
documents = loader.load()
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(documents)

In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n","\n", " "],
    chunk_size = 1000,
    chunk_overlap = 10,
    is_separator_regex=False
)

chunk_list = text_splitter.create_documents(texts=[docs[0].page_content])

In [6]:
embeddings =  HuggingFaceEmbeddings()
db = FAISS.from_documents(chunk_list, embeddings)
print(db.index.ntotal)

45


In [9]:
query ="What did the president say about gun violence?"
docs = db.similarity_search(query)

In [10]:
docs 

[Document(page_content='violence interruption trusted messengers breaking the cycle of violence and trauma and giving young people hope.  ","","We should all agree: The answer is not to Defund the police. The answer is to FUND the police with the resources and training they need to protect our communities. ","","I ask Democrats and Republicans alike: Pass my budget and keep our neighborhoods safe.  ","","And I will keep doing everything in my power to crack down on gun trafficking and ghost guns you can buy online and make at home they have no serial numbers and can t be traced. ","","And I ask Congress to pass proven measures to reduce gun violence. Pass universal background checks. Why should anyone on a terrorist list be able to purchase a weapon? ","","Ban assault weapons and high-capacity magazines. ","","Repeal the liability shield that makes gun manufacturers the only industry in America that can t be sued. ","","These laws don t infringe on the Second Amendment. They save lives

In [11]:
retriever = db.as_retriever()

In [12]:
#!pip install -U langchain-community langchain-cohere

In [13]:
from langchain_openai import AzureChatOpenAI

llm = AzureChatOpenAI(
    azure_deployment="gpt35turbo16kdeployment",
    api_version="2024-05-01-preview",
    api_key="-------------------------", #PUT YOUR API KEY HERE
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
    # other params...
)

In [20]:
from langchain.chains import RetrievalQA
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever= db.as_retriever(search_kwargs={"k":2}), return_source_documents=True)

In [21]:
response = qa("What did the president say about gun violence?")


In [22]:
response['result']

'The president said that he will do everything in his power to crack down on gun trafficking and ghost guns. He also asked Congress to pass measures such as universal background checks, banning assault weapons and high-capacity magazines, and repealing the liability shield for gun manufacturers. He emphasized that these laws do not infringe on the Second Amendment but rather save lives.'

## Ragas For RAG Evaluation

### Creating dataset

##### Creating QA pairs

In [35]:
from datasets import Dataset
questions = [
    "What did last year’s American Rescue Plan accomplish?",
    "What did the president say about Intel ?",
    "What did the president say about gun violence?"]

ground_truths = [
    "The American Rescue Plan provided economic relief, helped vaccinate the nation, and created over 6.5 million new jobs.",
    "The president said that Pat Gelsinger is ready to increase Intel's investment to $100 billion.",
     "The president asked Congress to pass proven measures to reduce gun violence."]


##### Querying the LLM for Retrieval Augmented Generations and fetching contexts used

In [36]:
answers=[]
contexts=[]
for query in questions:
    
    
    print("Query:", query)
    
    answer = qa.invoke(query)
    print("Answer:", answer)
    
    retrieved_docs = retriever.get_relevant_documents(query)
    
    print("Retrieved Docs:", retrieved_docs)
    
    context_texts = ' '.join([doc.page_content for doc in retrieved_docs])
    print("Context Texts:", [context_texts])
    
    answers.append(answer)
    contexts.append(context_texts)


Query: What did last year’s American Rescue Plan accomplish?
Answer: {'query': 'What did last year’s American Rescue Plan accomplish?', 'result': "Last year's American Rescue Plan accomplished several things. It provided immediate economic relief for millions of Americans, helping them with food, housing, and healthcare costs. It also played a crucial role in the nation's vaccination efforts and combating the COVID-19 pandemic. Additionally, the plan created over 6.5 million new jobs, leading to significant job growth. The economy grew at a rate of 5.7%, the strongest in nearly 40 years. Overall, the American Rescue Plan aimed to address the economic challenges faced by working people and provide them with some breathing room.", 'source_documents': [Document(page_content='breathing room. ","","And unlike the $2 Trillion tax cut passed in the previous administration that benefitted the top 1% of Americans, the American Rescue Plan helped working people and left no one behind. ","","And 

In [2]:
#contexts

##### Results for the given queries

In [37]:
results = [answer['result'] + "                                                     " for answer in answers]
print(results)

["Last year's American Rescue Plan accomplished several things. It provided immediate economic relief for millions of Americans, helping them with food, housing, and healthcare costs. It also played a crucial role in the nation's vaccination efforts and combating the COVID-19 pandemic. Additionally, the plan created over 6.5 million new jobs, leading to significant job growth. The economy grew at a rate of 5.7%, the strongest in nearly 40 years. Overall, the American Rescue Plan aimed to address the economic challenges faced by working people and provide them with some breathing room.                                                     ", "The president mentioned that Intel, the American company that helped build Silicon Valley, is planning to build a $20 billion semiconductor mega site. He also mentioned that Intel's CEO is ready to increase their investment from $20 billion to $100 billion, which would be one of the biggest investments in manufacturing in American history. The presid

#### RAGAS for RAG Evaluation

RAGAS is a powerful framework designed to evaluate the performance of Retrieval Augmented Generation (RAG) pipelines. It provides a structured approach to assessing various components of RAG systems, including the retriever and the generator.

Some of the metrics are as follows:

**context_precision:** Measures how relevant the retrieved context is to the query.

**context_recall:** Determines how much relevant information from the ground truth is covered by the retrieved context.

**answer_relevancy:** Assesses how relevant the generated answer is to the given query.

**faithfulness:** Evaluates how well the generated answer aligns with the information in the retrieved context.

**harmfulness:** Measures the potential negative impact of the generated answer, such as toxicity or bias.

#### Set the metrics to be used and Create Dataset

In [38]:
import pandas as pd
import asyncio
import time

# Import Mertics

from ragas.metrics import (
    context_precision,
    answer_relevancy,
    faithfulness,
    context_recall,
)
from ragas.metrics.critique import harmfulness

# list of metrics we're going to use
metrics = [
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
    harmfulness,
]
from ragas import evaluate
from datasets import Dataset

# Sample DataFrame
data = {
    "question": questions,
    "answer": results,
    'contexts': contexts,
    'ground_truth': ground_truths
}

df = pd.DataFrame(data)

# Ensure 'contexts' is of type Sequence[string]
df['contexts'] = df['contexts'].apply(lambda x: [x] if isinstance(x, str) else x)

# Convert DataFrame to dataset format expected by Ragas
dataset = Dataset.from_pandas(df)


In [39]:
df 

Unnamed: 0,question,answer,contexts,ground_truth
0,What did last year’s American Rescue Plan acco...,Last year's American Rescue Plan accomplished ...,"[breathing room. "","""",""And unlike the $2 Trill...",The American Rescue Plan provided economic rel...
1,What did the president say about Intel ?,"The president mentioned that Intel, the Americ...","[won t look like much, but if you stop and loo...",The president said that Pat Gelsinger is ready...
2,What did the president say about gun violence?,The president said that he will do everything ...,[violence interruption trusted messengers brea...,The president asked Congress to pass proven me...


In [40]:
dataset

Dataset({
    features: ['question', 'answer', 'contexts', 'ground_truth'],
    num_rows: 3
})

#### Call Evaluate Function to Evaluate the Generated Responses

In [41]:
result = evaluate(
    dataset, metrics=metrics, llm=llm, embeddings=embeddings
)

result

Evaluating:   0%|          | 0/15 [00:00<?, ?it/s]

{'faithfulness': 1.0000, 'answer_relevancy': 0.6572, 'context_recall': 1.0000, 'context_precision': 1.0000, 'harmfulness': 0.0000}

In [42]:
df = result.to_pandas()
df.head()

Unnamed: 0,question,answer,contexts,ground_truth,faithfulness,answer_relevancy,context_recall,context_precision,harmfulness
0,What did last year’s American Rescue Plan acco...,Last year's American Rescue Plan accomplished ...,"[breathing room. "","""",""And unlike the $2 Trill...",The American Rescue Plan provided economic rel...,1.0,0.86316,1.0,1.0,0
1,What did the president say about Intel ?,"The president mentioned that Intel, the Americ...","[won t look like much, but if you stop and loo...",The president said that Pat Gelsinger is ready...,1.0,0.384568,1.0,1.0,0
2,What did the president say about gun violence?,The president said that he will do everything ...,[violence interruption trusted messengers brea...,The president asked Congress to pass proven me...,1.0,0.72396,1.0,1.0,0


**Inference:** The evaluation shows high faithfulness (1.0) and perfect context recall and precision (1.0) across all QA pairs. Answer relevancy varies, with the American Rescue Plan question scoring highest (0.863160). All answers are free from harmful content, with the lowest relevancy score for the Intel question (0.384568).

# APPENDIX

### RAG Evaluation with amnesty_qa

The explodinggradients/amnesty_qa dataset is available on Hugging Face and is designed for evaluating question-answering (QA) systems, focusing on human rights and ethical considerations. It provides a set of question-answer pairs that are particularly useful for assessing QA models in contexts related to these themes.

In [317]:
# data
from datasets import load_dataset

amnesty_qa = load_dataset("explodinggradients/amnesty_qa", "english_v2",trust_remote_code=True)
#amnesty_qa

Repo card metadata block was not found. Setting CardData to empty.


In [196]:
print(amnesty_qa)

DatasetDict({
    eval: Dataset({
        features: ['question', 'ground_truth', 'answer', 'contexts'],
        num_rows: 20
    })
})


In [198]:
import os

#assuming you already have you key available via your environment variable. If not use this
os.environ["AZURE_OPENAI_API_KEY"] = "API KEY"

In [199]:
from langchain_openai import AzureChatOpenAI

llm = AzureChatOpenAI(
    azure_deployment="gpt35turbo16kdeployment",
    api_version="2024-05-01-preview",
    api_key="---------------------",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
    # other params...
)

In [200]:
from ragas.metrics import (
    context_precision,
    answer_relevancy,
    faithfulness,
    context_recall,
)
from ragas.metrics.critique import harmfulness

# list of metrics we're going to use
metrics = [
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
    harmfulness,
]

In [201]:
result = evaluate(
    amnesty_qa["eval"], metrics=metrics, llm=llm, embeddings=embeddings
)

result

Evaluating:   0%|          | 0/100 [00:00<?, ?it/s]

{'faithfulness': 0.5574, 'answer_relevancy': 0.9250, 'context_recall': 0.8851, 'context_precision': 0.9458, 'harmfulness': 0.0000}

In [192]:
df = result.to_pandas()
df.head()

Unnamed: 0,question,ground_truth,answer,contexts,faithfulness,answer_relevancy,context_recall,context_precision,harmfulness
0,What are the global implications of the USA Su...,The global implications of the USA Supreme Cou...,The global implications of the USA Supreme Cou...,"[- In 2022, the USA Supreme Court handed down ...",0.782609,0.985554,1.0,1.0,0
1,Which companies are the main contributors to G...,"According to the Carbon Majors database, the m...","According to the Carbon Majors database, the m...","[In recent years, there has been increasing pr...",0.375,0.871604,1.0,1.0,0
2,Which private companies in the Americas are th...,The largest private companies in the Americas ...,"According to the Carbon Majors database, the l...",[The issue of greenhouse gas emissions has bec...,0.0,0.990155,1.0,0.833333,0
3,What action did Amnesty International urge its...,Amnesty International urged its supporters to ...,Amnesty International urged its supporters to ...,"[In the case of the Ogoni 9, Amnesty Internati...",0.6,0.773574,1.0,1.0,0
4,What are the recommendations made by Amnesty I...,The recommendations made by Amnesty Internatio...,Amnesty International made several recommendat...,"[In recent years, Amnesty International has fo...",0.65,0.988373,1.0,1.0,0


In [202]:
df.to_csv("Eval_Results.csv")