[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/mongodb-developer/GenAI-Showcase/blob/main/notebooks/evals/ragas-evaluation.ipynb)

# RAG Series Part 2: How to evaluate your RAG application

This notebook shows how to evaluate a RAG application using the [RAGAS](https://docs.ragas.io/en/stable/index.html) framework.

## Step 1: Install required libraries

In [1]:
! pip install -qU ragas datasets pandas langchain langchain-mongodb pymongo tqdm langchain-anthropic

## Step 2: Setup pre-requisites

* Set the MongoDB connection string. Follow the steps [here](https://www.mongodb.com/docs/manual/reference/connection-string/) to get the connection string from the Atlas UI.

* Set the OpenAI API key. Steps to obtain an API key are [here](https://help.openai.com/en/articles/4936850-where-do-i-find-my-openai-api-key).

In [2]:
import os
import getpass
from openai import OpenAI

In [3]:
# Prompt for the OpenAI key interactively and expose it through the environment
# instead of hardcoding it in the notebook.
os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API Key:")
# Client used by `get_embeddings` to request embeddings.
openai_client = OpenAI()

Enter your OpenAI API Key:········


In [4]:
# Connection string prompted for interactively; used for both the MongoClient
# and the LangChain vector store.
MONGODB_URI = getpass.getpass("Enter your MongoDB connection string:")

Enter your MongoDB connection string:········


In [21]:
# NOTE(review): none of the visible cells call an Anthropic model
# (`get_rag_chain` always builds a ChatOpenAI model), so this key may only be
# needed if the notebook is extended to Claude models — confirm.
os.environ["ANTHROPIC_API_KEY"] = getpass.getpass("Enter your Anthropic API Key:")

Enter your Anthropic API Key:········


## Step 3: Download the Hugging Face dataset

In [5]:
from datasets import load_dataset
import pandas as pd

In [None]:
# Optional lightweight variant: stream the dataset and keep only the first 50
# rows. The following cell assigns `data` and `df` again from the full train
# split, so run only one of the two loading cells.
data = load_dataset("explodinggradients/ragas-wikiqa", split="train", streaming=True)
data_head = data.take(50)
df = pd.DataFrame(data_head)

In [6]:
# Load the complete train split and convert it to a DataFrame; `df` supplies
# the contexts, questions and reference answers used in the rest of the notebook.
data = load_dataset("explodinggradients/ragas-wikiqa", split="train")
df = pd.DataFrame(data)

In [7]:
# Preview a single row to see the available columns.
df.head(1)

Unnamed: 0,question,correct_answer,incorrect_answer,question_id,generated_with_rag,context,generated_without_rag
0,HOW AFRICAN AMERICANS WERE IMMIGRATED TO THE US,"As such, African immigrants are to be distingu...",From the Immigration and Nationality Act of 19...,Q0,\nAfrican Americans were immigrated to the Uni...,[African immigration to the United States refe...,African Americans were immigrated to the US in...


In [8]:
# Number of rows (one question per row) in the loaded dataset.
len(df)

232

## Step 4: Chunk up the documents

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
# Shared splitter used by `split_texts`.
# NOTE(review): chunk_size / chunk_overlap are presumably counted in
# cl100k_base tokens because the splitter is built via from_tiktoken_encoder —
# confirm against the LangChain docs.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    encoding_name="cl100k_base",
    chunk_size=200,
    chunk_overlap=30,
    keep_separator=False,
)

In [None]:
def split_texts(texts):
    """
    Split each text into chunks and return all chunks as one flat list.

    Args:
        texts: Iterable of strings; each one is chunked independently with
            the module-level `text_splitter`.

    Returns:
        list: `page_content` of every produced chunk, in input order.
    """
    return [
        piece.page_content
        for text in texts
        for piece in text_splitter.create_documents([text])
    ]

In [None]:
# Chunk the context passages of every row; each cell of "chunks" holds that
# row's flat list of chunk strings.
df["chunks"] = df["context"].apply(split_texts)

In [None]:
# Flatten the per-row chunk lists into a single list of chunk strings.
all_chunks = df["chunks"].tolist()
docs = [text for row_chunks in all_chunks for text in row_chunks]

In [None]:
# Total number of chunks that will be embedded.
len(docs)

In [None]:
# Spot-check a single chunk.
docs[100]

## Step 5: Create embeddings and ingest them into MongoDB

In [9]:
from typing import List
from pymongo import MongoClient
from tqdm.auto import tqdm

In [None]:
def get_embeddings(
    docs: List[str], model: str = "text-embedding-3-large"
) -> List[List[float]]:
    """
    Generate embeddings for a batch of texts using the OpenAI API.

    Args:
        docs (List[str]): List of texts to embed
        model (str, optional): Model name. Defaults to "text-embedding-3-large".

    Returns:
        List[List[float]]: One embedding per item of the API response data;
            an empty list when `docs` is empty.
    """
    # Nothing to embed, so avoid a pointless (and likely rejected) API request.
    if not docs:
        return []
    # replace newlines, which can negatively affect performance.
    cleaned_docs = [doc.replace("\n", " ") for doc in docs]
    response = openai_client.embeddings.create(input=cleaned_docs, model=model)
    return [item.embedding for item in response.data]

In [10]:
# Connect to the cluster and select the database that holds one collection per
# embedding model.
client = MongoClient(MONGODB_URI)
DB_NAME = "ragas_evals"
db = client[DB_NAME]

In [11]:
# Number of chunks sent to the embeddings API per request.
batch_size = 128

In [12]:
# Embedding models to compare; each name is also used as the MongoDB collection
# name for that model's vectors.
EVAL_EMBEDDING_MODELS = ["text-embedding-ada-002", "text-embedding-3-small"]

In [None]:
# For each embedding model: embed all chunks in batches, then replace the
# contents of the model's collection with the freshly embedded documents.
for model in EVAL_EMBEDDING_MODELS:
    embedded_docs = []
    print(f"Getting embeddings for the {model} model")
    for start in tqdm(range(0, len(docs), batch_size)):
        # Slicing clamps at the end of the list, so the last batch may be shorter.
        batch = docs[start : start + batch_size]
        # Generate embeddings for current batch
        batch_embeddings = get_embeddings(batch, model)
        # Fail loudly instead of silently mis-pairing texts and vectors.
        if len(batch_embeddings) != len(batch):
            raise ValueError(
                f"Expected {len(batch)} embeddings, got {len(batch_embeddings)}"
            )
        embedded_docs.extend(
            {"text": text, "embedding": embedding}
            for text, embedding in zip(batch, batch_embeddings)
        )
    print(f"Finished getting embeddings for the {model} model")

    print(f"Inserting embeddings for the {model} model")
    collection = db[model]
    # Start from an empty collection so re-running the cell does not duplicate documents.
    collection.delete_many({})
    # insert_many rejects an empty document list, so skip it when there is nothing to insert.
    if embedded_docs:
        collection.insert_many(embedded_docs)
    print(f"Finished inserting embeddings for the {model} model")

## Step 6: Compare Embeddings for the Retriever

In [13]:
from langchain_openai import OpenAIEmbeddings
from langchain_mongodb import MongoDBAtlasVectorSearch
from datasets import Dataset
from ragas import evaluate, RunConfig
from ragas.metrics import context_precision, context_recall
import nest_asyncio

# NOTE(review): presumably required so that ragas' `evaluate` can run its
# async work inside the notebook's already-running event loop — confirm.
nest_asyncio.apply()

In [14]:
def get_retriever(model, k):
    """
    Build a similarity-search retriever over the collection for `model`.

    Args:
        model: OpenAI embedding model name; also the collection name inside
            the `DB_NAME` database.
        k: Value passed as the "k" search kwarg of the retriever.

    Returns:
        The retriever created by `vector_store.as_retriever(...)`.
    """
    # NOTE(review): no visible cell creates the "vector_index" search index;
    # presumably it is created on each collection outside this notebook — confirm.
    vector_store = MongoDBAtlasVectorSearch.from_connection_string(
        connection_string=MONGODB_URI,
        namespace=f"{DB_NAME}.{model}",
        embedding=OpenAIEmbeddings(model=model),
        index_name="vector_index",
        text_key="text",
    )
    return vector_store.as_retriever(
        search_type="similarity", search_kwargs={"k": k}
    )

In [15]:
# Evaluation inputs: the dataset's questions and their reference answers,
# aligned by row position.
QUESTIONS = df["question"].tolist()
GROUND_TRUTH = df["correct_answer"].tolist()

In [16]:
# Score retrieval quality (context precision / recall) for each embedding model.
for model in EVAL_EMBEDDING_MODELS:
    retriever = get_retriever(model, 2)
    data = {
        "question": QUESTIONS,
        "ground_truth": GROUND_TRUTH,
        # Retrievers are Runnables: `invoke` replaces the deprecated
        # `get_relevant_documents` call.
        "contexts": [
            [doc.page_content for doc in retriever.invoke(question)]
            for question in tqdm(QUESTIONS)
        ],
    }

    dataset = Dataset.from_dict(data)
    run_config = RunConfig(max_workers=4, max_wait=180)
    result = evaluate(
        dataset=dataset,
        metrics=[context_precision, context_recall],
        run_config=run_config,
        raise_exceptions=False,
    )
    print(f"Result for the {model} model: {result}")

  0%|          | 0/232 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/464 [00:00<?, ?it/s]

Failed to parse output. Returning None.


Result for the text-embedding-ada-002 model: {'context_precision': 0.9267, 'context_recall': 0.8423}


  0%|          | 0/232 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/464 [00:00<?, ?it/s]

Result for the text-embedding-3-small model: {'context_precision': 0.9116, 'context_recall': 0.8806}


## Step 7: Compare Completion Models for the Generator

In [18]:
from langchain_openai import ChatOpenAI
from langchain_anthropic import ChatAnthropic
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from ragas.metrics import faithfulness, answer_relevancy

In [46]:
def get_rag_chain(retriever, model):
    """
    Assemble a naive RAG chain: retrieve context, fill the prompt, call the
    chat model and parse the completion into a string.

    Args:
        retriever: Runnable retriever whose documents expose `page_content`.
        model: Model name passed to `ChatOpenAI`.

    Returns:
        The composed chain; it is invoked with the user question.
    """

    def join_page_contents(docs):
        # Merge the retrieved documents into a single context string.
        return "\n\n".join([d.page_content for d in docs])

    template = """Answer the question based only on the following context: \
    {context}

    Question: {question}
    """
    prompt = ChatPromptTemplate.from_template(template)
    # Temperature 0 is set on the chat completion model.
    llm = ChatOpenAI(temperature=0, model=model)

    # Build the prompt inputs: context from the retriever, question passed through.
    retrieve = {
        "context": retriever | join_page_contents,
        "question": RunnablePassthrough(),
    }
    return retrieve | prompt | llm | StrOutputParser()

In [None]:
# The retriever does not depend on the completion model, so build it once.
retriever = get_retriever("text-embedding-3-small", 2)

# Score answer quality (faithfulness / answer relevancy) for each completion model.
for model in ["gpt-3.5-turbo-1106", "gpt-3.5-turbo"]:
    data = {
        "question": QUESTIONS,
        "ground_truth": GROUND_TRUTH,
        "contexts": [],
        "answer": [],
    }
    rag_chain = get_rag_chain(retriever, model)
    for question in tqdm(QUESTIONS):
        data["answer"].append(rag_chain.invoke(question))
        # The chain does not return its retrieved documents, so the contexts
        # are fetched with a second retriever call. `invoke` replaces the
        # deprecated `get_relevant_documents`.
        data["contexts"].append(
            [doc.page_content for doc in retriever.invoke(question)]
        )

    dataset = Dataset.from_dict(data)
    run_config = RunConfig(max_workers=4, max_wait=180)
    result = evaluate(
        dataset=dataset,
        metrics=[faithfulness, answer_relevancy],
        run_config=run_config,
        raise_exceptions=False,
    )
    print(f"Result for the {model} model: {result}")

  0%|          | 0/232 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/464 [00:00<?, ?it/s]

No statements were generated from the answer.
No statements were generated from the answer.


Result for the gpt-3.5-turbo-1106 model: {'faithfulness': 0.9578, 'answer_relevancy': 0.9162}


  0%|          | 0/232 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/464 [00:00<?, ?it/s]

No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.


Result for the gpt-3.5-turbo model: {'faithfulness': 0.9721, 'answer_relevancy': 0.9131}


In [37]:
# `result` is whatever the last iteration of the preceding comparison loop left
# behind, i.e. the scores for "gpt-3.5-turbo" only.
result_df = result.to_pandas()

In [38]:
# Per-question scores for the first few rows.
result_df.head(5)

Unnamed: 0,question,ground_truth,contexts,answer,faithfulness,answer_relevancy
0,HOW AFRICAN AMERICANS WERE IMMIGRATED TO THE US,"As such, African immigrants are to be distingu...",[African immigration to the United States refe...,African Americans were involuntarily brought f...,1.0,0.902984
1,what are points on a mortgage,"Points, sometimes also called a ""discount poin...","[Discount points, also called mortgage points ...",Points on a mortgage are a form of pre-paid in...,1.0,0.976646
2,how does interlibrary loan work,The user makes a request with their local libr...,"[After receiving a request from their patron, ...",Interlibrary loan works by allowing patrons of...,1.0,0.975809
3,WHAT IS A FY QUARTER,"A fiscal year (or financial year, or sometimes...",[1st quarter: 1 October 2022 – 31 December 202...,A FY quarter refers to a quarter within a fina...,1.0,0.943946
4,who wrote a rose is a rose is a rose,"The sentence ""Rose is a rose is a rose is a ro...","[The sentence ""Rose is a rose is a rose is a r...",Gertrude Stein,1.0,0.929579


In [39]:
# Surface the questions whose answer relevancy score is below 0.5.
result_df[result_df["answer_relevancy"] < 0.5]

Unnamed: 0,question,ground_truth,contexts,answer,faithfulness,answer_relevancy
91,when did dr.carter g woodson die,"Carter Godwin Woodson (December 19, 1875April ...","[Carter G. Woodson was born in New Canton, Vir...","Based on the context provided, there is no inf...",,0.0


In [40]:
# Inspect the retrieved contexts for the low-relevancy row found above.
# `iloc` is positional; this assumes the default index, where position 91 is
# also label 91 — confirm if the frame is ever re-indexed.
result_df.iloc[91]["contexts"]

array(["Carter G. Woodson was born in New Canton, Virginia, on December 19, 1875, the son of former slaves Anne Eliza (Riddle) and James Henry Woodson. Although his father was illiterate, Carter's mother, Anna, had been taught to read by her mistress. His father, James, during the Civil War, had helped Union soldiers near Richmond, after escaping from his owner, by leading them to Confederate supply stations and warehouses to raid army supplies. Thereafter, and until the war ended, James had",
       'His Washington, D.C. home has been preserved and designated the Carter G. Woodson Home National Historic Site.\nIn 2002, scholar Molefi Kete Asante named Carter G. Woodson on his list of 100 Greatest African Americans.\nIn 2015, a bronze statue of Woodson was placed in the park named for him in Washington, D.C.\nOn February 1, 2018, he was honored with a Google Doodle.'],
      dtype=object)

## Step 8: Measure overall performance of the RAG system

In [44]:
from ragas.metrics import answer_similarity, answer_correctness

In [47]:
# End-to-end evaluation: generate an answer for every question with the chosen
# retriever / completion model pair and compare it to the reference answer.
retriever = get_retriever("text-embedding-3-small", 2)
rag_chain = get_rag_chain(retriever, "gpt-3.5-turbo")

answers = [rag_chain.invoke(question) for question in tqdm(QUESTIONS)]
data = {"question": QUESTIONS, "ground_truth": GROUND_TRUTH, "answer": answers}

dataset = Dataset.from_dict(data)
run_config = RunConfig(max_workers=4, max_wait=180)
result = evaluate(
    dataset=dataset,
    metrics=[answer_similarity, answer_correctness],
    run_config=run_config,
    raise_exceptions=False,
)
print(f"Overall metrics: {result}")

  0%|          | 0/232 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/464 [00:00<?, ?it/s]

Overall metrics: {'answer_similarity': 0.8889, 'answer_correctness': 0.5981}
