<a href="https://colab.research.google.com/github/maneeha/property-graph/blob/main/Evaluate_RAG_on_Synthetic_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook explores the evaluation of RAG-based systems by generating a synthetic dataset and using the Ragas framework to compute evaluation metrics on it.


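This notebook can be run as-is once the following are configured on your end:
- A GCP account with access to Vertex AI
- A Pinecone account, with its credentials exposed through the `PINECONE_API_KEY` and `PINECONE_ENV` environment variables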

In [4]:
!pip install langchain
!pip install pinecone-client
!pip install wikipedia
!pip install datasets
!pip install ragas

! wget -O testset_generator.py https://raw.githubusercontent.com/ahmedbesbes/rag-evaluation-synthetic/main/testset_generator.py
! wget -O rag.py https://raw.githubusercontent.com/ahmedbesbes/rag-evaluation-synthetic/main/rag.py

--2024-06-16 11:42:15--  https://raw.githubusercontent.com/ahmedbesbes/rag-evaluation-synthetic/main/testset_generator.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5224 (5.1K) [text/plain]
Saving to: ‘testset_generator.py’


2024-06-16 11:42:15 (65.7 MB/s) - ‘testset_generator.py’ saved [5224/5224]

--2024-06-16 11:42:15--  https://raw.githubusercontent.com/ahmedbesbes/rag-evaluation-synthetic/main/rag.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2804 (2.7K) [text/plain]
Saving to: ‘rag.py’


2024-06-16 11:42:15 (36.4 

In [5]:
import logging
import click
from rich.logging import RichHandler

# Name of the logger used by this notebook's own messages.
LOGGER_NAME = "custom-rag"

# Send every record through Rich so tracebacks are pretty-printed; frames
# coming from `click` are suppressed in those tracebacks.
rich_handler = RichHandler(rich_tracebacks=True, tracebacks_suppress=[click])
logging.basicConfig(level=logging.INFO, format="%(message)s", handlers=[rich_handler])
logger = logging.getLogger(LOGGER_NAME)

# Only let errors through from these third-party loggers.
for noisy_logger_name in ("numexpr", "httpx"):
    logging.getLogger(noisy_logger_name).setLevel(logging.ERROR)

In [6]:
# NOTE(review): no cell visible in this notebook imports `dotenv` or calls
# `load_dotenv()`, so this install currently has no effect — confirm whether
# the Pinecone credentials were meant to be loaded from a .env file.
!pip install python-dotenv

Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.1


In [8]:
import os
import sys

import pandas as pd
import pinecone
import tqdm
from langchain.chat_models import ChatVertexAI
from langchain.document_loaders import WikipediaLoader
from langchain.embeddings import VertexAIEmbeddings
from langchain.llms import VertexAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Pinecone

from testset_generator import TestsetGenerator
from rag import RAG

# Lift pandas' column-width limit so long text cells are displayed in full
# instead of being truncated.
pd.set_option("display.max_colwidth", None)


### Load data from Wikipedia

In [None]:
# Search query sent to Wikipedia; `topic` is also used by a later cell to
# derive the Pinecone index name.
topic = "python programming"

# Load at most one article, keeping up to 100,000 characters of its content.
wikipedia_load = WikipediaLoader(
    query=topic,
    load_max_docs=1,
    doc_content_chars_max=100000,
)
docs = wikipedia_load.load()

# Fail with an explicit message instead of an opaque IndexError when the
# search returns no document (e.g. a misspelled topic).
if not docs:
    raise ValueError(f"No Wikipedia article found for topic {topic!r}")
doc = docs[0]

### Index data into Pinecone

In [None]:
# Derive the index name from the topic, replacing spaces with hyphens.
index_name = topic.replace(" ", "-")

# Surface missing credentials up front: `os.environ.get` silently returns
# None, which would otherwise only show up later as an obscure client error.
# This is a warning rather than an exception so that any other client-side
# configuration keeps working.
missing_settings = [
    name for name in ("PINECONE_API_KEY", "PINECONE_ENV") if not os.environ.get(name)
]
if missing_settings:
    logger.warning(
        "Environment variable(s) not set: %s", ", ".join(missing_settings)
    )

pinecone.init(
    api_key=os.environ.get("PINECONE_API_KEY"),
    environment=os.environ.get("PINECONE_ENV"),
)

# Start from a fresh index on every run.
# WARNING: this deletes any existing index that has the same name.
if index_name in pinecone.list_indexes():
    pinecone.delete_index(index_name)

# NOTE(review): 768 presumably matches the vector size produced by
# VertexAIEmbeddings — confirm against the embedding model in use.
pinecone.create_index(index_name, dimension=768)

index = pinecone.Index(index_name)

logger.info(f"Index {index_name} created successfully")
logger.info(index.describe_index_stats())

In [None]:
# Chunking parameters handed to the text splitter below.
CHUNK_SIZE, CHUNK_OVERLAP = 512, 128

# ". " is the only separator given, so the article is cut at sentence ends.
splitter = RecursiveCharacterTextSplitter(
    separators=[". "], chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP
)

# `splits` is reused later both for indexing and for test-set generation.
splits = splitter.split_documents([doc])

In [None]:
# Embed the chunks with Vertex AI and store them in the Pinecone index
# named `index_name`.
embedding_model = VertexAIEmbeddings()
docsearch = Pinecone.from_documents(splits, embedding_model, index_name=index_name)

### Create synthetic dataset

In [None]:
# LLM passed to the test-set generator in the next cell.
generator_llm = VertexAI(location="europe-west3", max_retries=20, max_output_tokens=256)

# Embeddings client passed to the test-set generator alongside the LLM.
embedding_model = VertexAIEmbeddings()

In [None]:
# Wire the generator to the chunked article, the LLM and embeddings created
# above, and the Pinecone index.
# NOTE(review): `key="text"` is presumably the metadata field holding the chunk
# text in the index — confirm against testset_generator.py.
testset_generator = TestsetGenerator(
    documents=splits,
    index_name=index_name,
    key="text",
    generator_llm=generator_llm,
    embedding_model=embedding_model,
)

In [None]:
# Build the synthetic evaluation set. The result is used as a DataFrame by the
# following cells (`.sample`, `.iterrows`, `.copy`).
# NOTE(review): the exact meaning of `test_size` and
# `num_questions_per_context` is defined in testset_generator.py.
synthetic_dataset = testset_generator.generate(test_size=10, num_questions_per_context=2)

In [None]:
# Print three randomly drawn question / ground-truth pairs for a sanity check.
for _, sample_row in synthetic_dataset.sample(3).iterrows():
    print(
        f"question: {sample_row['question']}",
        f"answer: {sample_row['ground_truths']}",
        "\n====\n",
        sep="\n",
    )

### Generate Answers with the RAG

In [None]:
# Vertex AI text model with deterministic decoding (temperature 0).
# NOTE(review): `llm` is not referenced by any later cell visible in this
# notebook (the RAG below is given the model name as a string) — confirm
# whether it is still needed.
llm = VertexAI(
    model_name="text-bison",
    temperature=0, top_p=0.95, top_k=40,
    max_output_tokens=256,
    verbose=True,
)

In [None]:
# Instantiate the RAG pipeline from rag.py on top of the Pinecone index.
# NOTE(review): the positional arguments are presumably (index name, model
# name, embeddings, metadata text key) — confirm against rag.py.
rag = RAG(index_name, "text-bison", embedding_model, "text")

In [None]:
# Run every synthetic question through the RAG, collecting the generated
# answer and the text of the retrieved source documents, in row order.
rag_answers, contexts = [], []

progress = tqdm.tqdm(synthetic_dataset.iterrows(), total=len(synthetic_dataset))
for _, sample_row in progress:
    result = rag.predict(sample_row["question"])
    rag_answers.append(result["answer"])
    contexts.append([source.page_content for source in result["source_documents"]])

In [None]:
# Copy of the synthetic dataset extended with the RAG's answers and retrieved
# contexts; `synthetic_dataset` itself is left untouched.
synthetic_dataset_rag = synthetic_dataset.assign(answer=rag_answers, contexts=contexts)

In [None]:
# Display one randomly chosen row to check the newly added columns.
synthetic_dataset_rag.sample(1)

### Evaluate synthetic dataset

In [None]:
from datasets import Dataset
from ragas import evaluate
from ragas.llms import LangchainLLM
from ragas.metrics import (
    answer_correctness,
    answer_relevancy,
    answer_similarity,
    context_precision,
    context_recall,
    context_relevancy,
    faithfulness,
)

# Convert the pandas DataFrame into a `datasets.Dataset`, the object that is
# later passed to `evaluate`.
synthetic_ds_rag = Dataset.from_pandas(synthetic_dataset_rag)

In [None]:
# LLM and embeddings that the Ragas metrics are pointed at in the next cell.
# Note that this rebinds `generator_llm`, which an earlier cell created with
# different settings for test-set generation.
generator_llm = VertexAI(max_retries=10, max_output_tokens=256)

# Ragas wrapper around the LangChain LLM.
ragas_vertexai_llm = LangchainLLM(llm=generator_llm)

vertexai_embeddings = VertexAIEmbeddings()

In [None]:
# Metrics to reconfigure. `answer_similarity` is included because
# `answer_correctness` is given a reference to it below.
metrics = [
    answer_relevancy,
    context_precision,
    faithfulness,
    answer_correctness,
    answer_similarity,
]

# Point every metric at the Vertex AI-backed LLM, and at the Vertex AI
# embeddings when the metric exposes an `embeddings` attribute. Plain
# attribute assignment replaces the direct `__setattr__` dunder calls.
for metric in metrics:
    metric.llm = ragas_vertexai_llm
    if hasattr(metric, "embeddings"):
        metric.embeddings = vertexai_embeddings

# Hand `answer_correctness` the metric instances configured above.
# NOTE(review): presumably it would otherwise use its own sub-metrics —
# confirm against the installed ragas version.
answer_correctness.faithfulness = faithfulness
answer_correctness.answer_similarity = answer_similarity

In [None]:
# Score the RAG outputs. `answer_similarity` is not passed as a standalone
# metric here; it is only attached to `answer_correctness` in an earlier cell.
results_rag = evaluate(
    synthetic_ds_rag,
    metrics=[answer_relevancy, context_precision, faithfulness, answer_correctness],
)

In [None]:
# Print the evaluation result object returned by `evaluate`.
print(results_rag)

In [None]:
# Show the evaluation results as a pandas DataFrame.
results_rag.to_pandas()