# RAG Ecosystem


## Table of Contents

- [Understanding Basic RAG System](#part1)
  - [Indexing Phase](#part1-1)
  - [Retrieval](#part1-2)
  - [Generation](#part1-3)
- [Advanced Query Transformations](#part2)
  - [Multi-Query Generation](#part2-1)
  - [RAG-Fusion](#part2-2)
  - [Decomposition](#part2-3)
  - [Step-Back Prompting](#part2-4)
  - [HyDE](#part2-5)
- [Routing & Query Construction](#part3)
  - [Logical Routing](#part3-1)
  - [Semantic Routing](#part3-2)
  - [Query Structuring](#part3-3)
- [Advanced Indexing Strategies](#part4)
  - [Multi-Representation Indexing](#part4-1)
  - [Hierarchical Indexing (RAPTOR) Knowledge Tree](#part4-2)
  - [Token-Level Precision (ColBERT)](#part4-3)
- [Advanced Retrieval & Generation](#part5)
  - [Dedicated Re-ranking](#part5-1)
  - [Self-Correction using AI Agents](#part5-2)
  - [Impact of Long Context](#part5-3)
- [Manual RAG Evaluation](#part6)
  - [The Core Metrics: What Should We Measure?](#part6-1)
  - [Building Evaluators from Scratch with LangChain](#part6-2)
- [Evaluation with Frameworks](#part7)
  - [Rapid Evaluation with deepeval](#part7-1)
  - [Another Powerful Alternative with grouse](#part7-2)
  - [Evaluation with RAGAS](#part7-3)

<a id='part1'></a>
# Basic RAG System



## Requirements

In [None]:
import os
import bs4
import uuid
import datetime
import requests

from typing import Literal, Optional
from operator import itemgetter

# torch import
import torch

# LangChain imports
from langchain import hub
from langchain.load import dumps, loads
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain.utils.math import cosine_similarity
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.storage import InMemoryByteStore
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import CohereRerank
from langchain.embeddings import FastEmbedEmbeddings
from langchain.schema import Document

# LangChain Community imports
from langchain_community.document_loaders import WebBaseLoader, YoutubeLoader
from langchain_chroma import Chroma

# LangChain Core imports
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate, FewShotChatMessagePromptTemplate
from pydantic import BaseModel, Field

# LangChain OpenAI imports
from langchain_openai import OpenAIEmbeddings, ChatOpenAI

# Evaluation framework imports
from deepeval.metrics import GEval, FaithfulnessMetric, ContextualRelevancyMetric
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
from deepeval.metrics import GEval

from grouse import EvaluationSample, GroundedQAEvaluator

from datasets import Dataset
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    answer_correctness,
)

# FastEmbed import
from fastembed import TextEmbedding

In [None]:
# Set the LangSmith endpoint used by LangChain tracing
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'

# The LangSmith, OpenAI and Cohere API keys are read from the process environment.
# Writing `os.getenv(...)` back into `os.environ` adds nothing when the key is set
# and raises an opaque TypeError when it is not (environment values must be str),
# so only check for presence and name the missing keys in a warning.
_missing_api_keys = [
    key
    for key in ("LANGCHAIN_API_KEY", "OPENAI_API_KEY", "COHERE_API_KEY")
    if os.getenv(key) is None
]
if _missing_api_keys:
    import warnings

    warnings.warn(
        "Missing API key environment variable(s): " + ", ".join(_missing_api_keys),
        stacklevel=2,
    )

# Set User Agent
os.environ["USER_AGENT"] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"

# Run torch on the CPU
device = torch.device("cpu")

<a id='part1-1'></a>
## Indexing Phase



In [None]:
# Initialize a web document loader with specific parsing instructions
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),  # URL of the blog post to load
    # bs_kwargs are handed to BeautifulSoup when the page is parsed
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")  # Only parse specified HTML classes
        )
    ),
)

# Load the filtered content from the web page into documents
# (`docs` is consumed by the splitting cell that follows)
docs = loader.load()

In [None]:
# Create a text splitter to divide text into chunks of 1000 characters with 200-character overlap
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

# Split the loaded documents into smaller chunks
# (`splits` is the list that gets embedded and indexed next)
splits = text_splitter.split_documents(docs)

In [None]:
# Embed the text chunks and store them in a Chroma vector store for similarity search
# NOTE(review): no collection_name is passed here or in the later
# Chroma.from_documents calls — confirm they do not end up sharing one default
# collection (and therefore each other's chunks) within a single session.
vectorstore = Chroma.from_documents(
    documents=splits, 
    embedding=OpenAIEmbeddings()  # Use OpenAI's embedding model to convert text into vectors
)

<a id='part1-2'></a>
## Retrieval


In [None]:
# Create a retriever from the vector store
# (default search settings; no search_kwargs are supplied here)
retriever = vectorstore.as_retriever()

In [None]:
# Retrieve relevant documents for a query
# (this rebinds `docs`: it now holds the retrieved chunks, not the loaded pages)
docs = retriever.invoke("What is Task Decomposition?")

# Print the content of the first retrieved document
print(docs[0].page_content)

<a id='part1-3'></a>
## Generation


In [None]:
# Pull a pre-made RAG prompt from LangChain Hub
# (the RAG chain below fills it with "context" and "question")
prompt = hub.pull("rlm/rag-prompt")

# Print the prompt to inspect its template
print(prompt)

In [None]:
# Initialize the LLM
# temperature=0 keeps sampling as deterministic as the API allows
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

In [None]:
# Helper function to format retrieved documents
def format_docs(docs):
    """Join the ``page_content`` of each document into one context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The page contents separated by blank lines ("" for no documents).
    """
    page_texts = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(page_texts)

# Define the full RAG chain
# The input question fans out to two keys: "context" (retrieve, then format the
# documents into one string) and "question" (passed through unchanged).
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [None]:
# Ask a question using the RAG chain
# (the chain takes the raw question string as its input)
response = rag_chain.invoke("What is Task Decomposition?")
print(response)

<a id='part2'></a>
# Advanced Query Transformations


In [None]:
# Load the blog post
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)
blog_docs = loader.load()

# Split the documents into chunks
# (sizes are given to the tiktoken-based constructor, unlike the character-based
# splitter used in the basic RAG section)
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=300, 
    chunk_overlap=50
)
splits = text_splitter.split_documents(blog_docs)

# Index the chunks in a Chroma vector store
# NOTE(review): no collection_name is passed — confirm this index does not share
# a default collection with the one built in the basic RAG section.
vectorstore = Chroma.from_documents(documents=splits, 
                                    embedding=OpenAIEmbeddings())

# Create our retriever
# (rebinds the notebook-level `retriever` used by the query-transformation cells)
retriever = vectorstore.as_retriever()

<a id='part2-1'></a>
## Multi-Query Generation


In [None]:
# Prompt for generating multiple queries
template = """You are an AI language model assistant. Your task is to generate five 
different versions of the given user question to retrieve relevant documents from a vector 
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search. 
Provide these alternative questions separated by newlines. Original question: {question}"""
prompt_perspectives = ChatPromptTemplate.from_template(template)

# Chain to generate the queries: prompt -> LLM -> plain string -> list of queries.
# The model's output can contain blank lines between the questions; those are
# dropped so that no empty query is later sent to the retriever.
generate_queries = (
    prompt_perspectives 
    | ChatOpenAI(temperature=0) 
    | StrOutputParser() 
    | (lambda x: [line.strip() for line in x.split("\n") if line.strip()])
)

In [None]:
# The user question that the alternative phrasings are generated from
question = "What is task decomposition for LLM agents?"
generated_queries_list = generate_queries.invoke({"question": question})

# Print the generated queries
# (numbered from 1 for display only)
for i, q in enumerate(generated_queries_list):
    print(f"{i+1}. {q}")

In [None]:
def get_unique_union(documents: list[list]):
    """Return the de-duplicated union of several retrieval result lists.

    Args:
        documents: One list of retrieved documents per query.

    Returns:
        Each distinct document exactly once, in first-seen order.
    """
    # Serialize every document so it can serve as a hashable de-duplication key
    serialized_docs = (dumps(doc) for sublist in documents for doc in sublist)
    # dict.fromkeys drops duplicates while keeping insertion order; a plain set()
    # would make the order of the returned documents vary between runs
    unique_docs = dict.fromkeys(serialized_docs)
    return [loads(doc) for doc in unique_docs]

# Build the retrieval chain
# generate_queries yields a list of queries, retriever.map() runs the retriever
# on each of them, and get_unique_union merges the per-query result lists.
retrieval_chain = generate_queries | retriever.map() | get_unique_union

# Invoke the chain and check the number of documents retrieved
docs = retrieval_chain.invoke({"question": question})
print(f"Total unique documents retrieved: {len(docs)}")

In [None]:
# The final RAG chain
template = """Answer the following question based on this context:

{context}

Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)
# Rebinds the notebook-level `llm`; no model name is given, so the library default is used
llm = ChatOpenAI(temperature=0)

# "context" receives the output of the multi-query retrieval chain as-is, while
# "question" is read from the input dict.
# NOTE(review): the documents are not passed through format_docs here, so the
# prompt presumably receives their string representation — confirm that is intended.
final_rag_chain = (
    {"context": retrieval_chain, "question": itemgetter("question")} 
    | prompt
    | llm
    | StrOutputParser()
)

print(final_rag_chain.invoke({"question": question}))

<a id='part2-2'></a>
## RAG-Fusion



In [None]:
def reciprocal_rank_fusion(results: list[list], k=60):
    """Fuse several ranked document lists into one list ordered by RRF score.

    Every appearance of a document at zero-based position ``rank`` in one of
    the input lists contributes ``1 / (rank + k)`` to that document's score.

    Args:
        results: One ranked list of documents per query, best match first.
        k: Smoothing constant added to the rank in the denominator.

    Returns:
        A list of ``(document, score)`` tuples, highest score first.
    """
    score_by_doc = {}

    for ranking in results:
        for position, candidate in enumerate(ranking):
            # Documents are keyed by their serialized form so duplicates accumulate
            key = dumps(candidate)
            score_by_doc[key] = score_by_doc.get(key, 0) + 1 / (position + k)

    # sorted() is stable, so equally scored documents keep their first-seen order
    ordered = sorted(score_by_doc.items(), key=lambda item: item[1], reverse=True)
    return [(loads(key), total) for key, total in ordered]

In [None]:
# Use a slightly different prompt for RAG-Fusion
template = """You are a helpful assistant that generates multiple search queries based on a single input query. \n
Generate multiple search queries related to: {question} \n
Output (4 queries):"""
prompt_rag_fusion = ChatPromptTemplate.from_template(template)

# Prompt -> LLM -> plain string -> list of queries.
# Blank lines in the model output are dropped so that no empty query reaches the
# retriever (an empty query would also add a meaningless ranked list to the fusion).
generate_queries = (
    prompt_rag_fusion 
    | ChatOpenAI(temperature=0)
    | StrOutputParser() 
    | (lambda x: [line.strip() for line in x.split("\n") if line.strip()])
)

# Build the new retrieval chain with RRF: one ranked list per generated query,
# fused into a single list of (document, score) tuples
retrieval_chain_rag_fusion = generate_queries | retriever.map() | reciprocal_rank_fusion
docs = retrieval_chain_rag_fusion.invoke({"question": question})

print(f"Total re-ranked documents retrieved: {len(docs)}")

<a id='part2-3'></a>
## Decomposition


In [None]:
# Decomposition prompt
template = """You are a helpful assistant that generates multiple sub-questions related to an input question. \n
The goal is to break down the input into a set of sub-problems / sub-questions that can be answers in isolation. \n
Generate multiple search queries related to: {question} \n
Output (3 queries):"""
prompt_decomposition = ChatPromptTemplate.from_template(template)

# Chain to generate sub-questions: prompt -> LLM -> plain string -> list.
# Blank lines in the model output are dropped; otherwise each one would become an
# empty "sub-question" that is retrieved for and answered further down.
generate_queries_decomposition = (
    prompt_decomposition 
    | ChatOpenAI(temperature=0) 
    | StrOutputParser() 
    | (lambda x: [line.strip() for line in x.split("\n") if line.strip()])
)

# Generate and print the sub-questions
question = "What are the main components of an LLM-powered autonomous agent system?"
sub_questions = generate_queries_decomposition.invoke({"question": question})
print(sub_questions)

In [None]:
# RAG prompt
prompt_rag = hub.pull("rlm/rag-prompt")

# A list to hold the answers to our sub-questions
# (filled in the same order as `sub_questions`, which format_qa_pairs relies on)
rag_results = []
for sub_question in sub_questions:
    # Retrieve documents for each sub-question
    retrieved_docs = retriever.invoke(sub_question)
    
    # Use our standard RAG chain to answer the sub-question
    # (the retrieved documents are passed as "context" without further formatting)
    answer = (prompt_rag | llm | StrOutputParser()).invoke({"context": retrieved_docs, "question": sub_question})
    rag_results.append(answer)

def format_qa_pairs(questions, answers):
    """Render parallel question/answer sequences as one numbered text block.

    Args:
        questions: Sequence of question strings.
        answers: Sequence of answer strings, aligned with ``questions``.

    Returns:
        "Question i: ... / Answer i: ..." entries (numbered from 1) separated
        by blank lines, with surrounding whitespace stripped.
    """
    entries = [
        f"Question {i}: {question}\nAnswer {i}: {answer}"
        for i, (question, answer) in enumerate(zip(questions, answers), start=1)
    ]
    return "\n\n".join(entries).strip()

# Format the Q&A pairs into a single context string
context = format_qa_pairs(sub_questions, rag_results)

# Final synthesis prompt
template = """Here is a set of Q+A pairs:

{context}

Use these to synthesize an answer to the original question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

# Rebinds `final_rag_chain`: from here on it expects "context" and "question"
# in its input dict and does no retrieval of its own
final_rag_chain = (
    prompt
    | llm
    | StrOutputParser()
)

print(final_rag_chain.invoke({"context": context, "question": question}))

<a id='part2-4'></a>
## Step-Back Prompting


In [None]:
# Few-shot examples to teach the model how to generate step-back (more generic) questions
# Each example pairs a specific question ("input") with its broader rewrite ("output")
examples = [
    {
        "input": "Could the members of The Police perform lawful arrests?",
        "output": "what can the members of The Police do?",
    },
    {
        "input": "Jan Sindel's was born in what country?",
        "output": "what is Jan Sindel's personal history?",
    },
]

# Define how each example is formatted in the prompt
example_prompt = ChatPromptTemplate.from_messages([
    ("human", "{input}"),  # User input
    ("ai", "{output}")     # Model's response
])

# Wrap the few-shot examples into a reusable prompt template
few_shot_prompt = FewShotChatMessagePromptTemplate(
    example_prompt=example_prompt,
    examples=examples,
)

# Full prompt includes system instruction, few-shot examples, and the user question
# (rebinds the notebook-level `prompt`)
prompt = ChatPromptTemplate.from_messages([
    ("system", 
     "You are an expert at world knowledge. Your task is to step back and paraphrase a question "
     "to a more generic step-back question, which is easier to answer. Here are a few examples:"),
    few_shot_prompt,
    ("user", "{question}"),
])

In [None]:
# Define a chain to generate step-back questions using the prompt and an OpenAI model
# (the chain returns the generated question as a plain string)
generate_queries_step_back = prompt | ChatOpenAI(temperature=0) | StrOutputParser()

# Run the chain on a specific question
question = "What is task decomposition for LLM agents?"
step_back_question = generate_queries_step_back.invoke({"question": question})

# Output the original and generated step-back question
print(f"Original Question: {question}")
print(f"Step-Back Question: {step_back_question}")

In [None]:
# Prompt for the final response
# It takes two retrieved contexts: one for the original question and one for the
# step-back question
response_prompt_template = """You are an expert of world knowledge. I am going to ask you a question. Your response should be comprehensive and not contradicted with the following context if they are relevant. Otherwise, ignore them if they are not relevant.

# Normal Context
{normal_context}

# Step-Back Context
{step_back_context}

# Original Question: {question}
# Answer:"""
response_prompt = ChatPromptTemplate.from_template(response_prompt_template)

# The full chain
# The input dict fans out to three keys before the prompt is filled in
chain = (
    {
        # Retrieve context using the normal question
        "normal_context": RunnableLambda(lambda x: x["question"]) | retriever,
        # Retrieve context using the step-back question
        "step_back_context": generate_queries_step_back | retriever,
        # Pass on the original question
        "question": lambda x: x["question"],
    }
    | response_prompt
    | ChatOpenAI(temperature=0)
    | StrOutputParser()
)

response = chain.invoke({"question": question})

In [None]:
# Show the answer produced by the step-back chain in the previous cell
print(response)

<a id='part2-5'></a>
## HyDE


In [None]:
# HyDE prompt
# Asks the model to write a passage answering the question; that passage, not
# the question itself, is what gets used for retrieval in the next cell
template = """Please write a scientific paper passage to answer the question
Question: {question}
Passage:"""
prompt_hyde = ChatPromptTemplate.from_template(template)

# Chain to generate the hypothetical document
generate_docs_for_retrieval = (
    prompt_hyde 
    | ChatOpenAI(temperature=0) 
    | StrOutputParser() 
)

# Generate and print the hypothetical document
hypothetical_document = generate_docs_for_retrieval.invoke({"question": question})
print(hypothetical_document)

In [None]:
# Retrieve documents using the HyDE approach: the generated hypothetical passage
# (not the raw question) is what gets sent to the retriever
retrieval_chain = generate_docs_for_retrieval | retriever 
retrieved_docs = retrieval_chain.invoke({"question": question})

# Build the answer chain for this section explicitly. `final_rag_chain` was last
# bound in the Decomposition section to a prompt that introduces its context as
# "a set of Q+A pairs", which does not describe retrieved passages, so reusing it
# here would frame the context incorrectly for the model.
hyde_answer_template = """Answer the following question based on this context:

{context}

Question: {question}
"""
hyde_answer_prompt = ChatPromptTemplate.from_template(hyde_answer_template)
final_rag_chain = hyde_answer_prompt | llm | StrOutputParser()

# Generate the final answer from the retrieved context
response = final_rag_chain.invoke({"context": retrieved_docs, "question": question})
print(response)

<a id='part3'></a>
# Routing & Query Construction


<a id='part3-1'></a>
## Logical Routing


In [None]:
# Define the data model for our router's output
# (documented with `#` comments only: the docstring and Field description below
# are handed to the LLM via with_structured_output, so they are left untouched)
class RouteQuery(BaseModel):
    """A data model to route a user query to the most relevant datasource."""

    # The 'datasource' field must be one of the three specified literal strings.
    # This enforces a strict set of choices for the LLM.
    datasource: Literal["python_docs", "js_docs", "golang_docs"] = Field(
        ...,  # The '...' indicates that this field is required.
        description="Given a user question, choose which datasource would be most relevant for answering their question.",
    )

In [None]:
# Initialize our LLM
# (rebinds the notebook-level `llm`; later structured-output cells reuse it)
llm = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0)

# Create a new LLM instance that is "structured" to output our Pydantic model
structured_llm = llm.with_structured_output(RouteQuery)

# The system prompt provides the core instruction for the LLM's task.
system = """You are an expert at routing a user question to the appropriate data source.

Based on the programming language the question is referring to, route it to the relevant data source."""

# The full prompt template combines the system message and the user's question.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", "{question}"),
    ]
)

# Define the complete router chain
# (its output is a RouteQuery instance rather than free text)
router = prompt | structured_llm

In [None]:
# A sample question that embeds a Python snippet, used to exercise the router
question = """Why doesn't the following code work:

from langchain_core.prompts import ChatPromptTemplate

prompt = ChatPromptTemplate.from_messages(["human", "speak in {language}"])
prompt.invoke("french")
"""

# Invoke the router and check the result
result = router.invoke({"question": question})

print(result)

In [None]:
def choose_route(result):
    """Map the router's structured output to the name of the downstream chain.

    Args:
        result: Object exposing a ``datasource`` string.

    Returns:
        The placeholder chain label for Python or JavaScript docs when the
        lower-cased datasource contains that key; the Go label otherwise.
    """
    datasource = result.datasource.lower()

    # In a real app each label would be a complete RAG chain for that datasource
    known_routes = (
        ("python_docs", "chain for python_docs"),
        ("js_docs", "chain for js_docs"),
    )
    for marker, destination in known_routes:
        if marker in datasource:
            return destination

    # Anything that is neither Python nor JavaScript falls through to Go
    return "chain for golang_docs"

# The full chain now includes the routing and branching logic
# (choose_route receives the RouteQuery produced by the router)
full_chain = router | RunnableLambda(choose_route)

# Let's run the full chain
final_destination = full_chain.invoke({"question": question})

print(final_destination)

<a id='part3-2'></a>
## Semantic Routing



In [None]:
# A prompt for a physics expert
# (both templates take a single {query} placeholder)
physics_template = """You are a very smart physics professor. \
You are great at answering questions about physics in a concise and easy to understand manner. \
When you don't know the answer to a question you admit that you don't know.

Here is a question:
{query}"""

# A prompt for a math expert
math_template = """You are a very good mathematician. You are great at answering math questions. \
You are so good because you are able to break down hard problems into their component parts, \
answer the component parts, and then put them together to answer the broader question.

Here is a question:
{query}"""

In [None]:
# Initialize the embedding model
embeddings = OpenAIEmbeddings()

# Store our templates and their embeddings for comparison
# Order matters: index 0 is physics and index 1 is math, which prompt_router's
# debug message relies on
prompt_templates = [physics_template, math_template]
prompt_embeddings = embeddings.embed_documents(prompt_templates)

def prompt_router(input):
    """Pick the expert prompt whose embedding is closest to the user query.

    Args:
        input: Mapping with the user question under the "query" key.

    Returns:
        A PromptTemplate built from the most similar entry of
        ``prompt_templates``.
    """
    # Embed the query and score it against every stored template embedding;
    # row 0 of the similarity result belongs to our single query
    query_vector = embeddings.embed_query(input["query"])
    scores = cosine_similarity([query_vector], prompt_embeddings)[0]

    # Position of the best-scoring template in `prompt_templates`
    most_similar_index = scores.argmax()

    print(f"DEBUG: Using {'MATH' if most_similar_index == 1 else 'PHYSICS'} template.")

    return PromptTemplate.from_template(prompt_templates[most_similar_index])

In [None]:
# The final chain that combines the router with the LLM
# The raw input string is wrapped as {"query": ...}, which is the shape
# prompt_router reads
chain = (
    {"query": RunnablePassthrough()}
    | RunnableLambda(prompt_router)  # Dynamically select the prompt
    | ChatOpenAI()
    | StrOutputParser()
)

# Ask a physics question
print(chain.invoke("What's a black hole"))

<a id='part3-3'></a>
## Query Structuring


In [None]:
# Load a YouTube transcript to inspect its metadata
# docs = YoutubeLoader.from_youtube_url(
#    "https://www.youtube.com/watch?v=pbAd8O1Lvm4", add_video_info=True
#).load()

# Print the metadata of the first document
# print(docs[0].metadata)

In [None]:
class TutorialSearch(BaseModel):
    """A data model for searching over a database of tutorial videos."""

    # The main query for a similarity search over the video's transcript.
    content_search: str = Field(..., description="Similarity search query applied to video transcripts.")
    
    # A more succinct query for searching just the video's title.
    title_search: str = Field(..., description="Alternate version of the content search query to apply to video titles.")
    
    # Optional metadata filters
    # (each defaults to None, which pretty_print treats as "not populated")
    min_view_count: Optional[int] = Field(None, description="Minimum view count filter, inclusive.")
    max_view_count: Optional[int] = Field(None, description="Maximum view count filter, exclusive.")
    earliest_publish_date: Optional[datetime.date] = Field(None, description="Earliest publish date filter, inclusive.")
    latest_publish_date: Optional[datetime.date] = Field(None, description="Latest publish date filter, exclusive.")
    min_length_sec: Optional[int] = Field(None, description="Minimum video length in seconds, inclusive.")
    max_length_sec: Optional[int] = Field(None, description="Maximum video length in seconds, exclusive.")

    def pretty_print(self) -> None:
        """Print every populated field as ``name: value``, one per line.

        Fields whose value is None are skipped.
        """
        # Pydantic v2 exposes the field definitions as `model_fields` on the class
        # and deprecates `__fields__`; fall back to `__fields__` so the helper
        # still works if the installed Pydantic is v1.
        field_names = getattr(type(self), "model_fields", None)
        if field_names is None:
            field_names = type(self).__fields__

        for field in field_names:
            value = getattr(self, field)
            if value is not None:
                print(f"{field}: {value}")

In [None]:
# System prompt for the query analyzer
system = """You are an expert at converting user questions into database queries. \
You have access to a database of tutorial videos about a software library for building LLM-powered applications. \
Given a question, return a database query optimized to retrieve the most relevant results.

If there are acronyms or words you are not familiar with, do not try to rephrase them."""

prompt = ChatPromptTemplate.from_messages([("system", system), ("human", "{question}")])
# Reuses the notebook-level `llm` (last bound in the Logical Routing section)
structured_llm = llm.with_structured_output(TutorialSearch)

# The final query analyzer chain
# (its output is a TutorialSearch instance)
query_analyzer = prompt | structured_llm

In [None]:
# Test 1: A simple query
# (no filter is mentioned in the question)
query_analyzer.invoke({"question": "rag from scratch"}).pretty_print()

In [None]:
# Test 2: A query with a date filter
# (the question names a publication year)
query_analyzer.invoke(
    {"question": "videos on chat langchain published in 2023"}
).pretty_print()

In [None]:
# Test 3: A query with a length filter
# (the question gives the limit in minutes, while the schema field is in seconds)
query_analyzer.invoke(
    {
        "question": "how to use multi-modal models in an agent, only videos under 5 minutes"
    }
).pretty_print()

<a id='part4'></a>
# Advanced Indexing Strategies



<a id='part4-1'></a>
## Multi-Representation Indexing


In [None]:
# Load two different blog posts to create a more diverse knowledge base
# (no parsing filter is passed here, unlike the loaders in the earlier sections)
loader = WebBaseLoader("https://lilianweng.github.io/posts/2023-06-23-agent/")
docs = loader.load()

loader = WebBaseLoader("https://lilianweng.github.io/posts/2024-02-05-human-data-quality/")
docs.extend(loader.load())

print(f"Loaded {len(docs)} documents.")

Next, we’ll create a chain to generate a summary for each of these documents.

In [None]:
# The chain for generating summaries
summary_chain = (
    # Extract the page_content from the document object
    {"doc": lambda x: x.page_content}
    # Pipe it into a prompt template
    | ChatPromptTemplate.from_template("Summarize the following document:\n\n{doc}")
    # Use an LLM to generate the summary
    # (max_retries=0: a failed request is not retried)
    | ChatOpenAI(model="gpt-3.5-turbo", max_retries=0)
    # Parse the output into a string
    | StrOutputParser()
)

# Use .batch() to run the summarization in parallel for efficiency
# (at most 5 documents are processed concurrently)
summaries = summary_chain.batch(docs, {"max_concurrency": 5})

# Let's inspect the first summary
print(summaries[0])

In [None]:
# The vectorstore to index the summary embeddings
vectorstore = Chroma(collection_name="summaries", embedding_function=OpenAIEmbeddings())

# The storage layer for the parent documents
store = InMemoryByteStore()
id_key = "doc_id" # This key will link summaries to their parent documents

# The retriever that orchestrates the whole process
# (rebinds the notebook-level `retriever`)
retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    byte_store=store,
    id_key=id_key,
)

# Generate unique IDs for each of our original documents
doc_ids = [str(uuid.uuid4()) for _ in docs]

# Create new Document objects for the summaries, adding the 'doc_id' to their metadata
# (summaries[i] belongs to docs[i], so both share doc_ids[i])
summary_docs = [
    Document(page_content=s, metadata={id_key: doc_ids[i]})
    for i, s in enumerate(summaries)
]

# Add the summaries to the vectorstore
retriever.vectorstore.add_documents(summary_docs)

# Add the original documents to the docstore, linking them by the same IDs
retriever.docstore.mset(list(zip(doc_ids, docs)))

In [None]:
# The query used for both the summary search and the full retrieval below
query = "Memory in agents"

# First, let's see what the vectorstore finds by searching the summaries
# (k=1 returns only the single closest summary)
sub_docs = vectorstore.similarity_search(query, k=1)
print("--- Result from searching summaries ---")
print(sub_docs[0].page_content)
print("\n--- Metadata showing the link to the parent document ---")
print(sub_docs[0].metadata)

In [None]:
# Let the full retriever do its job.
# The number of results is controlled through the retriever's `search_kwargs`,
# which are forwarded to the vector store search. `n_results` is not a parameter
# the retriever accepts, so passing it to invoke() did not limit anything.
retriever.search_kwargs = {"k": 1}
retrieved_docs = retriever.invoke(query)

# Print the beginning of the retrieved full document
print("\n--- The full document retrieved by the MultiVectorRetriever ---")
print(retrieved_docs[0].page_content[0:500])

<a id='part4-2'></a>
## Hierarchical Indexing (RAPTOR) Knowledge Tree


<a id='part4-3'></a>
## Token-Level Precision



In [None]:
# Fetch Wikipedia content
def get_wikipedia_page(title: str, timeout: float = 10.0) -> str:
    """Fetch the plain-text extract of an English Wikipedia page.

    Args:
        title: Title of the page to fetch.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The page's plain-text extract, or "" when the response contains no
        page or no extract.

    Raises:
        requests.HTTPError: If the API answers with an error status code.
        requests.Timeout: If the request does not complete within ``timeout``.
    """
    URL = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "format": "json",
        "titles": title,
        "prop": "extracts",
        "explaintext": True,
    }
    headers = {"User-Agent": "MyFastEmbedApp/1.0"}

    # Without a timeout a stalled connection would hang the notebook cell forever
    response = requests.get(URL, params=params, headers=headers, timeout=timeout)
    # Surface HTTP errors here instead of failing later while parsing the body
    response.raise_for_status()
    data = response.json()

    # The result is keyed by page id; take the first (only requested) page and
    # tolerate payloads that lack the "query"/"pages" structure
    pages = data.get("query", {}).get("pages", {})
    page = next(iter(pages.values()), {})
    return page.get("extract", "")

# Download the article text that will be indexed
full_document = get_wikipedia_page("Hayao_Miyazaki")

# Split the text into smaller chunks (token-level approximation)
# Use small chunk size (e.g., 50-100 characters) to mimic token-level embeddings
text_splitter = RecursiveCharacterTextSplitter(chunk_size=50, chunk_overlap=10)
chunks = text_splitter.split_text(full_document)

# Convert chunks to Document objects
documents = [Document(page_content=chunk) for chunk in chunks]

# Create Chroma vector store with embeddings
# (the collection name mentions FastEmbed, but the embeddings used here are OpenAI's)
vectorstore = Chroma.from_documents(
    documents=documents,
    embedding=OpenAIEmbeddings(),  # or any other embedding model
    collection_name="Miyazaki-FastEmbed-TokenLevel"
)

# Perform similarity search
query = "What animation studio did Miyazaki found?"
results = vectorstore.similarity_search(query, k=5)

# Print results
# (numbered from 1 for display)
for i, doc in enumerate(results, 1):
    print(f"Result {i}: {doc.page_content}\n")


<a id='part5'></a>
# Advanced Retrieval & Generation


<a id='part5-1'></a>
## Dedicated Re-ranking


In [None]:
# You will need to set your COHERE_API_KEY environment variable
# os.environ['COHERE_API_KEY'] = '<your-cohere-api-key>'

# Load, split, and index the document
loader = WebBaseLoader(web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",))
blog_docs = loader.load()
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=300, chunk_overlap=50)
splits = text_splitter.split_documents(blog_docs)
# NOTE(review): no collection_name is passed — confirm this index does not share
# a default collection with the indexes built in earlier sections.
vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings())

# First-pass retriever: get the top 10 potentially relevant documents
# (these candidates are what the re-ranker in the next cell reorders)
retriever = vectorstore.as_retriever(search_kwargs={"k": 10})

In [None]:
# Initialize the Cohere Rerank model
compressor = CohereRerank(model="rerank-multilingual-v3.0")

# Create the compression retriever
# (it wraps the first-pass retriever and applies the compressor to its results)
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, 
    base_retriever=retriever
)

# Let's test it with our query
question = "What is task decomposition for LLM agents?"
compressed_docs = compression_retriever.invoke(question)

# Print the re-ranked documents
# (each returned document carries a 'relevance_score' entry in its metadata)
print("--- Re-ranked and Compressed Documents ---")
for doc in compressed_docs:
    print(f"Relevance Score: {doc.metadata['relevance_score']:.4f}")
    print(f"Content: {doc.page_content[:150]}...\n")

<a id='part6'></a>
# Manual RAG Evaluation



In [None]:
# We'll use a powerful LLM like gpt-4o to act as our "judge" for reliable evaluation.
# (rebinds the notebook-level `llm`; both evaluation chains below are built on it)
llm = ChatOpenAI(temperature=0, model_name="gpt-4o", max_tokens=4000)

# Define the output schema for our evaluation score to ensure consistent, structured output.
# (documented with `#` comments only: the model is handed to with_structured_output,
# so no docstring is added that could alter the schema)
class ResultScore(BaseModel):
    # Required float; the description states the intended 0-to-1 range
    score: float = Field(..., description="The score of the result, ranging from 0 to 1 where 1 is the best possible score.")

# This prompt template clearly instructs the LLM on how to score the answer's correctness.
# It compares the generated answer against a ground-truth answer for the same question.
correctness_prompt = PromptTemplate(
    input_variables=["question", "ground_truth", "generated_answer"],
    template="""
    Question: {question}
    Ground Truth: {ground_truth}
    Generated Answer: {generated_answer}

    Evaluate the correctness of the generated answer compared to the ground truth.
    Score from 0 to 1, where 1 is perfectly correct and 0 is completely incorrect.
    
    Score:
    """
)

# We build the evaluation chain by piping the prompt to the LLM with structured output.
# (the chain therefore returns a ResultScore instance)
correctness_chain = correctness_prompt | llm.with_structured_output(ResultScore)

In [None]:
def evaluate_correctness(question, ground_truth, generated_answer):
    """Score a generated answer against the ground truth with the judge chain.

    Args:
        question: The question that was asked.
        ground_truth: The reference answer.
        generated_answer: The answer to be graded.

    Returns:
        The ``score`` attribute of the chain's result.
    """
    judge_inputs = {
        "question": question,
        "ground_truth": ground_truth,
        "generated_answer": generated_answer,
    }
    verdict = correctness_chain.invoke(judge_inputs)
    return verdict.score

# Test the correctness chain with a partially correct answer.
# (the generated answer names only one of the two capitals in the ground truth)
question = "What is the capital of France and Spain?"
ground_truth = "Paris and Madrid"
generated_answer = "Paris"
score = evaluate_correctness(question, ground_truth, generated_answer)

print(f"Correctness Score: {score}")

In [None]:
# The prompt template for faithfulness includes several examples (few-shot prompting)
# to make the instructions to the judge LLM crystal clear.
# The score asked for is binary (0 or 1) and judges only whether the answer can be
# deduced from the context, not whether it is correct.
faithfulness_prompt = PromptTemplate(
    input_variables=["question","context", "generated_answer"],
    template="""
    Question: {question}
    Context: {context}
    Generated Answer: {generated_answer}

    Evaluate if the generated answer to the question can be deduced from the context.
    Score of 0 or 1, where 1 is perfectly faithful *AND CAN BE DERIVED FROM THE CONTEXT* and 0 otherwise.
    You don't mind if the answer is correct; all you care about is if the answer can be deduced from the context.
    
    Example:
    Question: What is the capital of France and Spain?
    Context: Paris is the capital of France and Madrid is the capital of Spain.
    Generated Answer: Paris
    in this case the generated answer is faithful to the context so the score should be *1*.
    
    Example:
    Question: What is 2+2?
    Context: 4.
    Generated Answer: 4.
    In this case, the context states '4', but it does not provide information to deduce the answer to 'What is 2+2?', so the score should be 0.
    """
)

# Build the faithfulness chain using the same structured LLM.
# (like the correctness chain, it returns a ResultScore instance)
faithfulness_chain = faithfulness_prompt | llm.with_structured_output(ResultScore)

In [None]:
def evaluate_faithfulness(question, context, generated_answer):
    """Run the faithfulness judge chain and return the score it reports.

    The three arguments are forwarded to ``faithfulness_chain`` under the
    keys ``question``, ``context`` and ``generated_answer``; the ``score``
    attribute of the chain's result is returned.
    """
    judge_inputs = {
        "question": question,
        "context": context,
        "generated_answer": generated_answer,
    }
    verdict = faithfulness_chain.invoke(judge_inputs)
    return verdict.score

# Try the faithfulness chain on an answer that is correct but is not something
# the bare context "6" lets you deduce for this question.
question = "what is 3+3?"
context = "6"
generated_answer = "6"
score = evaluate_faithfulness(
    question=question,
    context=context,
    generated_answer=generated_answer,
)

print(f"Faithfulness Score: {score}")

<a id='part7'></a>
# Evaluation with Frameworks


<a id='part7-1'></a>
## Rapid Evaluation with `deepeval`


In [None]:
# Create test cases
# Each LLMTestCase bundles the user input, the reference answer
# (expected_output) and the answer under evaluation (actual_output).
test_case_correctness = LLMTestCase(
    input="What is the capital of Spain?",
    expected_output="Madrid is the capital of Spain.",
    actual_output="MadriD."
)

test_case_faithfulness = LLMTestCase(
    input="How much is 3+3?",
    expected_output="6",
    actual_output="6"
)

# GEval only shows the judge the test-case fields listed in `evaluation_params`.
# The criteria compares the model output with the correct answer, so the
# reference answer (EXPECTED_OUTPUT) has to be listed next to ACTUAL_OUTPUT;
# with ACTUAL_OUTPUT alone the judge has nothing to compare against.
# NOTE: the variable keeps its original name for compatibility, although the
# metric it holds is an exact-match check rather than a coherence check.
coherence_metric = GEval(
    name="Match",
    criteria="Check if the model output matches the correct answer exactly.",
    evaluation_params=[
        LLMTestCaseParams.ACTUAL_OUTPUT,
        LLMTestCaseParams.EXPECTED_OUTPUT,
    ],
)

# Score each test case in turn; after measure() the metric exposes the score
# and the judge's explanation for the most recent case.
for test_case in (test_case_correctness, test_case_faithfulness):
    coherence_metric.measure(test_case)
    print(coherence_metric.score)
    print(coherence_metric.reason)


<a id='part7-2'></a>
## Another Powerful Alternative with `grouse`


In [None]:
# Build the grouse evaluator and a deliberately unfaithful sample: the actual
# answer places the Eiffel Tower in Marseille although both references say Paris.
evaluator = GroundedQAEvaluator()
unfaithful_sample = EvaluationSample(
    input="Where is the Eiffel Tower located?",
    references=[
        "The Eiffel Tower is a wrought-iron lattice tower on the Champ de Mars in Paris, France",
        "Gustave Eiffel died in his appartment at Rue Rabelais in Paris."
    ],
    expected_output="The Eiffel Tower is located at Rue Rabelais in Paris.",
    actual_output="The Eiffel Tower is located in Marseille.",
)

# Evaluate the single sample and keep its (only) evaluation entry.
grouse_report = evaluator.evaluate(eval_samples=[unfaithful_sample])
result = grouse_report.evaluations[0]
print(f"Grouse Faithfulness Score (0 or 1): {result.faithfulness.faithfulness}")

<a id='part7-3'></a>
## Evaluation with `RAGAS`



In [None]:
# 1. Prepare the evaluation data
# Each record keeps one sample's question, pipeline answer, reference answer
# and retrieved context side by side; the parallel lists used further down
# are derived from these records.
ragas_samples = [
    {
        "question": "What is the name of the three-headed dog guarding the Sorcerer's Stone?",
        "generated_answer": "The three-headed dog is named Fluffy.",
        "ground_truth": "Fluffy",
        "contexts": ["A massive, three-headed dog was guarding a trapdoor. Hagrid mentioned its name was Fluffy."],
    },
    {
        "question": "Who gave Harry Potter his first broomstick?",
        "generated_answer": "Professor McGonagall gave Harry his first broomstick, a Nimbus 2000.",
        "ground_truth": "Professor McGonagall",
        "contexts": ["First years are not allowed brooms, but Professor McGonagall, head of Gryffindor, made an exception for Harry."],
    },
    {
        "question": "Which house did the Sorting Hat initially consider for Harry?",
        "generated_answer": "The Sorting Hat strongly considered putting Harry in Slytherin.",
        "ground_truth": "Slytherin",
        "contexts": ["The Sorting Hat muttered in Harry's ear, 'You could be great, you know, it's all here in your head, and Slytherin will help you on the way to greatness...'"],
    },
]

questions = [sample["question"] for sample in ragas_samples]

# These would be the answers generated by our RAG pipeline
generated_answers = [sample["generated_answer"] for sample in ragas_samples]

# The ground truth, or "perfect" answers
ground_truth_answers = [sample["ground_truth"] for sample in ragas_samples]

# The context retrieved by our RAG system for each question
# (one list of passages per question)
retrieved_documents = [list(sample["contexts"]) for sample in ragas_samples]

In [None]:
# 2. Structure the data into a Hugging Face Dataset object
# One column per field, one row per question; the column names are the keys below.
data_samples = {
    "question": questions,
    "answer": generated_answers,
    "contexts": retrieved_documents,
    "ground_truth": ground_truth_answers,
}

dataset = Dataset.from_dict(data_samples)

In [None]:
# 3. Define the metrics we want to use for evaluation
#    - faithfulness:       is the answer factually consistent with the context? (guards against hallucination)
#    - answer_relevancy:   is the answer relevant to the question?
#    - context_recall:     did we retrieve all the context needed to answer the question?
#    - answer_correctness: is the answer accurate compared to the ground truth?
metrics = [faithfulness, answer_relevancy, context_recall, answer_correctness]

# 4. Run the evaluation
result = evaluate(dataset=dataset, metrics=metrics)

# 5. Display the results in a clean table format
results_df = result.to_pandas()
print(results_df)