# GenAI-Camp: Day 03
## Lesson: RAG Optimization

This lesson is intended to show you how to optimize a RAG system.

During this lesson you will learn how to ...

- use HYDE for generating hypothetical documents to potentially boost the retrieval
- use a reranker to improve RAG performance

### Set up the environment
Import the necessary libraries, set constants, and define helper functions.

In [None]:
from pydantic import BaseModel
import json
from google import genai
from google.genai import types
import os
import requests
import json

In [26]:
if os.getenv("COLAB_RELEASE_TAG"):
   from google.colab import userdata
   GOOGLE_API_KEY=userdata.get('GEMINI_API_KEY')
   COLAB = True
   print("Running on COLAB environment.")
else:
   from dotenv import load_dotenv, find_dotenv
   load_dotenv(find_dotenv())
   GOOGLE_API_KEY = os.getenv("GEMINI_API_KEY")
   JINA_API_KEY = os.getenv("JINA_API_KEY")
   COLAB = False
   print("WARNING: Running on LOCAL environment.")
client = genai.Client(api_key=GOOGLE_API_KEY)



In [27]:
# Install additional libraries
if COLAB:
  !pip install -qU chromadb
    
# Import additional libraries
from chromadb import PersistentClient

In [28]:
# Define path of ressources
if COLAB:
    # Clone the data repository into colab
    !git clone https://github.com/openknowledge/workshop-genai-camp-data.git
    %cd workshop-genai-camp-data
    !git lfs pull 
    ROOT_PATH = "/content/workshop-genai-camp-data/day-03"
else:
    ROOT_PATH = ".."
DATA_PATH = ROOT_PATH + "/data"
EVALUATION_PATH = ROOT_PATH + "/evaluation"
KNOWLEDGEBASE_PATH = ROOT_PATH + "/knowledgebase"
BOOK_CATALOG_FILE = DATA_PATH + "/books.json"
TESTDATA_FILE = EVALUATION_PATH + "/synthetic_testset.json"

In [29]:
# Set default models
GENERATION_MODEL = "gemini-1.5-flash"
EMBEDDING_MODEL = "models/text-embedding-004"

# Set default values for model, model parameters and prompt
DEFAULT_CONFIG_TEMPERATURE = 0.9 
DEFAULT_CONFIG_TOP_K = 1
DEFAULT_CONFIG_MAX_OUTPUT_TOKENS = 200 
DEFAULT_SYSTEM_PROMPT = "Your are a friendly assistant"
DEFAULT_USER_PROMPT = " "

# Set defaults for rag
DEFAULT_K = 3
DEFAULT_CHUNK_SIZE = 2000
DEFAULT_CHUNK_OVERLAP = 100

In [30]:
def generate_gemini_completion(
        model_name: str = GENERATION_MODEL, 
        temperature:float = DEFAULT_CONFIG_TEMPERATURE,
        top_k: int = DEFAULT_CONFIG_TOP_K, 
        max_output_tokens: int = DEFAULT_CONFIG_MAX_OUTPUT_TOKENS, 
        system_prompt : str = DEFAULT_SYSTEM_PROMPT, 
        user_prompt : str = DEFAULT_USER_PROMPT,
        verbose: bool = False
        ) -> str: 
    
    """ Calls a gemini model with a given set of parameters and returns the completions 
    
    Parameters
    ----------
    model_name : str, optional [default: DEFAULT_GEMINI_MODEL]
        The name of the model to use for the completion
    temperature : float, optional [default: DEFAULT_CONFIG_TEMPERATURE]
        The temperature of the model
    top_k : int, optional [default: DEFAULT_CONFIG_TOP_K]
        The number of most recent matches to return
    max_output_tokens : int, optional [default: DEFAULT_CONFIG_MAX_OUTPUT_TOKENS]
        The maximum number of output tokens to return
    system_prompt : str, optional [default: DEFAULT_SYSTEM_PROMPT]
        The system prompt to use for the completion
    user_prompt : str, optional [default: DEFAULT_USER_PROMPT]
        The user prompt to use for the completion
    verbose : bool, optional [default: False]
        Whether to print details of the completion process or not. Defaults to False            
    Returns 
    -------
    str :
        the generated text      
    """    
    if verbose: 
        # print out summary of input values / parameters
        print(f'Generating answer for following config:')
        print(f'  - SYSTEM PROMPT used:\n {system_prompt}')
        print(f'  - USER PROMPT used:\n {user_prompt}')
        print(f'  - MODEL used:\n {model_name} (temperature = {temperature}, top_k = {top_k}, max_output_tokens = {max_output_tokens})')

    # create generation config 
    model_config = types.GenerateContentConfig(
        max_output_tokens=max_output_tokens,
        temperature=temperature,
        top_k=top_k,
        system_instruction=system_prompt,
    )
    
    # create generation request
    response = client.models.generate_content(
        model=model_name,
        contents=user_prompt,
        config=model_config,
    )
    
    return response.text

In [31]:
def read_objects_from_json(file_path: str, cls: BaseModel) -> list:
    """Reads list of objects from a JSON file and returns the list."""
    with open(file_path, 'r') as file:
        data = json.load(file)
        objects = [cls(**item) for item in data]
    return objects

class Metadata(BaseModel):
    """Represents the metadata of a document which is stored in the knowledgebase."""
    url: str
    title: str
    pub_year: int

class Book(BaseModel):
    """Represents a book with its metadata."""
    metadata: Metadata
    summary: str

class TestdataItem(BaseModel):
    """Represents a test data item with its input, output, and source."""
    input: str
    output: str
    source: Book   

In [32]:
# RAG building blocks

# This will be the chromadb collection we use as a knowledge base. We do not need the client.
chromadb_collection = PersistentClient(path=KNOWLEDGEBASE_PATH).get_or_create_collection(name="default")

class FetchedChunk(BaseModel):
    """Represents a chunk fetched from the knowledgebase."""
    chunk: str
    metadata: Metadata

# Building Block "Embedding": Create multi dimensional embeddings for a given chunk.
def do_embed(chunk: str) -> list[float]:
  """ Embeds a given chunk and returns the embedding

  Parameters
  ----------
  chunk : str
      The chunk to be embedded
  Returns
  -------
  embedding: [float]
      The created embedding
  """
  content_embeddings = client.models.embed_content(model=EMBEDDING_MODEL, contents=chunk).embeddings
  return content_embeddings[0].values

# Building Block "Augmentation": Create an updated prompt by merging the original user input with the provided context
# Attention: We manipulated the augmented prompt in order to see the guardrails in action
def augment(user_input: str, context: list[str]) -> str:
  """ Augments a given user input by merging it with the provided context and returns the augmented prompt

  Parameters
  ----------
  user_input : str
      The user input to be augmented
  context : [str]
      The context to be merged with the user input
  Returns
  -------
  augmented_prompt: str
      The created augmented prompt
  """
  prepared_context = "\n".join(context)
  augmented_prompt = f"""
    Answer the question as detailed as possible from the provided context, make sure to provide all the details, if the answer is not in
    provided context just say, "answer is not available in the context", don't provide the wrong answer\n\n
    Context:\n{prepared_context}?\n
    Question: \n{user_input}\n

    Answer:
  """
  return augmented_prompt

# Building Block "Top-k Fetching": Get the k semantically closest chunks to the user input from the knowledgebase
def do_top_k_fetching(user_input_embedding: list[float], top_k: int) -> list[FetchedChunk]:
  """ Fetches the k semantically closest chunks to the user input from the knowledgebase

  Parameters
  ----------
  user_input_embedding : [float]
      The embedding of the user input
  top_k : int
      The number of semantically closest chunks to be fetched

  Returns
  -------
  context: [str]
      The fetched chunks
  """
  # Since we will do the fetching always only for one user_input,
  # instead of querying for multiple embeddings simultanously as allowed by the choma API,
  # we add the embeddings below to a list and return only the first document (chunk)
  
  query_result = chromadb_collection.query(
      query_embeddings=[user_input_embedding],
      n_results=top_k,
  )
  chunks = query_result["documents"][0]
  metadatas = query_result["metadatas"][0]
  
  fetched_chunks = []
  for i in range(len(chunks)):
    chunk = chunks[i]
    metadata = metadatas[i]
    fetched_chunk = FetchedChunk(chunk=chunk, metadata=Metadata(**metadata))
    fetched_chunks.append(fetched_chunk)
  return fetched_chunks

# Building Block "Generation": Use the generation model to create a response
def generate_response(prompt: str) -> str:
  """ Generates a response for a given prompt

  Parameters
  ----------
  prompt : str
      The prompt to be used for the generation
  Returns
  -------
  response: str
      The generated response
  """
  return generate_gemini_completion(
      model_name=GENERATION_MODEL,
      user_prompt=prompt,
  )

# The rag function should now return the response and the context in order to be evaluated further
def do_rag(user_input: str, top_k: int, retrieval_only: bool = False) -> tuple[str, list[str]]:
  """ Runs the RAG pipeline with a given user input and returns the response and the context

  Parameters
  ----------
  user_input : str
      The user input to be used for the RAG pipeline
  Returns
  -------
  response: str
      The generated response
  context: [str]
      The fetched chunks
  """
  # Embed the user input
  user_input_embedding = do_embed(chunk=user_input)

  # "R" like "Retrieval": Get the k semantically closest chunks to the user input from the knowledgebase
  fetched_chunks = do_top_k_fetching(user_input_embedding=user_input_embedding, top_k=top_k)
  context = [chunk.chunk for chunk in fetched_chunks]

  # "A" like "Augmented": Create the augmented prompt
  augmented_prompt = augment(user_input=user_input, context=context)

  # "G" like "Generation": Generate a response
  if not retrieval_only:
    # Generate a response using the augmented prompt
    response = generate_response(prompt=augmented_prompt)
  else:
    response = "No response generated. Only retrieval was requested."

  return (response, fetched_chunks)

In [33]:
def calculate_hit_rate(ground_truth, retrieved: list[str]):
    gt_set = set(ground_truth)
    retrieved_set = set(retrieved)
    return int(bool(gt_set & retrieved_set))

def calculate_reciprocal_rank(ground_truth: list[str], retrieved: list[str]) -> float:
    gt_set = set(ground_truth)
    for rank, item in enumerate(retrieved, start=1):
        if item in gt_set:
            return 1 / rank
    return 0.0
    
def calculate_precision(ground_truth: list[str], retrieved: list[str]) -> float:
    k = len(retrieved)
    gt_set = set(ground_truth)
    retrieved_set = set(retrieved[:k])
    return len(gt_set & retrieved_set) / k

In [34]:
class EvaluationResult(BaseModel):
    """Represents the evaluation result with its metrics."""
    mean_hit_rate: float
    mean_reciprocal_rank: float
    mean_precision: float

def run_evaluation(top_k: int, testdata_items: list[TestdataItem], rag_function: any) -> EvaluationResult:
    hit_rates = []
    reciprocal_ranks = []
    precisions = []

    for testdata_item in testdata_items:

        # Prepare the ground truth
        ground_truth = [testdata_item.source.metadata.title]

        # Run the RAG pipeline
        _, fetched_chunks = rag_function(user_input=testdata_item.input, top_k=top_k, retrieval_only=True)

        # Check if the ground truth is in the context
        retrieved = [item.metadata.title for item in fetched_chunks]

        # Calculate metrics
        # Hit rate
        hit_rate_value = calculate_hit_rate(ground_truth=ground_truth, retrieved=retrieved)
        hit_rates.append(hit_rate_value)
        
        # Reciprocal rank
        reciprocal_rank = calculate_reciprocal_rank(ground_truth=ground_truth, retrieved=retrieved)
        reciprocal_ranks.append(reciprocal_rank)

        # Precision
        precision = calculate_precision(ground_truth=ground_truth, retrieved=retrieved)
        precisions.append(precision)
    
    # Calculate mean values
    mean_hit_rate = sum(hit_rates) / len(hit_rates)
    mean_reciprocal_rank = sum(reciprocal_ranks) / len(reciprocal_ranks)
    mean_precision = sum(precisions) / len(precisions)
    return EvaluationResult(
        mean_hit_rate=mean_hit_rate,
        mean_reciprocal_rank=mean_reciprocal_rank,
        mean_precision=mean_precision
    )

In [35]:
# Read the test data for later evaluating optimization techniques
testdata_items = read_objects_from_json(file_path=TESTDATA_FILE, cls=TestdataItem)

### Exercise 01: Hypothetical Document Embeddings (HyDE)
HyDE is a technique for improving retrieval. To do this, hypothetical documents are generated based on the user's input that could potentially solve the user's task. These generated documents should closely resemble the documents or chunks in the knowledge base. The hypothetical, generated documents are then used for semantic retrieval.  
The underlying idea is that, although the generated documents may not be factually accurate, they are often semantically closer to relevant knowledge base entries than the user's original query. In practice, multiple hypothetical documents are often generated, embedded, and then averaged over each embedding dimension to create a single embedding used for retrieval. This averaging is intended to neutralize incorrect facts while enhancing semantic similarity to the knowledge base.  
Your task is to implement this method. Afterwards, reflect on the potential drawbacks of this approach and evaluate the performance of the resulting system.

In [None]:
HYDE_GENERATION_MODEL = "gemini-2.0-flash-lite"

# TODO: Implement a function which creates hypothetical summaries of books from a given user query
def generate_hypothetical_documents(user_input: str) -> str:
    """ Generates a hypothetical book summary for a given user input
    
    Parameters
    ----------
    user_input : str
        The user input to be used for the generation
    Returns
    -------
    response: str
        The generated hypothetical book summary
    """

    # TODO: Use your promp engineering skills to create a prompt for
    # generating a hypothetical book summary from a given customer query.
    # As an example, you can use a real book summary from the book catalog
    # CUSTOMER: "I’m looking for an old book about a scientist who creates a creature out of dead body parts! The creature comes to life and things go very badly."
    # SUMMARY:"Frankenstein; Or, The Modern Prometheus" by Mary Wollstonecraft Shelley is a novel written in the early 19th century. The story explores themes of ambition, the quest for knowledge, and the consequences of man's hubris through the experiences of Victor Frankenstein and the monstrous creation of his own making.The opening of the book introduces Robert Walton, an ambitious explorer on a quest to discover new lands and knowledge in the icy regions of the Arctic. In his letters to his sister Margaret, he expresses both enthusiasm and the fear of isolation in his grand venture. As Walton's expedition progresses, he encounters a mysterious, emaciated stranger who has faced great suffering—furthering the intrigue of his narrative. This stranger ultimately reveals his tale of creation, loss, and the profound consequences of seeking knowledge that lies beyond human bounds. The narrative is set up in a manner that suggests a deep examination of the emotions and ethical dilemmas faced by those who dare to defy the natural order.
    hyde_prompt = f"""
        You are HYDE, a fictional book summary creator for a bookstore assistant.
        When a customer describes a book they are trying to remember or find, 
        you will generate a summary of the book based on the description provided.
        Use your extensive knowledge of literature to create a plausible summary.

        Below is an example:

        CUSTOMER:
        "I’m looking for an old book about a scientist who creates a creature out of dead body parts! The creature comes to life and things go very badly."

        SUMMARY:
        "Frankenstein; Or, The Modern Prometheus" by Mary Wollstonecraft Shelley is a novel written in the early 19th century. The story explores themes of ambition, the quest for knowledge, and the consequences of man's hubris through the experiences of Victor Frankenstein and the monstrous creation of his own making.
        The opening of the book introduces Robert Walton, an ambitious explorer on a quest to discover new lands and knowledge in the icy regions of the Arctic. In his letters to his sister Margaret, he expresses both enthusiasm and the fear of isolation in his grand venture. As Walton's expedition progresses, he encounters a mysterious, emaciated stranger who has faced great suffering—furthering the intrigue of his narrative. This stranger ultimately reveals his tale of creation, loss, and the profound consequences of seeking knowledge that lies beyond human bounds. The narrative is set up in a manner that suggests a deep examination of the emotions and ethical dilemmas faced by those who dare to defy the natural order.

        Now, based on the following customer inquiry, write a fictional book summary in the same style and length:

        CUSTOMER:
        {user_input}
        
        SUMMARY:
    """
    
    # Generate a hypothetical book summary using the user input as a prompt
    hypothetical_summary = generate_gemini_completion(
        model_name=HYDE_GENERATION_MODEL,
        user_prompt=hyde_prompt,
        temperature=1.0,
    )
    
    return hypothetical_summary

In [None]:
# TODO: Update the following function to use the new generate_hypothetical_documents function
def do_hyde_rag(user_input: str, top_k: int, retrieval_only: bool = False, verbose: bool = False) -> tuple[str, list[str]]:
  """ Runs the RAG pipeline with a given user input and returns the response and the context

  Parameters
  ----------
  user_input : str
      The user input to be used for the RAG pipeline
  Returns
  -------
  response: str
      The generated response
  context: [str]
      The fetched chunks
  """

  # TODO: Generate a hypothetical book summary using the user input as a prompt
  hypothetical_summary = generate_hypothetical_documents(user_input=user_input)

  # TODO: Embed the hypothetical book summaries
  hypothetical_summary_embedding = do_embed(chunk=hypothetical_summary)

  # TODO: "R" like "Retrieval": Get the top-k semantically closest chunks to the hypothetical summary
  fetched_chunks = do_top_k_fetching(user_input_embedding=hypothetical_summary_embedding, top_k=top_k)
  context = [chunk.chunk for chunk in fetched_chunks]

  # "A" like "Augmented": Create the augmented prompt
  augmented_prompt = augment(user_input=user_input, context=context)

  # "G" like "Generation": Generate a response
  if not retrieval_only:
    # Generate a response using the augmented prompt
    response = generate_response(prompt=augmented_prompt)
  else:
    response = "No response generated. Only retrieval was requested."

  if verbose:
    print(f"User input: {user_input}")
    print(f"Generated hypothetical summary: {hypothetical_summary}")
    print(f"Fetched chunks: {fetched_chunks}")
    print(f"Augmented prompt: {augmented_prompt}")
    print(f"Generated response: {response}")
  return (response, fetched_chunks)

In [38]:
# TODO: Test the do_hyde_rag function with a user input
user_prompt = "I'm looking for a book that takes place in London. I think it's about a crime that happened on a foggy night?" 
response, context = do_hyde_rag(user_input=user_prompt, top_k=DEFAULT_K, retrieval_only=True, verbose=True)

User input: I'm looking for a book that takes place in London. I think it's about a crime that happened on a foggy night?
Generated hypothetical summary: "The Hound of the Baskervilles" by Sir Arthur Conan Doyle is a thrilling mystery novel, set in the atmospheric backdrop of the English moors and, specifically, London. The story follows the astute detective Sherlock Holmes and his loyal companion, Dr. Watson, as they investigate a mysterious curse and a series of eerie deaths plaguing the Baskerville family.

The narrative begins with the ominous tale of a spectral hound, said to haunt the Baskerville estate, and the suspicious death of Sir Charles Baskerville. Fearing for the life of the new heir, Sir Henry Baskerville, Holmes and Watson embark on a journey to unravel the truth behind the legend. The investigation takes them to the desolate moors of Devonshire, where they encounter a cast of eccentric characters, including the enigmatic Stapleton and the elusive escaped convict. The 

In [None]:
# TODO: Evaluate the new system. How does it perform compared to the original RAG system? What might be the reasons for the differences?
evaluation_result = run_evaluation(
    top_k=DEFAULT_K,
    testdata_items=testdata_items,
    rag_function=do_hyde_rag
)

print(f"Mean Hit Rate: {evaluation_result.mean_hit_rate:.2f}")
print(f"Mean Reciprocal Rank: {evaluation_result.mean_reciprocal_rank:.2f}")
print(f"Mean Precision: {evaluation_result.mean_precision:.2f}")

EvaluationResult(mean_hit_rate=0.43333333333333335, mean_reciprocal_rank=0.3555555555555555, mean_precision=0.14444444444444443)

### Exercise 02: Reranking
In a RAG system, rerankers play a crucial role in improving the quality of retrieved information before it's passed to the language model. While the retriever fetches potentially relevant documents based on semantic similarity, it often returns noisy or loosely related results. Rerankers evaluate and reorder these documents using more advanced scoring—often based on cross-encoders or task-specific relevance signals—ensuring that the most relevant, contextually appropriate content is prioritized. This leads to more accurate, coherent, and trustworthy generated outputs.  
Your task is to use a reranker from Hugging Face to reorder the retrieved chunks. Then, integrate the reranking step into the existing RAG function.

In [40]:
# If you run this the first time, get your free API-Key at https://jina.ai/reranker/
if COLAB:
    JINA_API_KEY=userdata.get('JINA_API_KEY')

# Here we use the Jina API to rerank the documents
def use_reranker(query:str, documents:list[str], top_n: int) -> list[int]:
  url = "https://api.jina.ai/v1/rerank"
  headers = {
      "Content-Type": "application/json",
      "Authorization": f"Bearer {JINA_API_KEY}"
  }

  payload = {
      "model": "jina-reranker-v2-base-multilingual",
      "query": query,
      "top_n": top_n,
      "documents": documents,
      "return_documents": False
  }

  # Make the request
  response = requests.post(url, headers=headers, data=json.dumps(payload))
  results = json.loads(response.text)["results"]
  return [result["index"] for result in results]

In [41]:
# TODO: Update the reranking function, which uses the CrossEncoder model to rerank the fetched chunks
def rerank(user_input: str, fetched_chunks: list[FetchedChunk]) -> list[FetchedChunk]:
    """ Reranks the fetched chunks based on the user input and returns the reranked chunks

    Parameters
    ----------
    user_input : str
        The user input to be used for the reranking
    fetched_chunks : [FetchedChunk]
        The fetched chunks to be reranked
    Returns
    -------
    reranked_chunks: [FetchedChunk]
        The reranked chunks
    """
    
    # Rank the fetched chunks based on the user input
    documents = [chunk.chunk for chunk in fetched_chunks]
    rankings = use_reranker(
        query=user_input,
        documents=documents,
        top_n=len(documents)
    )

    # TODO: Sort the fetched chunks based on the rankings and return the reranked chunks
    return [fetched_chunks[i] for i in rankings]


In [42]:
# TODO: Update the do_reranked_rag function to use the reranking function
def do_reranked_rag(user_input: str, top_k: int, retrieval_only: bool = False, verbose: bool = False) -> tuple[str, list[str]]:
  """ Runs the RAG pipeline with a given user input and returns the response and the context

  Parameters
  ----------
  user_input : str
      The user input to be used for the RAG pipeline
  Returns
  -------
  response: str
      The generated response
  context: [str]
      The fetched chunks
  """

  # Embed the user input
  user_input_embedding = do_embed(chunk=user_input)

  # "R" like "Retrieval": Get the k semantically closest chunks to the user input from the knowledgebase
  fetched_chunks = do_top_k_fetching(user_input_embedding=user_input_embedding, top_k=top_k)

  # TODO: Rerank the fetched chunks based on the user input
  ranked_fetched_chunks = rerank(user_input=user_input, fetched_chunks=fetched_chunks)
  context = [chunk.chunk for chunk in ranked_fetched_chunks]

  # "A" like "Augmented": Create the augmented prompt
  augmented_prompt = augment(user_input=user_input, context=context)

  # "G" like "Generation": Generate a response
  if not retrieval_only:
    # Generate a response using the augmented prompt
    response = generate_response(prompt=augmented_prompt)
  else:
    response = "No response generated. Only retrieval was requested."

  if verbose:
    print(f"User input: {user_input}")
    print(f"Fetched chunks: {fetched_chunks}")
    print(f"Ranked fetched chunks: {ranked_fetched_chunks}")
    print(f"Augmented prompt: {augmented_prompt}")
    print(f"Generated response: {response}")
  return (response, ranked_fetched_chunks)

In [43]:
# TODO: Evaluate the new system. How does it perform compared to the original RAG system?
evaluation_result = run_evaluation(
    top_k=DEFAULT_K,
    testdata_items=testdata_items,
    rag_function=do_reranked_rag
)
print(f"Mean Hit Rate: {evaluation_result.mean_hit_rate:.2f}")
print(f"Mean Reciprocal Rank: {evaluation_result.mean_reciprocal_rank:.2f}")
print(f"Mean Precision: {evaluation_result.mean_precision:.2f}")

Mean Hit Rate: 0.83
Mean Reciprocal Rank: 0.78
Mean Precision: 0.28


### Exercise 03: Improving Reranked RAG
Using the reranker seems to improve quality. Unfortunately, precision is still very low. Can you think about a way to keep reciprocal rank and hit rate high, while also increasing precision?

In [44]:
# TODO: Improve the rag system even further. How can we improve precision while keeping hit rate and reciprocal rank high?
def do_reranked_rag_v2(user_input: str, top_k: int, retrieval_only: bool = False, verbose: bool = False) -> tuple[str, list[str]]:
  """ Runs the RAG pipeline with a given user input and returns the response and the context

  Parameters
  ----------
  user_input : str
      The user input to be used for the RAG pipeline
  Returns
  -------
  response: str
      The generated response
  context: [str]
      The fetched chunks
  """

  # Embed the user input
  user_input_embedding = do_embed(chunk=user_input)

  # "R" like "Retrieval": Get the k semantically closest chunks to the user input from the knowledgebase
  # TODO: Fetch more chunks than before
  fetched_chunks = do_top_k_fetching(user_input_embedding=user_input_embedding, top_k=15) 

  # TODO: Rerank the fetched chunks and only keep the top_k chunks
  ranked_fetched_chunks = rerank(user_input=user_input, fetched_chunks=fetched_chunks)[:top_k]
  context = [chunk.chunk for chunk in ranked_fetched_chunks]

  # "A" like "Augmented": Create the augmented prompt
  augmented_prompt = augment(user_input=user_input, context=context)

  # "G" like "Generation": Generate a response
  if not retrieval_only:
    # Generate a response using the augmented prompt
    response = generate_response(prompt=augmented_prompt)
  else:
    response = "No response generated. Only retrieval was requested."

  if verbose:
    print(f"User input: {user_input}")
    print(f"Fetched chunks: {fetched_chunks}")
    print(f"Ranked fetched chunks: {ranked_fetched_chunks}")
    print(f"Augmented prompt: {augmented_prompt}")
    print(f"Generated response: {response}")
  return (response, ranked_fetched_chunks)

In [45]:
# TODO: Evaluate the new system. How does it perform compared to the original RAG system?
evaluation_result = run_evaluation(
    top_k=DEFAULT_K,
    testdata_items=testdata_items,
    rag_function=do_reranked_rag_v2
)
print(f"Mean Hit Rate: {evaluation_result.mean_hit_rate:.2f}")
print(f"Mean Reciprocal Rank: {evaluation_result.mean_reciprocal_rank:.2f}")
print(f"Mean Precision: {evaluation_result.mean_precision:.2f}")

Mean Hit Rate: 0.90
Mean Reciprocal Rank: 0.82
Mean Precision: 0.30
