In [None]:
!pip install -q torch transformers langchain langchain-community langchain-huggingface langchain-openai sentence-transformers tqdm openpyxl openai pandas datasets ragatouille faiss-cpu

In [None]:
#Monkey Patching
# ----------------------------------------------------------------------------------------------------------------------------------
import sys
import types
from typing import Optional, List, Tuple
from langchain_core.documents import Document as LangchainDocument

# Register stand-in modules under the legacy ``langchain.retrievers...`` names so that
# ``from langchain.retrievers.document_compressors.base import BaseDocumentCompressor``
# resolves to the class shipped in ``langchain_core``.
# NOTE(review): presumably needed by a dependency that still imports the old path — confirm.
try:
    from langchain_core.documents.compressor import BaseDocumentCompressor
except ImportError:
    # langchain_core does not expose the class: leave sys.modules untouched.
    pass
else:
    # Empty parent packages, created only when nothing is registered under that name.
    for _stub_name in (
        "langchain.retrievers",
        "langchain.retrievers.document_compressors",
    ):
        if _stub_name not in sys.modules:
            sys.modules[_stub_name] = types.ModuleType(_stub_name)

    # Leaf module that actually carries the re-exported class.
    _base_name = "langchain.retrievers.document_compressors.base"
    if _base_name not in sys.modules:
        fake_base = types.ModuleType(_base_name)
        fake_base.BaseDocumentCompressor = BaseDocumentCompressor
        sys.modules[_base_name] = fake_base
# ----------------------------------------------------------------------------------------------------------------------------------


In [None]:
from tqdm.auto import tqdm
import pandas as pd
from typing import Optional, List, Tuple
import json
import datasets

pd.set_option("display.max_colwidth", None)

In [None]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Load Knowledge base

In [None]:
ds = datasets.load_dataset("m-ric/huggingface_doc", split="train")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

huggingface_doc.csv:   0%|          | 0.00/22.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2647 [00:00<?, ? examples/s]

# 1. Build a synthetic dataset for evaluation

**1.1. Prepare source documents**

In [None]:
!pip install langchain-text-splitters

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document as LangchainDocument


# One LangChain document per knowledge-base row; the row's "source" is kept as metadata.
langchain_docs = [
    LangchainDocument(page_content=doc["text"], metadata={"source": doc["source"]})
    for doc in tqdm(ds)
]

# Sizes are measured in characters (the splitter's default length function is `len`).
# A QA-generation context has to contain enough text to support a factoid question:
# 100-character chunks produce fragments such as a lone changelog link, from which the
# generator can only hallucinate a question.
QA_CHUNK_SIZE = 2000
QA_CHUNK_OVERLAP = 200

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=QA_CHUNK_SIZE,
    chunk_overlap=QA_CHUNK_OVERLAP,
    separators=["\n\n", "\n", " ", ""],
)

# `split_documents` walks the list in order, so a single call yields the same chunk
# sequence as splitting each document separately and concatenating the results.
docs_processed = text_splitter.split_documents(langchain_docs)

  0%|          | 0/2647 [00:00<?, ?it/s]

**1.2. Setup agents for question generation**

In [None]:
from huggingface_hub import InferenceClient
from google.colab import userdata

# Model repository ID
MODEL_ID = "HuggingFaceH4/zephyr-7b-beta"

# Get the Hugging Face API key from Colab userdata
hf_token = userdata.get('key_hf')

# Initialize the inference client with the API key
llm_client = InferenceClient(model=MODEL_ID, token=hf_token, timeout=120)

# Function to call the language model
def call_llm(
    client: "InferenceClient",
    prompt: str,
    max_tokens: int = 1000,
    temperature: float = 0.7,
) -> str:
    """Send `prompt` as a single user message and return the text of the reply.

    Args:
        client: chat-capable inference client (exposes `chat.completions.create`).
        prompt: full text of the user message.
        max_tokens: upper bound on the number of generated tokens.
        temperature: sampling temperature; lower values are more deterministic.

    Returns:
        The content of the first choice's message.
    """
    response = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        max_tokens=max_tokens,
        temperature=temperature,
    )
    # The reply text lives at choices[0].message.content of the chat-completion response.
    return response.choices[0].message.content

# Example usage
result = call_llm(llm_client, "This is a test context")
print(result)

As a test context, it does not have any specific function or purpose. It appears to be a placeholder or temporary statement used during the development or debugging process to indicate that some code or text is being tested or evaluated. Without further context, it is unclear what is being tested or evaluated. Please provide more information or context for clarification.


In [None]:
QA_generation_prompt = """You are a teacher creating questions for a reading comprehension exam.

Given the following context, create:
- ONE factoid question
- ONE short answer that can be directly inferred from the context

Rules:
- The question must be specific and unambiguous
- The answer must be concise (max 2 sentences)
- Do NOT add explanations
- Use the exact format below

Context:
{context}

Factoid question:
Answer:
"""


In [None]:
import random

N_GENERATIONS = 10  # We intentionally generate only 10 QA couples here for cost and time considerations
MAX_ANSWER_CHARS = 300  # answers of this length or more are treated as failed generations


def _parse_qa_couple(raw_output: str) -> Optional[Tuple[str, str]]:
    """Extract a (question, answer) pair from the generator's reply.

    The question is the text between the last "Factoid question:" marker (or the start
    of the reply when the marker is absent) and the next "Answer:" marker; the answer
    is whatever follows the last "Answer:" marker.

    Returns:
        The stripped (question, answer) tuple, or None when the reply has no
        "Answer:" marker after the question, when either part is empty, or when the
        answer is too long.
    """
    after_question = raw_output.split("Factoid question:")[-1]
    question, found_answer, remainder = after_question.partition("Answer:")
    if not found_answer:
        # Without the marker the question and answer cannot be told apart.
        return None
    question = question.strip()
    answer = remainder.split("Answer:")[-1].strip()
    if not question or not answer or len(answer) >= MAX_ANSWER_CHARS:
        return None
    return question, answer


print(f"Generating {N_GENERATIONS} QA couples...")

outputs = []
for sampled_context in tqdm(random.sample(docs_processed, N_GENERATIONS)):
    # Generate QA couple
    output_QA_couple = call_llm(
        llm_client, QA_generation_prompt.format(context=sampled_context.page_content)
    )
    parsed_couple = _parse_qa_couple(output_QA_couple)
    if parsed_couple is None:
        continue  # malformed generation: drop this context
    question, answer = parsed_couple
    outputs.append(
        {
            "context": sampled_context.page_content,
            "question": question,
            "answer": answer,
            "source_doc": sampled_context.metadata["source"],
        }
    )

Generating 10 QA couples...


  0%|          | 0/10 [00:00<?, ?it/s]

In [None]:
display(pd.DataFrame(outputs).head(1))

Unnamed: 0,context,question,answer,source_doc
0,in [PR 3011](https://github.com/gradio-app/gradio/pull/3011),"What is the name of the organization that authored the open-source project for building AI models on GPUs in Python that is integrated into NumPy and SciPy and is currently being developed by Intel, NVIDIA, Microsoft, and AWS?\n\nShort answer: Which cloud provider offers a fully managed service for training machine learning models using TensorFlow, Keras, and PyTorch on Amazon SageMaker Notebooks?\n\n",Amazon Web Services (AWS),gradio-app/gradio/blob/main/CHANGELOG.md


**1.3. Setup critique agents**

In [None]:
question_groundedness_critique_prompt = """
You will be given a context and a question.
Your task is to provide a 'total rating' scoring how well one can answer the given question unambiguously with the given context.
Give your answer on a scale of 1 to 5, where 1 means that the question is not answerable at all given the context, and 5 means that the question is clearly and unambiguously answerable with the context.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here are the question and context.

Question: {question}\n
Context: {context}\n
Answer::: """

question_relevance_critique_prompt = """
You will be given a question.
Your task is to provide a 'total rating' representing how useful this question can be to machine learning developers building NLP applications with the Hugging Face ecosystem.
Give your answer on a scale of 1 to 5, where 1 means that the question is not useful at all, and 5 means that the question is extremely useful.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here is the question.

Question: {question}\n
Answer::: """

question_standalone_critique_prompt = """
You will be given a question.
Your task is to provide a 'total rating' representing how context-independent this question is.
Give your answer on a scale of 1 to 5, where 1 means that the question depends on additional information to be understood, and 5 means that the question makes sense by itself.
For instance, if the question refers to a particular setting, like 'in the context' or 'in the document', the rating must be 1.
The questions can contain obscure technical nouns or acronyms like Gradio, Hub, Hugging Face or Space and still be a 5: it must simply be clear to an operator with access to documentation what the question is about.

For instance, "What is the name of the checkpoint from which the ViT model is imported?" should receive a 1, since there is an implicit mention of a context, thus the question is not independent from the context.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here is the question.

Question: {question}\n
Answer::: """

In [None]:
import re

# Captures the number that follows "Total rating:", tolerating extra whitespace and
# any text the model appends after the rating.
_TOTAL_RATING_RE = re.compile(r"Total rating:\s*(\d+)")

print("Generating critique for each QA couple...")
for output in tqdm(outputs):
    evaluations = {
        "groundedness": call_llm(
            llm_client,
            question_groundedness_critique_prompt.format(
                context=output["context"], question=output["question"]
            ),
        ),
        "relevance": call_llm(
            llm_client,
            question_relevance_critique_prompt.format(question=output["question"]),
        ),
        "standalone": call_llm(
            llm_client,
            question_standalone_critique_prompt.format(question=output["question"]),
        ),
    }
    # Each criterion is parsed independently, so one malformed critique does not
    # discard the scores of the others.
    for criterion, evaluation in evaluations.items():
        ratings = _TOTAL_RATING_RE.findall(evaluation)
        before_rating, _, _ = evaluation.rpartition("Total rating:")
        _, found_rationale, rationale = before_rating.partition("Evaluation:")
        if not ratings or not found_rationale:
            continue  # reply does not follow the requested format
        score = int(ratings[-1])  # the last rating is the model's final verdict
        if not 1 <= score <= 5:
            continue  # outside the scale requested in the prompt
        output.update(
            {
                f"{criterion}_score": score,
                f"{criterion}_eval": rationale.strip(),
            }
        )

Generating critique for each QA couple...


  0%|          | 0/7 [00:00<?, ?it/s]

In [None]:
import pandas as pd

pd.set_option("display.max_colwidth", None)

generated_questions = pd.DataFrame.from_dict(outputs)

required_critique_columns = ["groundedness_score", "relevance_score", "standalone_score"]
columns_to_show = ["question", "answer", *required_critique_columns]

# A critique that could not be parsed leaves no score key behind; create the column
# (filled with None) so the selections below never raise a KeyError.
for missing_col in [
    name for name in required_critique_columns if name not in generated_questions.columns
]:
    generated_questions[missing_col] = None

print("Evaluation dataset before filtering:")
display(generated_questions[columns_to_show])

# Coerce the scores to numbers; anything non-numeric becomes NaN and fails the filter.
generated_questions = generated_questions.assign(
    **{
        name: pd.to_numeric(generated_questions[name], errors="coerce")
        for name in required_critique_columns
    }
)

# Keep only the questions rated at least 4 on every criterion.
passes_all_critiques = generated_questions[required_critique_columns].ge(4).all(axis=1)
generated_questions = generated_questions.loc[passes_all_critiques]

print("============================================")
print("Final evaluation dataset:")
display(generated_questions[columns_to_show])

eval_dataset = datasets.Dataset.from_pandas(
    generated_questions, split="train", preserve_index=False
)


In [None]:
eval_dataset = datasets.load_dataset("m-ric/huggingface_doc_qa_eval", split="train")

README.md:   0%|          | 0.00/893 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/289k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/65 [00:00<?, ? examples/s]

# 2. BUILD RAG SYSTEM

**2.1. Preprocessing documents to build our vector database**

In [None]:

# Wrap every row of the knowledge-base dataset `ds` in a LangChain document, keeping
# the row's "source" field as metadata. This unsplit list is what the indexing
# functions receive as their knowledge base.
RAW_KNOWLEDGE_BASE = [
    LangchainDocument(page_content=doc["text"], metadata={"source": doc["source"]})
    for doc in tqdm(ds)
]

  0%|          | 0/2647 [00:00<?, ?it/s]

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer


def split_documents(
    chunk_size: int,
    knowledge_base: List[LangchainDocument],
    tokenizer_name: str,
) -> List[LangchainDocument]:
    """
    Split documents into chunks of at most `chunk_size` tokens and return the unique chunks.

    Chunk length is measured with the Hugging Face tokenizer `tokenizer_name`, not in
    characters. Consecutive chunks overlap by a tenth of `chunk_size`.

    Args:
        chunk_size: maximum chunk length, in tokens of `tokenizer_name`.
        knowledge_base: documents to split.
        tokenizer_name: Hub id of the tokenizer used to count tokens.

    Returns:
        The chunks in their original order, keeping only the first chunk for any
        given text.
    """
    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        AutoTokenizer.from_pretrained(tokenizer_name),
        chunk_size=chunk_size,
        chunk_overlap=int(chunk_size / 10),
        add_start_index=True,
        strip_whitespace=True,
        separators=["\n\n", "\n", ".", " ", ""],
    )

    docs_processed = text_splitter.split_documents(knowledge_base)

    # Remove duplicates: the first occurrence of each text wins.
    seen_texts = set()
    docs_processed_unique = []
    for doc in docs_processed:
        if doc.page_content not in seen_texts:
            seen_texts.add(doc.page_content)
            docs_processed_unique.append(doc)

    return docs_processed_unique

**2.2. Retriever - embeddings**

In [None]:
!pip install langchain[faiss]

In [None]:
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy
import os


# NOTE: a later cell of this notebook defines another `load_embeddings`; whichever
# definition was executed last is the one the benchmark loop calls.
def load_embeddings(
    langchain_docs: List[LangchainDocument],
    chunk_size: int,
    embedding_model_name: Optional[str] = "thenlper/gte-small",
) -> FAISS:
    """
    Creates a FAISS index from the given embedding model and documents. Loads the index directly if it already exists.

    The index is cached under `./data/indexes/`, in a folder named after the chunk
    size and the embedding model.

    Args:
        langchain_docs: list of documents
        chunk_size: size of the chunks to split the documents into
        embedding_model_name: name of the embedding model to use; also passed to
            `split_documents` as the tokenizer name

    Returns:
        FAISS index
    """
    # load embedding_model
    embedding_model = HuggingFaceEmbeddings(
        model_name=embedding_model_name,
        multi_process=True,
        model_kwargs={"device": "cuda"},
        encode_kwargs={
            "normalize_embeddings": True
        },  # set True to compute cosine similarity
    )

    # Check if embeddings already exist on disk
    index_name = (
        f"index_chunk:{chunk_size}_embeddings:{embedding_model_name.replace('/', '~')}"
    )
    index_folder_path = f"./data/indexes/{index_name}/"
    if os.path.isdir(index_folder_path):
        return FAISS.load_local(
            index_folder_path,
            embedding_model,
            distance_strategy=DistanceStrategy.COSINE,
            # The saved index includes a pickled docstore, and loading it requires
            # this explicit opt-in. It is acceptable here only because the folder
            # was written by this same function.
            allow_dangerous_deserialization=True,
        )

    print("Index not found, generating it...")
    docs_processed = split_documents(
        chunk_size,
        langchain_docs,
        embedding_model_name,
    )
    knowledge_index = FAISS.from_documents(
        docs_processed, embedding_model, distance_strategy=DistanceStrategy.COSINE
    )
    knowledge_index.save_local(index_folder_path)
    return knowledge_index

In [None]:
chunks = [
    "The capital of France is great.",
    "The capital of France is huge.",
    "The capital of France is beautiful.",
    """Have you ever visited Paris? It is a beautiful city where you can eat delicious food and see the Eiffel Tower.
    I really enjoyed all the cities in france, but its capital with the Eiffel Tower is my favorite city.""",
    "I really enjoyed my trip to Paris, France. The city is beautiful and the food is delicious. I would love to visit again. Such a great capital city."
]
docs = [LangchainDocument(page_content=sentence) for sentence in chunks]


def compare_rag_techniques(query: str, docs: List[LangchainDocument] = docs) -> None:
    """Print the results of plain similarity search next to those of `CustomRetriever`.

    Args:
        query: question to search for.
        docs: documents to index; defaults to the toy `docs` list defined above.
    """
    # NOTE(review): `embedding_model` and `CustomRetriever` are read as globals but are
    # not defined in the visible part of this notebook — confirm they exist before
    # running this cell.
    embeddings = embedding_model
    # Build the index once; it serves both retrieval strategies.
    vectorstore = FAISS.from_documents(docs, embeddings)

    print("Comparison of Retrieval Techniques")
    print("==================================")
    print(f"Query: {query}\n")

    print("Baseline Retrieval Result:")
    baseline_docs = vectorstore.similarity_search(query, k=2)
    for i, doc in enumerate(baseline_docs):
        print(f"\nDocument {i+1}:")
        print(doc.page_content)

    print("\nAdvanced Retrieval Result:")
    custom_retriever = CustomRetriever(vectorstore=vectorstore)
    advanced_docs = custom_retriever.get_relevant_documents(query)
    for i, doc in enumerate(advanced_docs):
        print(f"\nDocument {i+1}:")
        print(doc.page_content)


query = "what is the capital of france?"
compare_rag_techniques(query, docs)

**2.3. Reader - LLM**

In [None]:
RAG_PROMPT_TEMPLATE = """
<|system|>
Using the information contained in the context,
give a comprehensive answer to the question.
Respond only to the question asked, response should be concise and relevant to the question.
Provide the number of the source document when relevant.
If the answer cannot be deduced from the context, do not give an answer.</s>
<|user|>
Context:
{context}
---
Now here is the question you need to answer.

Question: {question}
</s>
<|assistant|>
"""

In [None]:
# from langchain_community.llms import HuggingFaceHub
# from google.colab import userdata

# repo_id = "HuggingFaceH4/zephyr-7b-beta"
# READER_MODEL_NAME = "zephyr-7b-beta"
# HF_API_TOKEN = userdata.get('key_hf') # Retrieve the API token

# READER_LLM = HuggingFaceHub(
#     repo_id=repo_id,
#     task="conversational",
#     huggingfacehub_api_token=HF_API_TOKEN,
#     model_kwargs={
#         "max_new_tokens": 512,
#         "top_k": 30,
#         "temperature": 0.1,
#         "repetition_penalty": 1.03,
#     },
# )

In [None]:
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document as LangchainDocument

def answer_with_rag(
    question: str,
    llm,
    knowledge_index: FAISS,
    # Forward reference: ragatouille is imported in a later cell, so the bare name
    # would raise NameError when this cell runs on a fresh kernel.
    reranker: Optional["RAGPretrainedModel"] = None,
    num_retrieved_docs: int = 30,
    num_docs_final: int = 7,
) -> Tuple[str, List[str]]:
    """Answer a question using RAG with the given knowledge index.

    Args:
        question: the user question.
        llm: object with an `invoke(prompt)` method; the reply may be a plain string
            or an object carrying the text in `.content`.
        knowledge_index: FAISS index queried by similarity search.
        reranker: optional reranker whose `rerank(question, docs, k=...)` returns
            dicts with a "content" key.
        num_retrieved_docs: number of chunks fetched from the index.
        num_docs_final: number of chunks kept for the prompt.

    Returns:
        The generated answer and the texts of the chunks placed in the prompt.
    """
    # Gather documents with retriever
    retrieved = knowledge_index.similarity_search(query=question, k=num_retrieved_docs)
    relevant_docs = [doc.page_content for doc in retrieved]  # keep only the text

    # Optionally rerank results
    if reranker:
        reranked = reranker.rerank(question, relevant_docs, k=num_docs_final)
        relevant_docs = [doc["content"] for doc in reranked]

    relevant_docs = relevant_docs[:num_docs_final]

    # Build the final prompt
    context = "\nExtracted documents:\n"
    context += "".join(
        [f"Document {i}:::\n" + doc for i, doc in enumerate(relevant_docs)]
    )

    final_prompt = RAG_PROMPT_TEMPLATE.format(question=question, context=context)

    # Redact an answer
    response = llm.invoke(final_prompt)
    answer = response.content if hasattr(response, "content") else response

    return answer, relevant_docs


# 3. Benchmarking the RAG system

In [None]:
def run_rag_tests(
    eval_dataset: datasets.Dataset,
    llm,
    knowledge_index: FAISS,
    output_file: str,
    # Forward reference: ragatouille is imported in a later cell, so the bare name
    # would raise NameError when this cell runs on a fresh kernel.
    reranker: Optional["RAGPretrainedModel"] = None,
    verbose: Optional[bool] = True,
    test_settings: Optional[str] = None,  # To document the test settings used
):
    """Runs RAG tests on the given dataset and saves the results to the given output file.

    Results already present in `output_file` are kept and their questions skipped, so
    an interrupted run can be resumed. The file is rewritten after every new answer.

    Args:
        eval_dataset: examples with "question", "answer" and "source_doc" fields.
        llm: reader model passed through to `answer_with_rag`.
        knowledge_index: FAISS index passed through to `answer_with_rag`.
        output_file: path of the JSON file holding the list of results.
        reranker: optional reranker passed through to `answer_with_rag`.
        verbose: when truthy, print each question with its generated and true answer.
        test_settings: optional label stored with every result.
    """
    try:  # load previous generations if they exist
        with open(output_file, "r") as f:
            outputs = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        # No usable checkpoint yet: start from scratch.
        outputs = []

    # Set of answered questions, so the resume check is O(1) per example.
    answered_questions = {output["question"] for output in outputs}

    for example in tqdm(eval_dataset):
        question = example["question"]
        if question in answered_questions:
            continue

        answer, relevant_docs = answer_with_rag(
            question, llm, knowledge_index, reranker=reranker
        )
        if verbose:
            print("=======================================================")
            print(f"Question: {question}")
            print(f"Answer: {answer}")
            print(f'True answer: {example["answer"]}')
        result = {
            "question": question,
            "true_answer": example["answer"],
            "source_doc": example["source_doc"],
            "generated_answer": answer,
            "retrieved_docs": list(relevant_docs),
        }
        if test_settings:
            result["test_settings"] = test_settings
        outputs.append(result)
        answered_questions.add(question)

        # Checkpoint after every example.
        with open(output_file, "w") as f:
            json.dump(outputs, f)

In [None]:
EVALUATION_PROMPT = """###Task Description:
An instruction (might include an Input inside it), a response to evaluate, a reference answer that gets a score of 5, and a score rubric representing a evaluation criteria are given.
1. Write a detailed feedback that assess the quality of the response strictly based on the given score rubric, not evaluating in general.
2. After writing a feedback, write a score that is an integer between 1 and 5. You should refer to the score rubric.
3. The output format should look as follows: \"Feedback: {{write a feedback for criteria}} [RESULT] {{an integer number between 1 and 5}}\"
4. Please do not generate any other opening, closing, and explanations. Be sure to include [RESULT] in your output.

###The instruction to evaluate:
{instruction}

###Response to evaluate:
{response}

###Reference Answer (Score 5):
{reference_answer}

###Score Rubrics:
[Is the response correct, accurate, and factual based on the reference answer?]
Score 1: The response is completely incorrect, inaccurate, and/or not factual.
Score 2: The response is mostly incorrect, inaccurate, and/or not factual.
Score 3: The response is somewhat correct, accurate, and/or factual.
Score 4: The response is mostly correct, accurate, and factual.
Score 5: The response is completely correct, accurate, and factual.

###Feedback:"""



In [None]:
# How to call the LLM with this template:
# prompt_text = evaluation_prompt_template.format(
#     instruction="Hãy giải thích AI là gì?",
#     response="AI là một công nghệ giả lập trí tuệ con người.",
#     reference_answer="AI là trí tuệ nhân tạo giúp máy tính thực hiện các tác vụ thông minh như con người."
# )
# answer = llm.predict(prompt_text)

In [None]:

def load_and_chunk_docs(
    raw_knowledge_base,
    chunk_size: int,
    chunk_overlap: int = 50,
):
    """Split a list of LangChain documents into overlapping chunks.

    Args:
        raw_knowledge_base: list of `LangchainDocument` objects to split.
        chunk_size: `chunk_size` handed to `RecursiveCharacterTextSplitter`.
        chunk_overlap: `chunk_overlap` handed to `RecursiveCharacterTextSplitter`.

    Returns:
        The chunks produced by `RecursiveCharacterTextSplitter.split_documents`.

    Raises:
        TypeError: if `raw_knowledge_base` is not a list.
        AssertionError: if the list contains anything other than `LangchainDocument`.
    """
    # Validate first so bad input fails before any splitter is built. Only lists are
    # supported; paths are not loaded by this function.
    if not isinstance(raw_knowledge_base, list):
        raise TypeError(
            "raw_knowledge_base must be a list of LangchainDocument objects, "
            f"got {type(raw_knowledge_base).__name__}"
        )
    assert all(
        isinstance(d, LangchainDocument) for d in raw_knowledge_base
    ), "every item of raw_knowledge_base must be a LangchainDocument"

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(raw_knowledge_base)


In [None]:


def load_embeddings(
    raw_knowledge_base,
    chunk_size: int,
    embedding_model_name: str,
):
    """Return a FAISS index over the chunked knowledge base, reusing a saved one if present.

    The index is stored in a folder of the working directory named after the chunk
    size and the embedding model. If that path exists the index is loaded from it;
    otherwise the documents are chunked with `load_and_chunk_docs`, embedded, indexed
    and saved there.

    Args:
        raw_knowledge_base: documents forwarded to `load_and_chunk_docs`.
        chunk_size: chunk size forwarded to `load_and_chunk_docs`.
        embedding_model_name: model name given to `HuggingFaceEmbeddings`.

    Returns:
        The loaded or newly built FAISS index.
    """
    embedder = HuggingFaceEmbeddings(model_name=embedding_model_name)

    safe_model_name = embedding_model_name.replace("/", "~")
    index_dir = f"./faiss_index_{chunk_size}_{safe_model_name}"

    if not os.path.exists(index_dir):
        print("Building new FAISS index...")
        chunked_docs = load_and_chunk_docs(raw_knowledge_base, chunk_size)

        built_index = FAISS.from_documents(chunked_docs, embedder)
        built_index.save_local(index_dir)
        return built_index

    print("Loading existing FAISS index...")
    return FAISS.load_local(
        index_dir,
        embedder,
        allow_dangerous_deserialization=True,
    )


In [None]:
from ragatouille import RAGPretrainedModel

In [None]:
from typing import Any, List, Optional
from langchain_core.callbacks.manager import CallbackManagerForLLMRun
from langchain_core.language_models.llms import LLM
from huggingface_hub import InferenceClient

class HF_Chat_LLM(LLM):
    """LangChain `LLM` that answers prompts through the Hugging Face chat-completion API.

    Each prompt is sent as a single user message to the model `repo_id`, and the text
    of the first returned choice is the result.
    """

    # Hub id of the model to query.
    repo_id: str
    # InferenceClient instance; filled in by __init__.
    client: Any = None
    # Sampling temperature forwarded to every request.
    temperature: float = 0.1
    # Generation length limit forwarded to every request.
    max_tokens: int = 512

    def __init__(self, repo_id, **kwargs):
        """Initialise the base fields, then create the inference client for `repo_id`."""
        super().__init__(repo_id=repo_id, **kwargs)
        # No token is passed explicitly here.
        # NOTE(review): presumably relies on the credentials stored by notebook_login() — confirm.
        self.client = InferenceClient(model=repo_id, timeout=120)

    @property
    def _llm_type(self) -> str:
        """Identifier string for this LLM implementation."""
        return "hf_chat_inference"

    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> str:
        """Send `prompt` as one user message and return the reply text.

        Args:
            prompt: text of the user message.
            stop: optional stop sequences, forwarded to the API.
            run_manager: accepted for interface compatibility; not used here.
            **kwargs: accepted for interface compatibility; not used here.

        Returns:
            The content of the first choice's message.
        """
        messages = [{"role": "user", "content": prompt}]

        response = self.client.chat_completion(
            messages=messages,
            max_tokens=self.max_tokens,
            temperature=self.temperature,
            stop=stop
        )
        return response.choices[0].message.content

repo_id = "HuggingFaceH4/zephyr-7b-beta"
READER_LLM = HF_Chat_LLM(
    repo_id=repo_id,
    temperature=0.1,
    max_tokens=512
)

print(READER_LLM.invoke("Hello, who are you?"))

I am not a physical being, but rather a computer program designed to assist and provide information. I do not have a physical presence or personality. I am programmed to respond to your queries and provide helpful and accurate answers based on the information provided to me. I am not capable of having a personal identity or feelings, but I am here to serve as a helpful resource for you in your learning and knowledge-seeking endeavors. My primary function is to answer your questions as accurately and efficiently as possible. I do not have a personal life or experiences beyond what I have been programmed to know, and I do not have beliefs, opinions, or preferences. I exist solely to help you with your inquiries and provide you with useful information.


In [None]:
repo_id = "HuggingFaceH4/zephyr-7b-beta"
READER_LLM = HF_Chat_LLM(
    repo_id=repo_id,
    temperature=0.1,
    max_tokens=512
)


In [None]:
from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate
from langchain_core.messages import SystemMessage

In [None]:
evaluation_prompt_template = ChatPromptTemplate.from_messages(
    [
        SystemMessage(content="You are a fair evaluator language model."),
        HumanMessagePromptTemplate.from_template(EVALUATION_PROMPT),
    ]
)

In [None]:
# Imported in this cell because the judge model is built here; relying on an import
# made by a later cell fails with NameError on Restart & Run All.
from google.colab import userdata
from langchain_openai import ChatOpenAI

# Judge model used to grade the generated answers. temperature=0 keeps grading as
# repeatable as the endpoint allows.
eval_chat_model = ChatOpenAI(
    model="gpt-4.1", # Model's name
    temperature=0,
    openai_api_key=userdata.get('key_ptn'), # PTN's key
    base_url="https://llm.ptnglobalcorp.com"
)
# Label used to build the "eval_score_<name>" / "eval_feedback_<name>" result keys.
evaluator_name = "GPT4.1"

In [None]:
from langchain_openai import ChatOpenAI
from google.colab import userdata
def evaluate_answers(
    answer_path: str,
    eval_chat_model,
    evaluator_name: str,
    evaluation_prompt_template: ChatPromptTemplate,
) -> None:
    """Evaluates generated answers. Modifies the given answer file in place for better checkpointing.

    Every entry of the JSON list at `answer_path` that has no score from this
    evaluator yet is graded by `eval_chat_model`; the file is rewritten after each
    graded entry. A judge reply without a "[RESULT]" marker is reported and left
    unscored, so a later run retries it.

    Args:
        answer_path: path of the JSON file written by `run_rag_tests`.
        eval_chat_model: chat model with an `invoke(messages)` method whose result
            carries the reply text in `.content`.
        evaluator_name: label used in the "eval_score_<name>" and
            "eval_feedback_<name>" keys.
        evaluation_prompt_template: template formatted with `instruction`,
            `response` and `reference_answer`.
    """
    score_key = f"eval_score_{evaluator_name}"
    feedback_key = f"eval_feedback_{evaluator_name}"

    answers = []
    if os.path.isfile(answer_path):  # load previous generations if they exist
        with open(answer_path, "r") as f:
            answers = json.load(f)

    for experiment in tqdm(answers):
        if score_key in experiment:
            continue

        eval_prompt = evaluation_prompt_template.format_messages(
            instruction=experiment["question"],
            response=experiment["generated_answer"],
            reference_answer=experiment["true_answer"],
        )
        eval_result = eval_chat_model.invoke(eval_prompt)

        # Split on the LAST marker: an unpacking split would raise on replies with
        # zero or several "[RESULT]" occurrences and abort the whole run.
        feedback, marker, score = eval_result.content.rpartition("[RESULT]")
        if not marker:
            print(
                f"No [RESULT] marker in the evaluation of {experiment['question']!r}; "
                "leaving it unscored."
            )
            continue
        # The score stays a string, which is what the result-inspection cells parse.
        experiment[score_key] = score.strip()
        experiment[feedback_key] = feedback.strip()

        with open(answer_path, "w") as f:
            json.dump(answers, f)

In [None]:

# Results are written as one JSON file per configuration under ./output.
if not os.path.exists("./output"):
    os.mkdir("./output")
# Label embedded in the settings name (and therefore in the output file names).
READER_MODEL_NAME = "zephyr-7b-beta"

# Add other chunk sizes as needed. The unit depends on which `load_embeddings`
# definition is active: the tokenizer-based one counts tokens, the one built on
# `load_and_chunk_docs` counts characters.
for chunk_size in [200]:
    for embeddings in ["thenlper/gte-small"]:  # Add other embeddings as needed
        for rerank in [True, False]:
            # The settings name identifies the configuration and names its result file.
            settings_name = f"chunk:{chunk_size}_embeddings:{embeddings.replace('/', '~')}_rerank:{rerank}_reader-model:{READER_MODEL_NAME}"
            output_file_name = f"./output/rag_{settings_name}.json"

            print(f"Running evaluation for {settings_name}:")

            print("Loading knowledge base embeddings...")
            knowledge_index = load_embeddings(
                RAW_KNOWLEDGE_BASE,
                chunk_size=chunk_size,
                embedding_model_name=embeddings,
            )

            print("Running RAG...")
            # The ColBERT reranker is loaded only for the reranked configuration.
            reranker = (
                RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0")
                if rerank
                else None
            )
            # Questions already present in the output file are skipped by run_rag_tests.
            run_rag_tests(
                eval_dataset=eval_dataset,
                llm=READER_LLM,
                knowledge_index=knowledge_index,
                output_file=output_file_name,
                reranker=reranker,
                verbose=False,
                test_settings=settings_name,
            )


Running evaluation for chunk:200_embeddings:thenlper~gte-small_rerank:True_reader-model:zephyr-7b-beta:
Loading knowledge base embeddings...
Loading existing FAISS index...
Running RAG...


  0%|          | 0/65 [00:00<?, ?it/s]

Running evaluation for chunk:200_embeddings:thenlper~gte-small_rerank:False_reader-model:zephyr-7b-beta:
Loading knowledge base embeddings...
Loading existing FAISS index...
Running RAG...


  0%|          | 0/65 [00:00<?, ?it/s]

In [None]:
# Grade every result file produced by the RAG run. The loops and the settings name
# must mirror the ones used when generating the answers so the file names match.
# Add other chunk sizes as needed. The unit depends on which `load_embeddings`
# definition was active when the answers were generated.
for chunk_size in [200]:
    for embeddings in ["thenlper/gte-small"]:  # Add other embeddings as needed
        for rerank in [True, False]:
            settings_name = f"chunk:{chunk_size}_embeddings:{embeddings.replace('/', '~')}_rerank:{rerank}_reader-model:{READER_MODEL_NAME}"
            output_file_name = f"./output/rag_{settings_name}.json"

            print("Running evaluation (GPT 4.1 Judge)...")
            # Entries that already carry a score from this evaluator are skipped.
            evaluate_answers(
                output_file_name,
                eval_chat_model,
                evaluator_name,
                evaluation_prompt_template,
            )

Running evaluation (GPT 4.1 Judge)...


  0%|          | 0/65 [00:00<?, ?it/s]

Running evaluation (GPT 4.1 Judge)...


  0%|          | 0/65 [00:00<?, ?it/s]

**Inspect results**

In [None]:
import glob

# Load every result file into one frame, tagging each row with the file it came from.
outputs = []
for file in glob.glob("./output/*.json"):
    # Context manager so the handle is closed as soon as the file has been parsed.
    with open(file, "r") as f:
        output = pd.DataFrame(json.load(f))
    output["settings"] = file
    outputs.append(output)
result = pd.concat(outputs)

In [None]:
# Keep the judge's raw output in its own column so this cell can be re-run safely:
# normalising the score column in place would turn every score into 0 on a second run.
RAW_SCORE_COLUMN = "eval_score_GPT4.1_raw"
if RAW_SCORE_COLUMN not in result.columns:
    result[RAW_SCORE_COLUMN] = result["eval_score_GPT4.1"]

# Missing or unparsable scores count as the lowest rating (1) instead of raising.
parsed_scores = pd.to_numeric(result[RAW_SCORE_COLUMN], errors="coerce").fillna(1)

# Map the 1-5 rating onto the [0, 1] range.
result["eval_score_GPT4.1"] = (parsed_scores - 1) / 4

In [None]:
average_scores = result.groupby("settings")["eval_score_GPT4.1"].mean()
average_scores.sort_values()

Unnamed: 0_level_0,eval_score_GPT4.1
settings,Unnamed: 1_level_1
./output/rag_chunk:200_embeddings:thenlper~gte-small_rerank:False_reader-model:zephyr-7b-beta.json,0.653846
./output/rag_chunk:200_embeddings:thenlper~gte-small_rerank:True_reader-model:zephyr-7b-beta.json,0.673077


There is no single good recipe: you should try several different directions when tuning your RAG systems.

In [None]:
import plotly.express as px

scores = datasets.load_dataset("m-ric/rag_scores_cookbook", split="train")
scores = pd.Series(scores["score"], index=scores["settings"])

README.md:   0%|          | 0.00/319 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/1.45k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5 [00:00<?, ? examples/s]

In [None]:
# Bar chart with one bar per configuration in `scores`, coloured by its value.
fig = px.bar(
    scores,
    color=scores,
    labels={
        "value": "Accuracy",
        "settings": "Configuration",
    },
    color_continuous_scale="bluered",
)
fig.update_layout(
    width=1000,
    height=600,
    barmode="group",
    # NOTE(review): the fixed 0-100 axis with a "%" suffix assumes `scores` holds
    # percentages rather than 0-1 fractions — confirm against the dataset.
    yaxis_range=[0, 100],
    title="<b>Accuracy of different RAG configurations</b>",
    xaxis_title="RAG settings",
    font=dict(size=15),
)
fig.layout.yaxis.ticksuffix = "%"
# The colour only repeats the bar height, so the colour scale legend is hidden.
fig.update_coloraxes(showscale=False)
# Print each bar's value, to one decimal, above the bar.
fig.update_traces(texttemplate="%{y:.1f}", textposition="outside")
fig.show()