In [38]:
!pip install -q torch transformers langchain langchain-community langchain-huggingface langchain-openai sentence-transformers tqdm openpyxl openai pandas datasets ragatouille faiss-cpu

In [39]:
# %reload_ext autoreload
# %autoreload 2

from tqdm.auto import tqdm
import pandas as pd
from typing import Optional, List, Tuple
import json
import datasets
import os

pd.set_option("display.max_colwidth", None)

In [43]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [44]:
ds = datasets.load_dataset("m-ric/huggingface_doc", split="train")

In [45]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document as LangchainDocument

langchain_docs = [
    LangchainDocument(page_content=doc["text"], metadata={"source": doc["source"]})
    for doc in tqdm(ds)
]

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=200,
    add_start_index=True,
    separators=["\n\n", "\n", ".", " ", ""],
)

docs_processed = []
for doc in langchain_docs:
    docs_processed += text_splitter.split_documents([doc])

  0%|          | 0/2647 [00:00<?, ?it/s]

In [46]:
from huggingface_hub import InferenceClient

repo_id = "mistralai/Mistral-7B-Instruct-v0.2"

llm_client = InferenceClient(
    model=repo_id,
    timeout=120,
)

def call_llm(inference_client: InferenceClient, prompt: str):
    messages = [
        {"role": "user", "content": prompt}
    ]

    response = inference_client.chat_completion(
        messages=messages,
        max_tokens=1000,
    )

    return response.choices[0].message.content


print(call_llm(llm_client, "This is a test context"))

 I'd be happy to help answer any questions you might have or assist you with this test context to the best of my abilities. Could you please provide some more information about what you mean by "test context"? Are you referring to a specific testing framework or environment, or do you have a particular problem or scenario in mind that you'd like assistance with? I'll do my best to provide you with accurate and helpful information. Let me know if you have any questions or if there's anything I can do to help.


In [47]:
QA_generation_prompt = """
Your task is to write a factoid question and an answer given a context.
Your factoid question should be answerable with a specific, concise piece of factual information from the context.
Your factoid question should be formulated in the same style as questions users could ask in a search engine.
This means that your factoid question MUST NOT mention something like "according to the passage" or "context".

Provide your answer as follows:

Output:::
Factoid question: (your factoid question)
Answer: (your answer to the factoid question)

Now here is the context.

Context: {context}\n
Output:::"""

In [48]:
import random

N_GENERATIONS = 10  # We intentionally generate only 10 QA couples here for cost and time considerations

print(f"Generating {N_GENERATIONS} QA couples...")

outputs = []
for sampled_context in tqdm(random.sample(docs_processed, N_GENERATIONS)):
    # Generate QA couple
    output_QA_couple = call_llm(
        llm_client, QA_generation_prompt.format(context=sampled_context.page_content)
    )
    try:
        question = output_QA_couple.split("Factoid question: ")[-1].split("Answer: ")[0]
        answer = output_QA_couple.split("Answer: ")[-1]
        assert len(answer) < 300, "Answer is too long"
        outputs.append(
            {
                "context": sampled_context.page_content,
                "question": question,
                "answer": answer,
                "source_doc": sampled_context.metadata["source"],
            }
        )
    except:
        continue

Generating 10 QA couples...


  0%|          | 0/10 [00:00<?, ?it/s]

In [49]:
display(pd.DataFrame(outputs).head(1))

Unnamed: 0,context,question,answer,source_doc
0,"`p_hist`: a discrete distribution, which is a quantized version of the text distribution `p_text`.\n \n`q_hist`: same as above, but with `q_text`.\n\n\n### Values from popular papers\n\nThe [original MAUVE paper](https://arxiv.org/abs/2102.01454) reported values ranging from 0.88 to 0.94 for open-ended text generation using a text completion task in the web text domain. The authors found that bigger models resulted in higher MAUVE scores and that MAUVE is correlated with human judgments.\n\n\n## Examples\n\nPerfect match between prediction and reference:\n\n```python\nfrom evaluate import load\nmauve = load('mauve')\npredictions = [""hello world"", ""goodnight moon""]\nreferences = [""hello world"", ""goodnight moon""]\nmauve_results = mauve.compute(predictions=predictions, references=references)\nprint(mauve_results.mauve)\n1.0\n```\n\nPartial match between prediction and reference:\n\n```python\nfrom evaluate import load\nmauve = load('mauve')\npredictions = [""hello world"", ""goodnight moon""]\nreferences = [""hello there"", ""general kenobi""]\nmauve_results = mauve.compute(predictions=predictions, references=references)\nprint(mauve_results.mauve)\n0.27811372536724027\n```\n\n## Limitations and bias\n\nThe [original MAUVE paper](https://arxiv.org/abs/2102.01454) did not analyze the inductive biases present in different embedding models, but related work has shown different kinds of biases exist in many popular generative language models including GPT-2 (see [Kirk et al., 2021](https://arxiv.org/pdf/2102.04130.pdf), [Abid et al., 2021](https://arxiv.org/abs/2101.05783)). The extent to which these biases can impact the MAUVE score has not been quantified.\n\nAlso, calculating the MAUVE metric involves downloading the model from which features are obtained -- the default model, `gpt2-large`, takes over 3GB of storage space and downloading it can take a significant amount of time depending on the speed of your internet connection. If this is an issue, choose a smaller model; for instance, `gpt` is 523MB.","what is the size of the default MAUVE model, `gpt2-large`?\n","The default MAUVE model, `gpt2-large`, takes over 3GB of storage space.",huggingface/evaluate/blob/main/metrics/mauve/README.md


In [50]:
question_groundedness_critique_prompt = """
You will be given a context and a question.
Your task is to provide a 'total rating' scoring how well one can answer the given question unambiguously with the given context.
Give your answer on a scale of 1 to 5, where 1 means that the question is not answerable at all given the context, and 5 means that the question is clearly and unambiguously answerable with the context.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here are the question and context.

Question: {question}\n
Context: {context}\n
Answer::: """

question_relevance_critique_prompt = """
You will be given a question.
Your task is to provide a 'total rating' representing how useful this question can be to machine learning developers building NLP applications with the Hugging Face ecosystem.
Give your answer on a scale of 1 to 5, where 1 means that the question is not useful at all, and 5 means that the question is extremely useful.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here is the question.

Question: {question}\n
Answer::: """

question_standalone_critique_prompt = """
You will be given a question.
Your task is to provide a 'total rating' representing how context-independent this question is.
Give your answer on a scale of 1 to 5, where 1 means that the question depends on additional information to be understood, and 5 means that the question makes sense by itself.
For instance, if the question refers to a particular setting, like 'in the context' or 'in the document', the rating must be 1.
The questions can contain obscure technical nouns or acronyms like Gradio, Hub, Hugging Face or Space and still be a 5: it must simply be clear to an operator with access to documentation what the question is about.

For instance, "What is the name of the checkpoint from which the ViT model is imported?" should receive a 1, since there is an implicit mention of a context, thus the question is not independent from the context.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here is the question.

Question: {question}\n
Answer::: """

In [51]:
import re

print("Generating critique for each QA couple...")

for output in tqdm(outputs):
    critique_inputs = {
        "groundedness": question_groundedness_critique_prompt.format(
            context=output["context"], question=output["question"]
        ),
        "relevance": question_relevance_critique_prompt.format(question=output["question"]),
        "standalone": question_standalone_critique_prompt.format(question=output["question"]),
    }

    for criterion, prompt in critique_inputs.items():
        try:
            evaluation = call_llm(llm_client, prompt)

            score_match = re.search(r"Total rating:\s*(\d+)", evaluation, re.IGNORECASE)

            eval_match = re.search(r"Evaluation:\s*(.+?)(?=\nTotal rating:|$)", evaluation, re.IGNORECASE | re.DOTALL)

            if score_match:
                score = int(score_match.group(1))
            else:
                score = None

            if eval_match:
                explanation = eval_match.group(1).strip()
            else:
                explanation = evaluation

            output.update({
                f"{criterion}_score": score,
                f"{criterion}_eval": explanation,
            })

        except Exception as e:
            print(f"Error critiquing {criterion}: {e}")
            output.update({
                f"{criterion}_score": None,
                f"{criterion}_eval": "Error",
            })
if outputs:
    print("Keys in first item:", outputs[0].keys())

Generating critique for each QA couple...


  0%|          | 0/10 [00:00<?, ?it/s]

Keys in first item: dict_keys(['context', 'question', 'answer', 'source_doc', 'groundedness_score', 'groundedness_eval', 'relevance_score', 'relevance_eval', 'standalone_score', 'standalone_eval'])


In [52]:
import pandas as pd

pd.set_option("display.max_colwidth", None)

generated_questions = pd.DataFrame.from_dict(outputs)

print("Evaluation dataset before filtering:")
display(
    generated_questions[
        [
            "question",
            "answer",
            "groundedness_score",
            "relevance_score",
            "standalone_score",
        ]
    ]
)
generated_questions = generated_questions.loc[
    (generated_questions["groundedness_score"] >= 4)
    & (generated_questions["relevance_score"] >= 4)
    & (generated_questions["standalone_score"] >= 4)
]
print("============================================")
print("Final evaluation dataset:")
display(
    generated_questions[
        [
            "question",
            "answer",
            "groundedness_score",
            "relevance_score",
            "standalone_score",
        ]
    ]
)

eval_dataset = datasets.Dataset.from_pandas(
    generated_questions, split="train", preserve_index=False
)

Evaluation dataset before filtering:


Unnamed: 0,question,answer,groundedness_score,relevance_score,standalone_score
0,"what is the size of the default MAUVE model, `gpt2-large`?\n","The default MAUVE model, `gpt2-large`, takes over 3GB of storage space.",1,4,5
1,What is the recommended value for `--snr_gamma` when using Min-SNR weighting strategy for training?\n,The recommended value for `--snr_gamma` when using Min-SNR weighting strategy for training is 5.0.,5,4,5
2,What are the tokenizer files used by the EsperantoDataset class in the provided Python code?\n,"The EsperantoDataset class in the provided Python code uses the following tokenizer files for tokenization: ""./models/EsperBERTo-small/vocab.json"" and ""./models/EsperBERTo-small/merges.txt"".",1,4,3
3,How many digits should numerical values have in doctests for exact matching?\n,Numerical values in doctests should have no more than 4 or 5 digits for exact matching.,5,1,4
4,What is the output directory for the Norwegian T5-base model training?\n,./norwegian-t5-base,3,5,1
5,How many possible states are there in the game environment?\n,There are state\_space possible states. (from the first line of the context),5,1,5
6,What packages should be installed to reproduce the original tokenization process of OpenAI GPT mentioned in the note?\n,`ftfy` and `SpaCy` packages need to be installed.,5,5,4
7,What is the default value for `max_new_tokens` in controlling text generation with the SDXL endpoint?\n,The default value for `max_new_tokens` is 20.,5,4,3
8,Which parameters does GPU0 have access to when processing the mini-batch x0 in Zero-DP?\n,GPU0 has access to parameter a0 in Zero-DP.,5,5,1
9,Which GitHub user contributed to the update of the chatbot conversation nodes to include a copy button?\n,@fazpu,5,1,1


Final evaluation dataset:


Unnamed: 0,question,answer,groundedness_score,relevance_score,standalone_score
1,What is the recommended value for `--snr_gamma` when using Min-SNR weighting strategy for training?\n,The recommended value for `--snr_gamma` when using Min-SNR weighting strategy for training is 5.0.,5,4,5
6,What packages should be installed to reproduce the original tokenization process of OpenAI GPT mentioned in the note?\n,`ftfy` and `SpaCy` packages need to be installed.,5,5,4


In [53]:
eval_dataset = datasets.load_dataset("m-ric/huggingface_doc_qa_eval", split="train")

In [54]:
from langchain_core.documents import Document as LangchainDocument

RAW_KNOWLEDGE_BASE = [
    LangchainDocument(page_content=doc["text"], metadata={"source": doc["source"]})
    for doc in tqdm(ds)
]

  0%|          | 0/2647 [00:00<?, ?it/s]

In [55]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer

def split_documents(
    chunk_size: int,
    knowledge_base: List[LangchainDocument],
    tokenizer_name: str,
) -> List[LangchainDocument]:
    """
    Split documents into chunks of size `chunk_size` characters and return a list of documents.
    """
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        tokenizer,
        chunk_size=chunk_size,
        chunk_overlap=int(chunk_size / 10),
        add_start_index=True,
        strip_whitespace=True,
        separators=["\n\n", "\n", ".", " ", ""],
    )

    docs_processed = []
    for doc in knowledge_base:
        docs_processed += text_splitter.split_documents([doc])

    # Remove duplicates
    unique_texts = {}
    docs_processed_unique = []
    for doc in docs_processed:
        if doc.page_content not in unique_texts:
            unique_texts[doc.page_content] = True
            docs_processed_unique.append(doc)

    return docs_processed_unique

In [56]:
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy
import os

def load_embeddings(
    langchain_docs: List[LangchainDocument],
    chunk_size: int,
    embedding_model_name: Optional[str] = "thenlper/gte-small",
) -> FAISS:
    # load embedding_model
    embedding_model = HuggingFaceEmbeddings(
        model_name=embedding_model_name,
        multi_process=True,
        model_kwargs={"device": "cuda"},
        encode_kwargs={
            "normalize_embeddings": True
        },
    )

    # Check if embeddings already exist on disk
    index_name = (
        f"index_chunk:{chunk_size}_embeddings:{embedding_model_name.replace('/', '~')}"
    )
    index_folder_path = f"./data/indexes/{index_name}/"
    if os.path.isdir(index_folder_path):
        return FAISS.load_local(
            index_folder_path,
            embedding_model,
            distance_strategy=DistanceStrategy.COSINE,
            allow_dangerous_deserialization=True
        )

    else:
        print("Index not found, generating it...")
        docs_processed = split_documents(
            chunk_size,
            langchain_docs,
            embedding_model_name,
        )
        knowledge_index = FAISS.from_documents(
            docs_processed, embedding_model, distance_strategy=DistanceStrategy.COSINE
        )
        knowledge_index.save_local(index_folder_path)
        return knowledge_index

In [57]:
RAG_PROMPT_TEMPLATE = """
Using the information contained in the context,
give a comprehensive answer to the question.
Respond only to the question asked, response should be concise and relevant to the question.
Provide the number of the source document when relevant.
If the answer cannot be deduced from the context, do not give an answer.

Context:
{context}
---
Now here is the question you need to answer.

Question: {question}
"""

In [58]:
from langchain_huggingface import HuggingFaceEndpoint, ChatHuggingFace

repo_id = "mistralai/Mistral-7B-Instruct-v0.2"


llm_endpoint = HuggingFaceEndpoint(
    repo_id=repo_id,
    task="conversational",
    max_new_tokens=512,
    temperature=0.1,
    repetition_penalty=1.03,
)


READER_LLM = ChatHuggingFace(llm=llm_endpoint)


print(READER_LLM.invoke("Hello, who are you?").content)

 I'm an artificial intelligence designed to assist with various tasks and answer questions to the best of my ability. I don't have the ability to have a personality or emotions like a human does. I'm here to help make your life easier and more convenient. How can I assist you today?


In [59]:
#Monkey Patching
# ----------------------------------------------------------------------------------------------------------------------------------
import sys
import types
from typing import Optional, List, Tuple
from langchain_core.documents import Document as LangchainDocument

try:
    from langchain_core.documents.compressor import BaseDocumentCompressor

    if "langchain.retrievers" not in sys.modules:
        sys.modules["langchain.retrievers"] = types.ModuleType("langchain.retrievers")

    if "langchain.retrievers.document_compressors" not in sys.modules:
        sys.modules["langchain.retrievers.document_compressors"] = types.ModuleType("langchain.retrievers.document_compressors")

    if "langchain.retrievers.document_compressors.base" not in sys.modules:
        fake_base = types.ModuleType("langchain.retrievers.document_compressors.base")
        fake_base.BaseDocumentCompressor = BaseDocumentCompressor
        sys.modules["langchain.retrievers.document_compressors.base"] = fake_base

except ImportError:
    pass
# ----------------------------------------------------------------------------------------------------------------------------------


from ragatouille import RAGPretrainedModel
from langchain_core.vectorstores import VectorStore
from langchain_core.language_models.llms import LLM

def answer_with_rag(
    question: str,
    llm: LLM,
    knowledge_index: VectorStore,
    reranker: Optional[RAGPretrainedModel] = None,
    num_retrieved_docs: int = 30,
    num_docs_final: int = 7,
) -> Tuple[str, List[LangchainDocument]]:
    """Answer a question using RAG with the given knowledge index."""
    # Gather documents with retriever
    relevant_docs = knowledge_index.similarity_search(
        query=question, k=num_retrieved_docs
    )
    relevant_docs = [doc.page_content for doc in relevant_docs]  # keep only the text

    # Optionally rerank results
    if reranker:
        relevant_docs = reranker.rerank(question, relevant_docs, k=num_docs_final)
        relevant_docs = [doc["content"] for doc in relevant_docs]

    relevant_docs = relevant_docs[:num_docs_final]

    # Build the final prompt
    context = "\nExtracted documents:\n"
    context += "".join(
        [f"Document {str(i)}:::\n" + doc for i, doc in enumerate(relevant_docs)]
    )

    final_prompt = RAG_PROMPT_TEMPLATE.format(question=question, context=context)

    result = llm.invoke(final_prompt)
    answer = result.content

    return answer, relevant_docs

In [60]:
from langchain_core.language_models import BaseChatModel

def run_rag_tests(
    eval_dataset: datasets.Dataset,
    llm,
    knowledge_index: VectorStore,
    output_file: str,
    reranker: Optional[RAGPretrainedModel] = None,
    verbose: Optional[bool] = True,
    test_settings: Optional[str] = None,  # To document the test settings used
):
    """Runs RAG tests on the given dataset and saves the results to the given output file."""
    try:  # load previous generations if they exist
        with open(output_file, "r") as f:
            outputs = json.load(f)
    except:
        outputs = []

    for example in tqdm(eval_dataset):
        question = example["question"]
        if question in [output["question"] for output in outputs]:
            continue

        answer, relevant_docs = answer_with_rag(
            question, llm, knowledge_index, reranker=reranker
        )
        if verbose:
            print("=======================================================")
            print(f"Question: {question}")
            print(f"Answer: {answer}")
            print(f'True answer: {example["answer"]}')
        result = {
            "question": question,
            "true_answer": example["answer"],
            "source_doc": example["source_doc"],
            "generated_answer": answer,
            "retrieved_docs": [doc for doc in relevant_docs],
        }
        if test_settings:
            result["test_settings"] = test_settings
        outputs.append(result)

        with open(output_file, "w") as f:
            json.dump(outputs, f)

In [61]:
EVALUATION_PROMPT = """###Task Description:
An instruction (might include an Input inside it), a response to evaluate, a reference answer that gets a score of 5, and a score rubric representing a evaluation criteria are given.
1. Write a detailed feedback that assess the quality of the response strictly based on the given score rubric, not evaluating in general.
2. After writing a feedback, write a score that is an integer between 1 and 5. You should refer to the score rubric.
3. The output format should look as follows: \"Feedback: {{write a feedback for criteria}} [RESULT] {{an integer number between 1 and 5}}\"
4. Please do not generate any other opening, closing, and explanations. Be sure to include [RESULT] in your output.

###The instruction to evaluate:
{instruction}

###Response to evaluate:
{response}

###Reference Answer (Score 5):
{reference_answer}

###Score Rubrics:
[Is the response correct, accurate, and factual based on the reference answer?]
Score 1: The response is completely incorrect, inaccurate, and/or not factual.
Score 2: The response is mostly incorrect, inaccurate, and/or not factual.
Score 3: The response is somewhat correct, accurate, and/or factual.
Score 4: The response is mostly correct, accurate, and factual.
Score 5: The response is completely correct, accurate, and factual.

###Feedback:"""

from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate
from langchain_core.messages import SystemMessage


evaluation_prompt_template = ChatPromptTemplate.from_messages(
    [
        SystemMessage(content="You are a fair evaluator language model."),
        HumanMessagePromptTemplate.from_template(EVALUATION_PROMPT),
    ]
)

In [64]:
from langchain_openai import ChatOpenAI
from google.colab import userdata
eval_chat_model = ChatOpenAI(
    model="gpt-4.1",
    temperature=0,
    openai_api_key=userdata.get("OPENAI_API_KEY"),
    base_url="https://llm.ptnglobalcorp.com/"
)
evaluator_name = "GPT4"
# eval_chat_model = READER_LLM
# evaluator_name = "Mistral-7B-Judge"
def evaluate_answers(
    answer_path: str,
    eval_chat_model,
    evaluator_name: str,
    evaluation_prompt_template: ChatPromptTemplate,
) -> None:
    """Evaluates generated answers. Modifies the given answer file in place for better checkpointing."""
    answers = []
    if os.path.isfile(answer_path):  # load previous generations if they exist
        with open(answer_path, "r") as f:
            answers = json.load(f)

    for experiment in tqdm(answers):
        if f"eval_score_{evaluator_name}" in experiment:
            continue

        eval_prompt = evaluation_prompt_template.format_messages(
            instruction=experiment["question"],
            response=experiment["generated_answer"],
            reference_answer=experiment["true_answer"],
        )
        eval_result = eval_chat_model.invoke(eval_prompt)
        content = eval_result.content

        if "[RESULT]" in content:
            parts = content.split("[RESULT]")
            feedback = parts[0].strip()
            score = parts[1].strip()
        else:

            feedback = content.strip()
            score = "0"

        experiment[f"eval_score_{evaluator_name}"] = score
        experiment[f"eval_feedback_{evaluator_name}"] = feedback

        with open(answer_path, "w") as f:
            json.dump(answers, f)

In [65]:
if not os.path.exists("./output"):
    os.mkdir("./output")

READER_MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.2"

for chunk_size in [200]:  # Add other chunk sizes (in tokens) as needed
    for embeddings in ["thenlper/gte-small"]:  # Add other embeddings as needed
        for rerank in [True, False]:
            settings_name = f"chunk:{chunk_size}_embeddings:{embeddings.replace('/', '~')}_rerank:{rerank}_reader-model:{READER_MODEL_NAME.replace('/', '~')}"
            output_file_name = f"./output/rag_{settings_name}.json"

            print(f"Running evaluation for {settings_name}:")

            print("Loading knowledge base embeddings...")
            knowledge_index = load_embeddings(
                RAW_KNOWLEDGE_BASE,
                chunk_size=chunk_size,
                embedding_model_name=embeddings,
            )

            print("Running RAG...")
            reranker = (
                RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0")
                if rerank
                else None
            )

            run_rag_tests(
                eval_dataset=eval_dataset,
                llm=READER_LLM,
                knowledge_index=knowledge_index,
                output_file=output_file_name,
                reranker=reranker,
                verbose=False,
                test_settings=settings_name,
            )

            print("Running evaluation (GPT-4 Judge)...")
            evaluate_answers(
                output_file_name,
                eval_chat_model,
                evaluator_name,
                evaluation_prompt_template,
            )

Running evaluation for chunk:200_embeddings:thenlper~gte-small_rerank:True_reader-model:mistralai~Mistral-7B-Instruct-v0.2:
Loading knowledge base embeddings...
Running RAG...


  0%|          | 0/65 [00:00<?, ?it/s]

Running evaluation (GPT-4 Judge)...


  0%|          | 0/65 [00:00<?, ?it/s]

Running evaluation for chunk:200_embeddings:thenlper~gte-small_rerank:False_reader-model:mistralai~Mistral-7B-Instruct-v0.2:
Loading knowledge base embeddings...
Running RAG...


  0%|          | 0/65 [00:00<?, ?it/s]

Running evaluation (GPT-4 Judge)...


  0%|          | 0/65 [00:00<?, ?it/s]

In [66]:
JUDGE_REPO_ID = "mistralai/Mistral-7B-Instruct-v0.2"

judge_llm_endpoint = HuggingFaceEndpoint(
    repo_id=JUDGE_REPO_ID,
    task="conversational",
    max_new_tokens=1024,
    temperature=0.01,

)

eval_chat_model = ChatHuggingFace(llm=judge_llm_endpoint)

evaluator_name = "Mistral-7B-v0.2-Judge"

print(f"Judge Model Initialized: {JUDGE_REPO_ID}")

Judge Model Initialized: mistralai/Mistral-7B-Instruct-v0.2


In [67]:
def evaluate_answers(
    answer_path: str,
    eval_chat_model,
    evaluator_name: str,
    evaluation_prompt_template: ChatPromptTemplate,
) -> None:
    """Evaluates generated answers. Modifies the given answer file in place."""
    answers = []
    if os.path.isfile(answer_path):
        with open(answer_path, "r") as f:
            answers = json.load(f)

    print(f"tarting evaluation with {evaluator_name}...")

    # Filter out already evaluated experiments
    experiments_to_run = [exp for exp in answers if f"eval_score_{evaluator_name}" not in exp]

    for i, experiment in enumerate(tqdm(answers)):
        if f"eval_score_{evaluator_name}" in experiment:
            continue

        eval_prompt = evaluation_prompt_template.format_messages(
            instruction=experiment["question"],
            response=experiment["generated_answer"],
            reference_answer=experiment["true_answer"],
        )

        try:
            # Invoke the model
            eval_result = eval_chat_model.invoke(eval_prompt)
            result_text = eval_result.content

            score = "1" # Default score
            feedback = result_text.strip()

            # Method 1: Look for explicit [RESULT] tag
            if "[RESULT]" in result_text:
                parts = result_text.split("[RESULT]")
                feedback = parts[0].strip()
                score_part = parts[1].strip()
                # Extract first digit found in the score part
                match = re.search(r'\d+', score_part)
                if match:
                    score = match.group(0)

            # Method 2: Fallback - Look for "Rating: X" or just the number at the end
            else:
                # Look for pattern like "Score: 5" or "Rating: 5"
                match = re.search(r'(?:Rating|Score|Result):\s*(\d+)', result_text, re.IGNORECASE)
                if match:
                    score = match.group(1)
                else:
                    # Last resort: find the last digit in the text (often the score)
                    digits = re.findall(r'\b[1-5]\b', result_text)
                    if digits:
                        score = digits[-1]

            # Update Experiment
            experiment[f"eval_score_{evaluator_name}"] = score
            experiment[f"eval_feedback_{evaluator_name}"] = feedback

            # Save immediately (Checkpointing)
            with open(answer_path, "w") as f:
                json.dump(answers, f)

        except Exception as e:
            print(f"Error processing sample {i}: {e}")
            continue

    print("Evaluation complete.")

# 4. Main Execution Loop
if not os.path.exists("./output"):
    os.mkdir("./output")

READER_MODEL_NAME = "zephyr-7b-beta"

for chunk_size in [200]:
    for embeddings in ["thenlper/gte-small"]:
        for rerank in [True, False]:
            settings_name = f"chunk:{chunk_size}_embeddings:{embeddings.replace('/', '~')}_rerank:{rerank}_reader-model:{READER_MODEL_NAME.replace('/', '~')}"
            output_file_name = f"./output/rag_{settings_name}.json"

            print(f"--------------------------------------------------")
            print(f"Processing Settings: {settings_name}")

            # Ensure knowledge_index is loaded (assuming load_embeddings is defined previously)
            print("Loading knowledge base embeddings...")
            knowledge_index = load_embeddings(
                RAW_KNOWLEDGE_BASE,
                chunk_size=chunk_size,
                embedding_model_name=embeddings,
            )

            print("Running RAG generation...")
            reranker = (
                RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0")
                if rerank
                else None
            )

            # Generate Answers
            run_rag_tests(
                eval_dataset=eval_dataset,
                llm=READER_LLM,
                knowledge_index=knowledge_index,
                output_file=output_file_name,
                reranker=reranker,
                verbose=False,
                test_settings=settings_name,
            )

            # Evaluate Answers using Gemma
            print(f"Running evaluation ({evaluator_name})...")
            evaluate_answers(
                output_file_name,
                eval_chat_model,
                evaluator_name,
                evaluation_prompt_template,
            )

--------------------------------------------------
Processing Settings: chunk:200_embeddings:thenlper~gte-small_rerank:True_reader-model:zephyr-7b-beta
Loading knowledge base embeddings...
Running RAG generation...


  0%|          | 0/65 [00:00<?, ?it/s]

  return torch.cuda.amp.autocast() if self.activated else NullContextManager()

  0%|          | 0/1 [00:00<?, ?it/s][A
100%|██████████| 1/1 [00:00<00:00,  9.47it/s]

  0%|          | 0/1 [00:00<?, ?it/s][A
100%|██████████| 1/1 [00:00<00:00,  9.50it/s]

  0%|          | 0/1 [00:00<?, ?it/s][A
100%|██████████| 1/1 [00:00<00:00,  9.46it/s]

  0%|          | 0/1 [00:00<?, ?it/s][A
100%|██████████| 1/1 [00:00<00:00,  9.68it/s]

  0%|          | 0/1 [00:00<?, ?it/s][A
100%|██████████| 1/1 [00:00<00:00,  9.47it/s]

  0%|          | 0/1 [00:00<?, ?it/s][A
100%|██████████| 1/1 [00:00<00:00,  9.30it/s]

  0%|          | 0/1 [00:00<?, ?it/s][A
100%|██████████| 1/1 [00:00<00:00,  9.33it/s]

100%|██████████| 1/1 [00:00<00:00, 10.64it/s]

  0%|          | 0/1 [00:00<?, ?it/s][A
100%|██████████| 1/1 [00:00<00:00,  9.37it/s]

  0%|          | 0/1 [00:00<?, ?it/s][A
100%|██████████| 1/1 [00:00<00:00,  9.34it/s]

  0%|          | 0/1 [00:00<?, ?it/s][A
100%|██████████| 1/1 [00:00<00:00,  9.30

Running evaluation (Mistral-7B-v0.2-Judge)...
tarting evaluation with Mistral-7B-v0.2-Judge...


  0%|          | 0/65 [00:00<?, ?it/s]

Error processing sample 18: 402 Client Error: Payment Required for url: https://router.huggingface.co/featherless-ai/v1/chat/completions (Request ID: Root=1-6970677d-3929ef0c2d1edee9273190e0;e57ae0fa-d2ad-4f7f-a74e-1b63faad80ec)

You have reached the free monthly usage limit for featherless-ai. Subscribe to PRO to get 20x more included usage, or add pre-paid credits to your account.
Error processing sample 19: 402 Client Error: Payment Required for url: https://router.huggingface.co/featherless-ai/v1/chat/completions (Request ID: Root=1-6970677e-617f181e6595449d2736a3a8;70ed35a7-8b7a-4797-af7b-766c113815ed)

You have reached the free monthly usage limit for featherless-ai. Subscribe to PRO to get 20x more included usage, or add pre-paid credits to your account.
Error processing sample 20: 402 Client Error: Payment Required for url: https://router.huggingface.co/featherless-ai/v1/chat/completions (Request ID: Root=1-6970677e-14679b5d4423f87c520e4997;5b956928-f8bb-47cf-be41-62e92a8961c3)

  0%|          | 0/65 [00:00<?, ?it/s]

Running evaluation (Mistral-7B-v0.2-Judge)...
tarting evaluation with Mistral-7B-v0.2-Judge...


  0%|          | 0/65 [00:00<?, ?it/s]

Evaluation complete.


In [68]:
import glob, json, pandas as pd

result = pd.concat([
    pd.DataFrame(json.load(open(f, "r"))).assign(settings=f.split('/')[-1].replace('.json', ''))
    for f in glob.glob("./output/*.json")
])

score_col = next(col for col in result.columns if col.startswith("eval_score_"))

result[score_col] = pd.to_numeric(result[score_col], errors='coerce').fillna(1)
result["accuracy"] = (result[score_col] - 1) / 4

average_scores = result.groupby("settings")["accuracy"].mean().sort_values(ascending=False)
display(average_scores)

Unnamed: 0_level_0,accuracy
settings,Unnamed: 1_level_1
rag_chunk:200_embeddings:thenlper~gte-small_rerank:True_reader-model:mistralai~Mistral-7B-Instruct-v0.2,0.880769
rag_chunk:200_embeddings:thenlper~gte-small_rerank:False_reader-model:mistralai~Mistral-7B-Instruct-v0.2,0.838462
rag_chunk:200_embeddings:thenlper~gte-small_rerank:False_reader-model:zephyr-7b-beta,0.0
rag_chunk:200_embeddings:thenlper~gte-small_rerank:True_reader-model:zephyr-7b-beta,0.0


In [69]:
import plotly.express as px

scores = datasets.load_dataset("m-ric/rag_scores_cookbook", split="train")
scores = pd.Series(scores["score"], index=scores["settings"])

README.md:   0%|          | 0.00/319 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/1.45k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5 [00:00<?, ? examples/s]

In [70]:
fig = px.bar(
    scores,
    color=scores,
    labels={
        "value": "Accuracy",
        "settings": "Configuration",
    },
    color_continuous_scale="bluered",
)
fig.update_layout(
    width=1000,
    height=600,
    barmode="group",
    yaxis_range=[0, 100],
    title="<b>Accuracy of different RAG configurations</b>",
    xaxis_title="RAG settings",
    font=dict(size=15),
)
fig.layout.yaxis.ticksuffix = "%"
fig.update_coloraxes(showscale=False)
fig.update_traces(texttemplate="%{y:.1f}", textposition="outside")
fig.show()