## Objective: To build a simple RAG evaluation framework

### Part 2: Build & Benchmark a RAG System using an already synthesized evaluation dataset (in Part 1)

#### Part 1: Synthesize and filter an Instruction dataset from a custom knowledge-base (See https://lightning.ai/panchamsnotes/studios/evaluate-your-rag-part-1-synthesize-an-evaluation-dataset?view=public&section=featured)



#### Primary reference: https://huggingface.co/learn/cookbook/en/rag_evaluation by Aymeric Roucher (https://huggingface.co/m-ric)

For the knowledge base, let us use the  litgpt Github repo: https://github.com/Lightning-AI/litgpt/tree/main

### LLM Reader retrieves relevant documents to formulate response

### Installs and Dependencies

In [1]:
%pip install -q torch transformers transformers langchain sentence-transformers tqdm openpyxl openai pandas datasets
%pip install -U --quiet langchain langsmith langchainhub langchain_benchmarks langchain-openai Gitpython python-dotenv RAGatouille
%pip install --quiet chromadb openai huggingface pandas langchain_experimental sentence_transformers pyarrow anthropic tiktoken

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

openai_api_key = os.environ['OPENAI_API_KEY'] 
hf_api_key = os.environ['HF_API_KEY'] 

In [15]:
import textwrap
from tqdm import tqdm
import pandas as pd
import json
import datasets
import random
import glob
from typing import Optional, List, Tuple

pd.set_option("display.max_colwidth", None)

from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import GitLoader
from langchain_openai import ChatOpenAI
from langchain.docstore.document import Document as LangchainDocument
from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy
from langchain_community.llms import HuggingFaceHub
from langchain_core.vectorstores import VectorStore
from langchain_core.language_models.llms import LLM
from langchain_core.language_models import BaseChatModel

from langchain.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.schema import SystemMessage

from transformers import AutoTokenizer, AutoModelForCausalLM

from ragatouille import RAGPretrainedModel

import warnings
warnings.filterwarnings('ignore')

## Build the RAG System

### Pre-processing documents to build the knowledge base

In [4]:
loader = GitLoader(
    clone_url="https://github.com/Lightning-AI/litgpt",
    repo_path="./litgpt_data_github/",
    branch="main",
    file_filter=lambda file_path: file_path.endswith(".md") # Only get the markdown files
)

data = loader.load()

RAW_KNOWLEDGE_BASE = data

In [5]:
def split_documents_into_chunks(
    chunk_size: int,
    knowledge_base: List[LangchainDocument],
    tokenizer_name: str,
) -> List[LangchainDocument]:
    """
    Split documents into chunks of size `chunk_size` characters and return a list of documents.
    """
    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        AutoTokenizer.from_pretrained(tokenizer_name),
        chunk_size=chunk_size,
        chunk_overlap=int(chunk_size / 10),
        add_start_index=True,
        strip_whitespace=True,
        separators=["\n\n", "\n", ".", " ", "", "\n\n\n"],
    )

    docs_processed = []
    for doc in knowledge_base:
        docs_processed += text_splitter.split_documents([doc])

    # Remove duplicates
    unique_texts = {}
    docs_processed_unique = []
    for doc in docs_processed:
        if doc.page_content not in unique_texts:
            unique_texts[doc.page_content] = True
            docs_processed_unique.append(doc)

    return docs_processed_unique

### Create the retriever after building a vector index using FAISS

In [6]:
def create_vector_index(
    langchain_docs: List[LangchainDocument],
    chunk_size: int,
    embedding_model_name: Optional[str] = "thenlper/gte-small",
) -> FAISS:
    """
    Creates a FAISS index from the given embedding model and documents. Loads the index directly if it already exists.

    Args:
        langchain_docs: list of documents
        chunk_size: size of the chunks to split the documents into
        embedding_model_name: name of the embedding model to use

    Returns:
        FAISS index
    """
    # load embedding_model
    embedding_model = HuggingFaceEmbeddings(
        model_name=embedding_model_name,
        multi_process=True,
        model_kwargs={"device": "cuda"},
        encode_kwargs={
            "normalize_embeddings": True
        },  # set True to compute cosine similarity
    )

    # Check if embeddings already exist on disk
    index_name = (
        f"index_chunk:{chunk_size}_embeddings:{embedding_model_name.replace('/', '~')}"
    )
    index_folder_path = f"./data/indexes/{index_name}/"
    if os.path.isdir(index_folder_path):
        return FAISS.load_local(
            index_folder_path,
            embedding_model,
            distance_strategy=DistanceStrategy.COSINE,
            allow_dangerous_deserialization=True
        )

    else:
        print("Index not found, generating it...")
        docs_processed = split_documents_into_chunks(
            chunk_size,
            langchain_docs,
            embedding_model_name,
        )
        knowledge_index = FAISS.from_documents(
            docs_processed, embedding_model, distance_strategy=DistanceStrategy.COSINE
        )
        knowledge_index.save_local(index_folder_path)
        return knowledge_index

In [7]:
RAG_PROMPT_TEMPLATE = """
<|system|>
Using the information contained in the context,
give a comprehensive answer to the question.
Respond only to the question asked, response should be concise and relevant to the question.
Provide the number of the source document when relevant.
If the answer cannot be deduced from the context, do not give an answer.</s>
<|user|>
Context:
{context}
---
Now here is the question you need to answer.

Question: {question}
</s>
<|assistant|>
"""

In [8]:

repo_id = "HuggingFaceH4/zephyr-7b-beta" 
READER_MODEL_NAME = "zephyr-7b-beta"

READER_LLM = HuggingFaceHub(
    repo_id=repo_id,
    task="text-generation",
    model_kwargs={
        "max_new_tokens": 512,
        "top_k": 30,
        "temperature": 0.1,
        "repetition_penalty": 1.03,
    },
    huggingfacehub_api_token=hf_api_key
)

In [9]:
def get_rag_response(
    question: str,
    llm: LLM,
    knowledge_index: VectorStore,
    reranker: Optional[RAGPretrainedModel] = None,
    num_retrieved_docs: int = 30,
    num_docs_final: int = 7,
) -> Tuple[str, List[LangchainDocument]]:
    """Answer a question using RAG with the given knowledge index."""
    # Gather documents with retriever
    relevant_docs = knowledge_index.similarity_search(
        query=question, k=num_retrieved_docs
    )
    relevant_docs = [doc.page_content for doc in relevant_docs]  # keep only the text

    # Optionally rerank results
    if reranker:
        relevant_docs = reranker.rerank(question, relevant_docs, k=num_docs_final)
        relevant_docs = [doc["content"] for doc in relevant_docs]

    relevant_docs = relevant_docs[:num_docs_final]

    # Build the final prompt
    context = "\nExtracted documents:\n"
    context += "".join(
        [f"Document {str(i)}:::\n" + doc for i, doc in enumerate(relevant_docs)]
    )

    final_prompt = RAG_PROMPT_TEMPLATE.format(question=question, context=context)

    # Redact an answer
    answer = llm(final_prompt)

    return answer, relevant_docs

## Benchmark the RAG System

### Get eval dataset synthesized in Part 1
 (https://lightning.ai/panchamsnotes/studios/evaluate-your-rag-part-1-synthesize-an-evaluation-dataset?section=featured)

In [10]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [11]:
### You can load it using the following;::

eval_dataset = datasets.load_dataset("delayedkarma/litgpt_instruction_qa", split="train")

In [12]:
def run_rag_tests(
    eval_dataset: datasets.Dataset,
    llm: BaseChatModel,
    knowledge_index: VectorStore,
    output_file: str,
    reranker: Optional[RAGPretrainedModel] = None,
    verbose: Optional[bool] = True,
    test_settings: Optional[str] = None,  # To document the test settings used
):
    """Runs RAG tests on the given dataset and saves the results to the given output file."""
    try:  # load previous generations if they exist
        with open(output_file, "r") as f:
            outputs = json.load(f)
    except:
        outputs = []

    for example in tqdm(eval_dataset):
        question = example["question"]
        if question in [output["question"] for output in outputs]:
            continue

        answer, relevant_docs = get_rag_response(
            question, llm, knowledge_index, reranker=reranker
        )
        if verbose:
            print("=======================================================")
            print(f"Question: {question}")
            print(f"Answer: {answer}")
            print(f'True answer: {example["answer"]}')
        result = {
            "question": question,
            "true_answer": example["answer"],
            "source_doc": example["source_doc"],
            "generated_answer": answer,
            "retrieved_docs": [doc for doc in relevant_docs],
        }
        if test_settings:
            result["test_settings"] = test_settings
        outputs.append(result)

        with open(output_file, "w") as f:
            json.dump(outputs, f)

### Define evaluation prompt

In [13]:
EVALUATION_PROMPT = """###Task Description:
An instruction (might include an Input inside it), a response to evaluate, a reference answer that gets a score of 5, and a score rubric representing a evaluation criteria are given.
1. Write a detailed feedback that assess the quality of the response strictly based on the given score rubric, not evaluating in general.
2. After writing a feedback, write a score that is an integer between 1 and 5. You should refer to the score rubric.
3. The output format should look as follows: \"Feedback: {{write a feedback for criteria}} [RESULT] {{an integer number between 1 and 5}}\"
4. Please do not generate any other opening, closing, and explanations. Be sure to include [RESULT] in your output.

###The instruction to evaluate:
{instruction}

###Response to evaluate:
{response}

###Reference Answer (Score 5):
{reference_answer}

###Score Rubrics:
[Is the response correct, accurate, and factual based on the reference answer?]
Score 1: The response is completely incorrect, inaccurate, and/or not factual.
Score 2: The response is mostly incorrect, inaccurate, and/or not factual.
Score 3: The response is somewhat correct, accurate, and/or factual.
Score 4: The response is mostly correct, accurate, and factual.
Score 5: The response is completely correct, accurate, and factual.

###Feedback:"""


evaluation_prompt_template = ChatPromptTemplate.from_messages(
    [
        SystemMessage(content="You are a fair evaluator language model."),
        HumanMessagePromptTemplate.from_template(EVALUATION_PROMPT),
    ]
)

### Define evaluator models 

In [27]:
eval_chat_model_gpt4_1106 = ChatOpenAI(model="gpt-4-1106-preview", temperature=0)
evaluator_name_gpt4_1106 = "GPT4_1106"

eval_chat_model_gpt4_0125 = ChatOpenAI(model="gpt-4-0125-preview", temperature=0)
evaluator_name_gpt4_0125 = "GPT4_0125"


def evaluate_rag_responses(
    answer_path: str,
    eval_chat_model: BaseChatModel,
    evaluator_name: str,
    evaluation_prompt_template: ChatPromptTemplate,
) -> None:
    """Evaluates generated answers. Modifies the given answer file in place for better checkpointing."""
    answers = []
    if os.path.isfile(answer_path):  # load previous generations if they exist
        answers = json.load(open(answer_path, "r"))

    for experiment in tqdm(answers):
        if f"eval_score_{evaluator_name}" in experiment:
            continue

        eval_prompt = evaluation_prompt_template.format_messages(
            instruction=experiment["question"],
            response=experiment["generated_answer"],
            reference_answer=experiment["true_answer"],
        )
        eval_result = eval_chat_model.invoke(eval_prompt)
        feedback, score = [
            item.strip() for item in eval_result.content.split("[RESULT]")
        ]
        experiment[f"eval_score_{evaluator_name}"] = score
        experiment[f"eval_feedback_{evaluator_name}"] = feedback

        with open(answer_path, "w") as f:
            json.dump(answers, f)

### Run the tests and evaluate the responses

In [28]:
if not os.path.exists("./output"):
    os.mkdir("./output")

for chunk_size in [200]:  # Add other chunk sizes (in tokens) as needed
    for embeddings in ["thenlper/gte-small", "BAAI/bge-small-en-v1.5"]:  # Add other embeddings as needed
        for rerank in [True, False]:
            settings_name = f"chunk:{chunk_size}_embeddings:{embeddings.replace('/', '~')}_rerank:{rerank}_reader-model:{READER_MODEL_NAME}"
            output_file_name = f"./output/rag_{settings_name}.json"

            print(f"Running evaluation for {settings_name}:")

            print("Loading knowledge base embeddings...")
            knowledge_index = create_vector_index(
                RAW_KNOWLEDGE_BASE,
                chunk_size=chunk_size,
                embedding_model_name=embeddings
            )

            print("Running RAG...")
            reranker = (
                RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0")
                if rerank
                else None
            )
            run_rag_tests(
                eval_dataset=eval_dataset,
                llm=READER_LLM,
                knowledge_index=knowledge_index,
                output_file=output_file_name,
                reranker=reranker,
                verbose=False,
                test_settings=settings_name,
            )

            print("Running evaluation for gpt-4-0125-preview ...")
            print()
            evaluate_rag_responses(
                output_file_name,
                eval_chat_model_gpt4_0125,
                evaluator_name_gpt4_0125,
                evaluation_prompt_template,
            )

            print("Running evaluation for gpt-4-1106-preview ...")
            print()
            evaluate_rag_responses(
                output_file_name,
                eval_chat_model_gpt4_1106,
                evaluator_name_gpt4_1106,
                evaluation_prompt_template,
            )


Running evaluation for chunk:200_embeddings:thenlper~gte-small_rerank:True_reader-model:zephyr-7b-beta:
Loading knowledge base embeddings...


Running RAG...


100%|██████████| 30/30 [00:00<00:00, 11912.25it/s]


Running evaluation for gpt-4-0125-preview ...



100%|██████████| 30/30 [02:06<00:00,  4.23s/it]


Running evaluation for gpt-4-1106-preview...



100%|██████████| 30/30 [02:15<00:00,  4.53s/it]


Running evaluation for chunk:200_embeddings:thenlper~gte-small_rerank:False_reader-model:zephyr-7b-beta:
Loading knowledge base embeddings...
Running RAG...


100%|██████████| 30/30 [00:00<00:00, 11516.49it/s]


Running evaluation for gpt-4-0125-preview ...



100%|██████████| 30/30 [02:06<00:00,  4.22s/it]


Running evaluation for gpt-4-1106-preview...



100%|██████████| 30/30 [02:12<00:00,  4.42s/it]


Running evaluation for chunk:200_embeddings:BAAI~bge-small-en-v1.5_rerank:True_reader-model:zephyr-7b-beta:
Loading knowledge base embeddings...
Running RAG...


  0%|          | 0/30 [00:00<?, ?it/s]
100%|██████████| 1/1 [00:00<00:00, 22.87it/s]
 17%|█▋        | 5/30 [00:06<00:31,  1.25s/it]
100%|██████████| 1/1 [00:00<00:00, 22.28it/s]
 20%|██        | 6/30 [00:13<01:02,  2.60s/it]
100%|██████████| 1/1 [00:00<00:00, 22.79it/s]
 23%|██▎       | 7/30 [00:18<01:11,  3.11s/it]
100%|██████████| 1/1 [00:00<00:00, 22.60it/s]
 27%|██▋       | 8/30 [00:29<01:53,  5.16s/it]
100%|██████████| 1/1 [00:00<00:00, 22.75it/s]
 30%|███       | 9/30 [00:33<01:44,  4.97s/it]
100%|██████████| 1/1 [00:00<00:00, 22.27it/s]
 33%|███▎      | 10/30 [00:44<02:09,  6.49s/it]
100%|██████████| 1/1 [00:00<00:00, 22.30it/s]
 37%|███▋      | 11/30 [00:52<02:10,  6.87s/it]
100%|██████████| 1/1 [00:00<00:00, 22.40it/s]
 40%|████      | 12/30 [01:08<02:52,  9.61s/it]
100%|██████████| 1/1 [00:00<00:00, 22.65it/s]
 43%|████▎     | 13/30 [01:16<02:35,  9.15s/it]
100%|██████████| 1/1 [00:00<00:00, 23.08it/s]
 47%|████▋     | 14/30 [01:26<02:29,  9.36s/it]
100%|██████████| 1/1 [00:0

Running evaluation for gpt-4-0125-preview ...



100%|██████████| 30/30 [02:18<00:00,  4.63s/it]


Running evaluation for gpt-4-1106-preview...



100%|██████████| 30/30 [02:08<00:00,  4.27s/it]


Running evaluation for chunk:200_embeddings:BAAI~bge-small-en-v1.5_rerank:False_reader-model:zephyr-7b-beta:
Loading knowledge base embeddings...
Running RAG...


100%|██████████| 30/30 [04:25<00:00,  8.86s/it]


Running evaluation for gpt-4-0125-preview ...



100%|██████████| 30/30 [02:09<00:00,  4.32s/it]


Running evaluation for gpt-4-1106-preview...



100%|██████████| 30/30 [01:58<00:00,  3.95s/it]


### Inspect the results

In [37]:
outputs = []

for file in glob.glob("./output/*.json"):
    print(file)
    output = pd.DataFrame(json.load(open(file, "r")))
    output["settings"] = file
    outputs.append(output)
result = pd.concat(outputs)

./output/rag_chunk:200_embeddings:thenlper~gte-small_rerank:False_reader-model:zephyr-7b-beta.json
./output/rag_chunk:200_embeddings:thenlper~gte-small_rerank:True_reader-model:zephyr-7b-beta.json
./output/rag_chunk:200_embeddings:BAAI~bge-small-en-v1.5_rerank:True_reader-model:zephyr-7b-beta.json
./output/rag_chunk:200_embeddings:BAAI~bge-small-en-v1.5_rerank:False_reader-model:zephyr-7b-beta.json


In [38]:
# result.drop(['eval_score_GPT35', 'eval_feedback_GPT35','eval_score_GPT4', 'eval_feedback_GPT4'], axis=1, inplace=True) # artifacts from previous run
result.columns

Index(['question', 'true_answer', 'source_doc', 'generated_answer',
       'retrieved_docs', 'test_settings', 'eval_score_GPT4_0125',
       'eval_feedback_GPT4_0125', 'eval_score_GPT4_1106',
       'eval_feedback_GPT4_1106', 'settings'],
      dtype='object')

In [39]:
result.head(2)

Unnamed: 0,question,true_answer,source_doc,generated_answer,retrieved_docs,test_settings,eval_score_GPT4_0125,eval_feedback_GPT4_0125,eval_score_GPT4_1106,eval_feedback_GPT4_1106,settings
0,What is the memory usage of Llama 2 with 7B when using bnb.nf4-dq?\n,13.84 GB,tutorials/resource-tables.md,"\n<|system|>\nUsing the information contained in the context,\ngive a comprehensive answer to the question.\nRespond only to the question asked, response should be concise and relevant to the question.\nProvide the number of the source document when relevant.\nIf the answer cannot be deduced from the context, do not give an answer.</s>\n<|user|>\nContext:\n\nExtracted documents:\nDocument 0:::\n| | | | | | | |\n| 7 B | Llama 2 | None | 1 | 4,194,304 | 21.30 GB | 2.36 min |\n| 7 B | Llama 2 | bnb.nf4 | 1 | 4,194,304 | 14.14 GB | 3.68 min |\n| 7 B | Llama 2 | bnb.nf4-dq | 1 | 4,194,304 | 13.84 GB | 3.83 min |\n| 7 B | Llama 2 | None | 2 | 4,194,304 | 29.07 GB | 2.52 min |\n| 7 B | Llama 2 | None | 4 | 4,194,304 | OOM | - |\n| | | | | | | |Document 1:::\n| | | | | | |\n| 7 B | Llama 2 | None | 1 x A100 | 13.52 GB | 30.97 |\n| 7 B | Llama 2 | bnb.nf4 | 1 x A100 | 4.57 GB | 19.98 |\n| 7 B | Llama 2 | bnb.nf4-dq | 1 x A100 | 4.26 GB | 17.3 |\n| | | | | | |\n| 13 B | Llama 2 | None | 1 x A100 | 26.21 GB | 24.82 |\n| 13 B | Llama 2 | bnb.nf4 | 1 x A100 | 8.32 GB | 16.73 |Document 2:::\n| | | | | | | |\n| 13 B | Llama 2 | None | 1 | 6,553,600 | 38.12 GB | 3.19 min |\n| 13 B | Llama 2 | bnb.nf4 | 1 | 6,553,600 | 23.14 GB | 6.38 min |\n| 13 B | Llama 2 | bnb.nf4-dq | 1 | 6,553,600 | 22.55 GB | 6.55 min |\n| 13 B | Llama 2 | None | 2 | 6,553,600 | OOM | - |\n| 13 B | Llama 2 | None | 4 | 6,553,600 | OOM | - |\n| | | | | | | |Document 3:::\n| 13 B | Llama 2 | bnb.nf4-dq | 1 x A100 | 7.72 GB | 14.43 |\n| | | | | | |\n| 34 B | CodeLlama | None | 1 x A100 | OOM | - |\n| 34 B | CodeLlama | bnb.nf4 | 1 x A100 | 20.52 GB | 14.32 |\n| 34 B | CodeLlama | bnb.nf4-dq | 1 x A100 | 18.95 GB | 12.37 |\n| | | | | | |\n| 40 B | Falcon | None | 1 x A100 | OOM | - |\n| 40 B | Falcon | bnb.nf4 | 1 x A100 | 26.55 GB | 13.25 |Document 4:::\n| 13 B | Llama 2 | bnb.nf4 | 1 | 6,553,600 | 2 x A100 | N/A | - |\n| 13 B | Llama 2 | bnb.nf4-dq | 1 | 6,553,600 | 2 x A100 | N/A | - |\n| | | | | | | | |\n| 13 B | Llama 2 | None | 1 | 6,553,600 | 4 x A100 | 35.57 GB | 10.25 min |\n| 40 B | Falcon | None | 1 | 12,042,240 | 4 x A100 | OOM | - |Document 5:::\n| 3 B | StableLM Alpha | bnb.nf4 | 1 | 2,125,248 | 7.41 GB | 1.59 min |\n| 3 B | StableLM Alpha | bnb.nf4-dq | 1 | 2,125,248 | 7.25 GB | 1.62 min |\n| | | | | | | |\n| 7 B | Llama 2 | None | 1 | 4,279,744 | 25.51 GB | 1.81 min |\n| 7 B | Llama 2 | bnb.nf4 | 1 | 4,279,744 | 18.30 GB | 3.23 min |\n| 7 B | Llama 2 | bnb.nf4-dq | 1 | 4,279,744 | 17.98 GB | 3.32 min |Document 6:::\n| 3 B | StableLM Alpha | bnb.nf4 | 1 | 573,888 | 5.65 GB | 1.38 min |\n| 3 B | StableLM Alpha | bnb.nf4-dq | 1 | 573,888 | 5.48 GB | 1.46 min |\n| | | | | | | |\n| 7 B | Llama 2 | None | 1 | 1,229,760 | 19.98 GB | 1.50 min |\n| 7 B | Llama 2 | bnb.nf4 | 1 | 1,229,760 | 12.68 GB | 2.93 min |\n| 7 B | Llama 2 | bnb.nf4-dq | 1 | 1,229,760 | 12.38 GB | 3.00 min |\n---\nNow here is the question you need to answer.\n\nQuestion: What is the memory usage of Llama 2 with 7B when using bnb.nf4-dq?\n\n</s>\n<|assistant|>\nThe memory usage of Llama 2 with 7B when using bnb.nf4-dq is 4,194,304 bytes, as shown in Document 0, row 3. However, in Document 1, row 3, it is also listed as 4,26 GB, which is equivalent to 36,864,000,000 bytes. This discrepancy may be due to differences in the way memory is reported or measured in these documents. In either case, the memory usage is significantly lower than when using bnb.nf4 (14.14 GB) or None (29.07 GB) in Document 0.","[| | | | | | | |\n| 7 B | Llama 2 | None | 1 | 4,194,304 | 21.30 GB | 2.36 min |\n| 7 B | Llama 2 | bnb.nf4 | 1 | 4,194,304 | 14.14 GB | 3.68 min |\n| 7 B | Llama 2 | bnb.nf4-dq | 1 | 4,194,304 | 13.84 GB | 3.83 min |\n| 7 B | Llama 2 | None | 2 | 4,194,304 | 29.07 GB | 2.52 min |\n| 7 B | Llama 2 | None | 4 | 4,194,304 | OOM | - |\n| | | | | | | |, | | | | | | |\n| 7 B | Llama 2 | None | 1 x A100 | 13.52 GB | 30.97 |\n| 7 B | Llama 2 | bnb.nf4 | 1 x A100 | 4.57 GB | 19.98 |\n| 7 B | Llama 2 | bnb.nf4-dq | 1 x A100 | 4.26 GB | 17.3 |\n| | | | | | |\n| 13 B | Llama 2 | None | 1 x A100 | 26.21 GB | 24.82 |\n| 13 B | Llama 2 | bnb.nf4 | 1 x A100 | 8.32 GB | 16.73 |, | | | | | | | |\n| 13 B | Llama 2 | None | 1 | 6,553,600 | 38.12 GB | 3.19 min |\n| 13 B | Llama 2 | bnb.nf4 | 1 | 6,553,600 | 23.14 GB | 6.38 min |\n| 13 B | Llama 2 | bnb.nf4-dq | 1 | 6,553,600 | 22.55 GB | 6.55 min |\n| 13 B | Llama 2 | None | 2 | 6,553,600 | OOM | - |\n| 13 B | Llama 2 | None | 4 | 6,553,600 | OOM | - |\n| | | | | | | |, | 13 B | Llama 2 | bnb.nf4-dq | 1 x A100 | 7.72 GB | 14.43 |\n| | | | | | |\n| 34 B | CodeLlama | None | 1 x A100 | OOM | - |\n| 34 B | CodeLlama | bnb.nf4 | 1 x A100 | 20.52 GB | 14.32 |\n| 34 B | CodeLlama | bnb.nf4-dq | 1 x A100 | 18.95 GB | 12.37 |\n| | | | | | |\n| 40 B | Falcon | None | 1 x A100 | OOM | - |\n| 40 B | Falcon | bnb.nf4 | 1 x A100 | 26.55 GB | 13.25 |, | 13 B | Llama 2 | bnb.nf4 | 1 | 6,553,600 | 2 x A100 | N/A | - |\n| 13 B | Llama 2 | bnb.nf4-dq | 1 | 6,553,600 | 2 x A100 | N/A | - |\n| | | | | | | | |\n| 13 B | Llama 2 | None | 1 | 6,553,600 | 4 x A100 | 35.57 GB | 10.25 min |\n| 40 B | Falcon | None | 1 | 12,042,240 | 4 x A100 | OOM | - |, | 3 B | StableLM Alpha | bnb.nf4 | 1 | 2,125,248 | 7.41 GB | 1.59 min |\n| 3 B | StableLM Alpha | bnb.nf4-dq | 1 | 2,125,248 | 7.25 GB | 1.62 min |\n| | | | | | | |\n| 7 B | Llama 2 | None | 1 | 4,279,744 | 25.51 GB | 1.81 min |\n| 7 B | Llama 2 | bnb.nf4 | 1 | 4,279,744 | 18.30 GB | 3.23 min |\n| 7 B | Llama 2 | bnb.nf4-dq | 1 | 4,279,744 | 17.98 GB | 3.32 min |, | 3 B | StableLM Alpha | bnb.nf4 | 1 | 573,888 | 5.65 GB | 1.38 min |\n| 3 B | StableLM Alpha | bnb.nf4-dq | 1 | 573,888 | 5.48 GB | 1.46 min |\n| | | | | | | |\n| 7 B | Llama 2 | None | 1 | 1,229,760 | 19.98 GB | 1.50 min |\n| 7 B | Llama 2 | bnb.nf4 | 1 | 1,229,760 | 12.68 GB | 2.93 min |\n| 7 B | Llama 2 | bnb.nf4-dq | 1 | 1,229,760 | 12.38 GB | 3.00 min |]",chunk:200_embeddings:thenlper~gte-small_rerank:False_reader-model:zephyr-7b-beta,1,"Feedback: The response provided incorrect information by stating the memory usage in bytes and then incorrectly converting those bytes into gigabytes, leading to a significant discrepancy. The correct answer, as per the reference answer, is 13.84 GB, which is directly mentioned in the provided context (Document 0, row 3). The response failed to accurately report this figure and instead introduced confusion with incorrect data conversion. This deviation from the factual content in the context and the reference answer indicates a lack of accuracy in the response.",1,"The response provided is incorrect and not factual as per the reference answer. The response cites two different memory usages, 4,194,304 bytes and 4.26 GB, which are both incorrect and do not match the reference answer of 13.84 GB. The correct memory usage is clearly stated in the context provided in Document 0, row 3, which should have been the sole focus of the answer. The response also introduces unnecessary confusion by mentioning a discrepancy without clarifying that the correct answer is 13.84 GB as per the reference answer. Therefore, the response does not meet the criteria for accuracy and factuality.",./output/rag_chunk:200_embeddings:thenlper~gte-small_rerank:False_reader-model:zephyr-7b-beta.json
1,What is the command to run the evaluation harness?\n,"The command to run the evaluation harness is `lm_eval --model hf --model_args pretrained=out/hf-tinyllama/converted --tasks ""hellaswag,gsm8k,truthfulqa_mc2,mmlu,winogrande,arc_challenge"" --device ""cuda:0"" --batch_size 4`.",tutorials/convert_lit_models.md,"\n<|system|>\nUsing the information contained in the context,\ngive a comprehensive answer to the question.\nRespond only to the question asked, response should be concise and relevant to the question.\nProvide the number of the source document when relevant.\nIf the answer cannot be deduced from the context, do not give an answer.</s>\n<|user|>\nContext:\n\nExtracted documents:\nDocument 0:::\nThen, we can run the Evaluation Harness as follows:\n\n```bash\nlm_eval --model hf \\n --model_args pretrained=""out/converted_model"" \\n --tasks ""hellaswag,gsm8k,truthfulqa_mc2,mmlu,winogrande,arc_challenge"" \\n --device ""cuda:0"" \\n --batch_size 4\n```\n\n&nbsp;\n\n> [!TIP]\n> The Evaluation Harness tasks above are those used in Open LLM Leaderboard. You can find a list all supported tasks [here](https://github.com/EleutherAI/lm-evaluation-harness/blob/master/docs/task_table.md).Document 1:::\n4. Run the evaluation harness, for example:\n\n```bash\nlm_eval --model hf \\n --model_args pretrained=out/hf-tinyllama/converted \\n --tasks ""hellaswag,gsm8k,truthfulqa_mc2,mmlu,winogrande,arc_challenge"" \\n --device ""cuda:0"" \\n --batch_size 4\n```Document 2:::\n# LLM Evaluation\n\n&nbsp;\n\n## Using lm-evaluation-harness\n\nYou can evaluate LitGPT using [EleutherAI's lm-eval](https://github.com/EleutherAI/lm-evaluation-harness) framework with a large number of different evaluation tasks.\n\nYou need to install the `lm-eval` framework first:\n\n```bash\npip install lm_eval\n```\n\n&nbsp;\n\n### Evaluating LitGPT base models\n\nSuppose you downloaded a base model that we want to evaluate. Here, we use the `microsoft/phi-2` model:\n\n```bash\nlitgpt download --repo_id microsoft/phi-2\n```Document 3:::\nYou can then use the model with external tools, for example, Eleuther AI's [LM Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness) (see the `lm_eval` installation instructions [here](https://github.com/EleutherAI/lm-evaluation-harness?tab=readme-ov-file#install)).\n\nThe LM Evaluation Harness requires a tokenizer to be present in the model checkpoint folder, which we can copy from the original download checkpoint:\n\n```bash\n# Copy the tokenizer needed by the Eval Harness\ncp checkpoints/microsoft/phi-2/tokenizer*\nout/converted_model\n```\n\nThen, we can run the Evaluation Harness as follows:Document 4:::\n&nbsp;\n## Evaluating models\n\nLitGPT comes with a handy `litgpt evaluate` command to evaluate models with [Eleuther AI's Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness). For example, to evaluate the previously downloaded `microsoft/phi-2` model on several tasks available from the Evaluation Harness, you can use the following command:\n\n```bash\nlitgpt evaluate \\n --checkpoint_dir checkpoints/microsoft/phi-2\n --batch_size 16 \\n --tasks ""hellaswag,gsm8k,truthfulqa_mc2,mmlu,winogrande,arc_challenge""\n```Document 5:::\nPlease note that the `litgpt evaluate` command run an internal model conversion. \nThis is only necessary the first time you want to evaluate a model, and it will skip the\nconversion steps if you run the `litgpt evaluate` on the same checkpint directory again.\n\nIn some cases, for example, if you modified the model in the `checkpoint_dir` since the first `litgpt evaluate`\ncall, you need to use the `--force_conversion` flag to to update the files used by litgpt evaluate accordingly: \n\n```\nlitgpt evaluate \\n --checkpoint_dir checkpoints/microsoft/phi-2/ \\n --batch_size 4 \\n --out_dir evaluate_model/ \\n --tasks ""hellaswag,truthfulqa_mc2,mmlu"" \\n --force_conversion true\n```Document 6:::\n(A list of supported tasks can be found [here](https://github.com/EleutherAI/lm-evaluation-harness/blob/master/docs/task_table.md).)\n\n\n&nbsp;\n## Deploy LLMs\n\nYou can deploy LitGPT LLMs using your tool of choice. Below is an example using LitGPT built-in serving capabilities:\n\n\n```bash\n# 1) Download a pretrained model (alternatively, use your own finetuned model)\nlitgpt download --repo_id microsoft/phi-2\n\n# 2) Start the server\nlitgpt serve --checkpoint_dir checkpoints/microsoft/phi-2\n```\n---\nNow here is the question you need to answer.\n\nQuestion: What is the command to run the evaluation harness?\n\n</s>\n<|assistant|>\nThe command to run the evaluation harness using LitGPT is:\n\n```bash\nlitgpt evaluate \\n --checkpoint_dir checkpoints/[MODEL_NAME] \\n --batch_size [BATCH_SIZE] \\n --tasks ""[TASK_1],[TASK_2],[TASK_3]"" \\n [OPTIONAL_FLAGS]\n```\n\nReplace `[MODEL_NAME]` with the name of the pretrained model you want to evaluate, `[BATCH_SIZE]` with the desired batch size, `[TASK_1],[TASK_2],[TASK_3]` with the names of the evaluation tasks you want to run (separated by commas), and `[OPTIONAL_FLAGS]` with any additional flags you want to pass to the `litgpt evaluate` command, such as `--out_dir` to specify an output directory or `--force_conversion` to force model conversion.\n\nNote that the `litgpt evaluate` command automatically converts the model for use with the Evaluation Harness if it hasn't been done before. If you modify the model in the `checkpoint_dir` since the last `litgpt evaluate` call, you can use the `--force_conversion` flag to update the files used by `litgpt evaluate`.\n\nA list of supported evaluation tasks can be found at https://github.com/EleutherAI/lm-evaluation-harness/blob/master/docs/task_table.md.","[Then, we can run the Evaluation Harness as follows:\n\n```bash\nlm_eval --model hf \\n --model_args pretrained=""out/converted_model"" \\n --tasks ""hellaswag,gsm8k,truthfulqa_mc2,mmlu,winogrande,arc_challenge"" \\n --device ""cuda:0"" \\n --batch_size 4\n```\n\n&nbsp;\n\n> [!TIP]\n> The Evaluation Harness tasks above are those used in Open LLM Leaderboard. You can find a list all supported tasks [here](https://github.com/EleutherAI/lm-evaluation-harness/blob/master/docs/task_table.md)., 4. Run the evaluation harness, for example:\n\n```bash\nlm_eval --model hf \\n --model_args pretrained=out/hf-tinyllama/converted \\n --tasks ""hellaswag,gsm8k,truthfulqa_mc2,mmlu,winogrande,arc_challenge"" \\n --device ""cuda:0"" \\n --batch_size 4\n```, # LLM Evaluation\n\n&nbsp;\n\n## Using lm-evaluation-harness\n\nYou can evaluate LitGPT using [EleutherAI's lm-eval](https://github.com/EleutherAI/lm-evaluation-harness) framework with a large number of different evaluation tasks.\n\nYou need to install the `lm-eval` framework first:\n\n```bash\npip install lm_eval\n```\n\n&nbsp;\n\n### Evaluating LitGPT base models\n\nSuppose you downloaded a base model that we want to evaluate. Here, we use the `microsoft/phi-2` model:\n\n```bash\nlitgpt download --repo_id microsoft/phi-2\n```, You can then use the model with external tools, for example, Eleuther AI's [LM Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness) (see the `lm_eval` installation instructions [here](https://github.com/EleutherAI/lm-evaluation-harness?tab=readme-ov-file#install)).\n\nThe LM Evaluation Harness requires a tokenizer to be present in the model checkpoint folder, which we can copy from the original download checkpoint:\n\n```bash\n# Copy the tokenizer needed by the Eval Harness\ncp checkpoints/microsoft/phi-2/tokenizer*\nout/converted_model\n```\n\nThen, we can run the Evaluation Harness as follows:, &nbsp;\n## Evaluating models\n\nLitGPT comes with a handy `litgpt evaluate` command to evaluate models with [Eleuther AI's Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness). For example, to evaluate the previously downloaded `microsoft/phi-2` model on several tasks available from the Evaluation Harness, you can use the following command:\n\n```bash\nlitgpt evaluate \\n --checkpoint_dir checkpoints/microsoft/phi-2\n --batch_size 16 \\n --tasks ""hellaswag,gsm8k,truthfulqa_mc2,mmlu,winogrande,arc_challenge""\n```, Please note that the `litgpt evaluate` command run an internal model conversion. \nThis is only necessary the first time you want to evaluate a model, and it will skip the\nconversion steps if you run the `litgpt evaluate` on the same checkpint directory again.\n\nIn some cases, for example, if you modified the model in the `checkpoint_dir` since the first `litgpt evaluate`\ncall, you need to use the `--force_conversion` flag to to update the files used by litgpt evaluate accordingly: \n\n```\nlitgpt evaluate \\n --checkpoint_dir checkpoints/microsoft/phi-2/ \\n --batch_size 4 \\n --out_dir evaluate_model/ \\n --tasks ""hellaswag,truthfulqa_mc2,mmlu"" \\n --force_conversion true\n```, (A list of supported tasks can be found [here](https://github.com/EleutherAI/lm-evaluation-harness/blob/master/docs/task_table.md).)\n\n\n&nbsp;\n## Deploy LLMs\n\nYou can deploy LitGPT LLMs using your tool of choice. Below is an example using LitGPT built-in serving capabilities:\n\n\n```bash\n# 1) Download a pretrained model (alternatively, use your own finetuned model)\nlitgpt download --repo_id microsoft/phi-2\n\n# 2) Start the server\nlitgpt serve --checkpoint_dir checkpoints/microsoft/phi-2\n```]",chunk:200_embeddings:thenlper~gte-small_rerank:False_reader-model:zephyr-7b-beta,1,"Feedback: The response provided is incorrect based on the reference answer and the context given. The reference answer specifies the use of the `lm_eval` command with specific parameters for running the evaluation harness, while the response given details the use of a `litgpt evaluate` command with different parameters. This indicates a misunderstanding or misinterpretation of the question or the documents provided. The response fails to accurately reflect the correct command (`lm_eval`) and parameters as outlined in the reference answer and supported by the context documents. Therefore, the response does not meet the criteria for being correct, accurate, and factual in relation to the reference answer.",5,"The response correctly identifies the command to run the evaluation harness as `litgpt evaluate` with the appropriate flags and parameters. It provides a clear and accurate template for the command, including placeholders for the model name, batch size, and tasks, as well as optional flags. The response also correctly notes the automatic conversion feature of the `litgpt evaluate` command and the use of the `--force_conversion` flag when necessary. The provided command matches the context given in the documents, specifically Document 4 and Document 5, which detail the use of `litgpt evaluate` for running the Evaluation Harness. Therefore, the response is completely correct, accurate, and factual based on the reference answer and the context provided.",./output/rag_chunk:200_embeddings:thenlper~gte-small_rerank:False_reader-model:zephyr-7b-beta.json


In [40]:
result["eval_score_GPT4_0125"] = result["eval_score_GPT4_0125"].apply(
    lambda x: int(x) if isinstance(x, str) else 1
)
result["eval_score_GPT4_0125"] = (result["eval_score_GPT4_0125"] - 1) / 4

result["eval_score_GPT4_1106"] = result["eval_score_GPT4_1106"].apply(
    lambda x: int(x) if isinstance(x, str) else 1
)
result["eval_score_GPT4_1106"] = (result["eval_score_GPT4_1106"] - 1) / 4

In [41]:
average_scores = result.groupby("settings")["eval_score_GPT4_1106"].mean()
average_scores.sort_values()

settings
./output/rag_chunk:200_embeddings:thenlper~gte-small_rerank:False_reader-model:zephyr-7b-beta.json        0.758333
./output/rag_chunk:200_embeddings:BAAI~bge-small-en-v1.5_rerank:False_reader-model:zephyr-7b-beta.json    0.800000
./output/rag_chunk:200_embeddings:BAAI~bge-small-en-v1.5_rerank:True_reader-model:zephyr-7b-beta.json     0.800000
./output/rag_chunk:200_embeddings:thenlper~gte-small_rerank:True_reader-model:zephyr-7b-beta.json         0.866667
Name: eval_score_GPT4_1106, dtype: float64

In [42]:
average_scores = result.groupby("settings")["eval_score_GPT4_0125"].mean()
average_scores.sort_values()

settings
./output/rag_chunk:200_embeddings:thenlper~gte-small_rerank:False_reader-model:zephyr-7b-beta.json        0.725000
./output/rag_chunk:200_embeddings:BAAI~bge-small-en-v1.5_rerank:False_reader-model:zephyr-7b-beta.json    0.816667
./output/rag_chunk:200_embeddings:BAAI~bge-small-en-v1.5_rerank:True_reader-model:zephyr-7b-beta.json     0.816667
./output/rag_chunk:200_embeddings:thenlper~gte-small_rerank:True_reader-model:zephyr-7b-beta.json         0.858333
Name: eval_score_GPT4_0125, dtype: float64