In [1]:
import os
import re
from pprint import pprint

from huggingface_hub import InferenceClient
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [12]:

# Model served through the Hugging Face Inference API.
repo_id = "meta-llama/Meta-Llama-3-8B-Instruct"

# The access token is read from the HF_TOKEN environment variable rather than
# written into the cell, where it would be saved and shared with the notebook.
# When the variable is unset, None is passed and huggingface_hub falls back to
# its default token lookup (e.g. a token stored by `huggingface-cli login`).
llm_client = InferenceClient(
    model=repo_id,
    timeout=120,
    token=os.environ.get("HF_TOKEN"),
)

def call_llm(
    inference_client: "InferenceClient",
    prompt: str,
    max_tokens: int = 100,
    temperature: float = 0.7,
):
    """Send `prompt` as a single user message and return the reply text.

    Args:
        inference_client: client exposing `chat_completion(messages=...,
            max_tokens=..., temperature=...)`.
        prompt: text placed in the one "user" message of the conversation.
        max_tokens: upper bound on generated tokens. The default of 100 keeps
            calls cheap but cuts longer replies short; raise it when the full
            answer is needed (e.g. a critique that ends with its rating).
        temperature: sampling temperature forwarded to the endpoint.

    Returns:
        The "content" field of the first choice's message.
    """
    response = inference_client.chat_completion(
        messages=[{"role": "user", "content": prompt}],
        max_tokens=max_tokens,
        temperature=temperature,
    )
    return response.choices[0].message["content"]

# Smoke test: one free-form prompt to confirm the endpoint answers.
print(call_llm(llm_client, "Explain the theory of relativity in simple terms."))

The theory of relativity, developed by Albert Einstein, is a fundamental concept in modern physics that explains how space and time are connected. It's a complex topic, but I'll try to break it down in simple terms.

**Special Relativity (1905)**

Einstein's special theory of relativity states that the laws of physics are the same for all observers in uniform motion relative to one another. This means that:

1. **Time and space are relative**: Time and space are not


In [3]:
# Hand-written context/question pairs used as benchmark input, so that no
# tokens are spent on generating them with the LLM.
outputs = [
    dict(
        context="Gravity pulls objects toward Earth, causing them to fall.",
        question="Why do things fall?",
    ),
    dict(
        context="Light travels faster than sound in air.",
        question="Does light or sound travel faster?",
    ),
    dict(
        context="Cells are the basic building blocks of living organisms.",
        question="What are cells?",
    ),
]

# Critique prompt: can the question be answered from the context alone?
# Template fields: {context}, {question}. The model is asked to reply with an
# "Evaluation:" explanation followed by a "Total rating:" from 1 to 5.
question_groundedness_critique_prompt = """
You are evaluating whether the question is grounded in the given context.

Context:
{context}

Question:
{question}

Evaluation: Explain if the question can be answered using only the context or not.
Total rating: Give a score from 1-5 (5 means fully grounded, 1 means not grounded at all).
"""

# Critique prompt: is the question relevant and useful?
# Template field: {question}. Same "Evaluation:" / "Total rating:" reply layout.
question_relevance_critique_prompt = """
You are evaluating whether the question is relevant and meaningful.

Question:
{question}

Evaluation: Explain whether the question is relevant or useful.
Total rating: Give a score from 1-5.
"""

# Critique prompt: can the question be understood without extra context?
# Template field: {question}. Same "Evaluation:" / "Total rating:" reply layout.
question_standalone_critique_prompt = """
You are evaluating whether the question can stand on its own without needing extra context.

Question:
{question}

Evaluation: Explain whether the question can be understood independently.
Total rating: Give a score from 1-5.
"""

In [4]:
print("Generating critique for each QA couple...")

# Rating patterns, tried in order:
#   1. the label the critique prompts ask for ("Total rating: 4", "Rating: **4/5**");
#   2. an unlabeled "4/5" or "4 out of 5";
#   3. a reply that consists of nothing but the digit.
# Taking the first digit found anywhere in the reply would instead pick up list
# markers such as "1." or numbers quoted inside the explanation.
_RATING_PATTERNS = (
    re.compile(r"rating\W{0,10}?([1-5])\b", re.IGNORECASE),
    re.compile(r"\b([1-5])\s*(?:/|out\s+of)\s*5\b", re.IGNORECASE),
    re.compile(r"^\W*([1-5])\W*$"),
)


def _parse_critique(evaluation):
    """Split one critique reply into `(score, explanation)`.

    `score` is an int from 1 to 5, or None when the reply contains no
    recognizable rating (for instance because generation stopped before
    reaching it). `explanation` is the text between the "Evaluation:" label
    and the rating label, or the first 200 characters of the reply when the
    "Evaluation:" label is absent.
    """
    if not isinstance(evaluation, str):
        # Missing reply: nothing to parse, record an empty critique.
        return None, ""

    score = None
    for pattern in _RATING_PATTERNS:
        match = pattern.search(evaluation)
        if match:
            score = int(match.group(1))
            break

    if "Evaluation:" in evaluation:
        explanation = evaluation.split("Evaluation:", 1)[1]
        # Drop everything from the rating label onwards, whichever spelling is used.
        for rating_label in ("Rating:", "Total rating:"):
            explanation = explanation.split(rating_label, 1)[0]
        explanation = explanation.strip()
    else:
        explanation = evaluation[:200].strip()

    return score, explanation


critique_prompts = {
    "groundedness": question_groundedness_critique_prompt,
    "relevance": question_relevance_critique_prompt,
    "standalone": question_standalone_critique_prompt,
}

for output in tqdm(outputs):
    # Ask for all three critiques first, then parse, so that a failed request
    # leaves this item without partial results.
    evaluations = {
        criterion: call_llm(
            llm_client,
            # str.format ignores keyword arguments a template does not use, so
            # `context` can be passed to every template.
            template.format(context=output["context"], question=output["question"]),
        )
        for criterion, template in critique_prompts.items()
    }

    for criterion, evaluation in evaluations.items():
        score, eval_ = _parse_critique(evaluation)
        output.update({
            f"{criterion}_score": score,
            f"{criterion}_eval": eval_,
        })

for o in outputs:
    pprint(o)
    print("-" * 40)



Generating critique for each QA couple...


100%|██████████| 3/3 [00:16<00:00,  5.63s/it]

{'context': 'Gravity pulls objects toward Earth, causing them to fall.',
 'groundedness_eval': 'The question "Why do things fall?" can be partially '
                      'answered using the given context. \n'
                      '\n'
                      'The context states that "Gravity pulls objects toward '
                      'Earth, causing them to fall." This statement directly '
                      'answ',
 'groundedness_score': 4,
 'question': 'Why do things fall?',
 'relevance_eval': 'The question "Why do things fall?" is a fundamental '
                   'inquiry that has been a subject of interest for centuries. '
                   'It touches upon the basic concept of gravity, which is a '
                   'crucial aspect of our understanding of the physical world. '
                   'The question is relevant and useful because it:\n'
                   '\n'
                   '1. Encourages critical thinking: The question prompts the '
                   'in




In [5]:
from typing import List
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer


def split_documents(
    chunk_size: int,
    knowledge_base: List[Document],
    tokenizer_name: str,
) -> List[Document]:
    """
    Split documents into chunks of at most `chunk_size` tokens and return the
    unique chunks, in order of first appearance.

    Args:
        chunk_size: maximum chunk length, measured with the tokenizer below.
            Consecutive chunks overlap by a tenth of this value.
        knowledge_base: documents to split.
        tokenizer_name: name passed to `AutoTokenizer.from_pretrained`; that
            tokenizer is used to measure chunk length.

    Returns:
        The chunks as documents, with exact-duplicate texts removed.
    """
    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        AutoTokenizer.from_pretrained(tokenizer_name),
        chunk_size=chunk_size,
        chunk_overlap=int(chunk_size / 10),
        add_start_index=True,
        strip_whitespace=True,
        # Separators are tried in list order, coarse to fine. The empty string
        # always applies, so it has to be last: anything listed after it is
        # never reached. Sentence punctuation is placed before the plain space
        # so that sentence boundaries are preferred over arbitrary word gaps.
        separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
    )

    docs_processed: List[Document] = []
    for doc in knowledge_base:
        docs_processed.extend(text_splitter.split_documents([doc]))

    # Remove duplicates based on text content, keeping the first occurrence.
    seen = set()
    docs_unique: List[Document] = []
    for doc in docs_processed:
        if doc.page_content in seen:
            continue
        seen.add(doc.page_content)
        docs_unique.append(doc)

    return docs_unique


In [46]:
from langchain_community.vectorstores.faiss import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy
from typing import List, Optional
from langchain_core.documents import Document as LangchainDocument
import os

def load_embeddings(
    langchain_docs: List[LangchainDocument],
    chunk_size: int,
    embedding_model_name: Optional[str] = "thenlper/gte-small",
):
    """
    Return a FAISS index over `langchain_docs`, reusing the copy saved on disk
    when its folder already exists and building (then saving) it otherwise.

    Args:
        langchain_docs: documents to index; only used when the index is built
        chunk_size: chunk size handed to `split_documents`; also part of the
            on-disk index name
        embedding_model_name: embedding model name; also passed to
            `split_documents` as the tokenizer name and used in the index name
    Returns:
        FAISS index
    """
    embedder = HuggingFaceEmbeddings(
        model_name=embedding_model_name,
        multi_process=True,
        model_kwargs={"device": "cpu"},
        encode_kwargs={"normalize_embeddings": True},
    )

    # One folder per (chunk size, embedding model) combination.
    model_slug = embedding_model_name.replace("/", "_")
    index_name = f"index_chunk:{chunk_size}_embeddings:{model_slug}"
    index_folder_path = f"./data/indexes/{index_name}/"

    if os.path.isdir(index_folder_path):
        # NOTE(review): some langchain_community releases may refuse to load a
        # pickled index unless deserialization is explicitly allowed; confirm
        # against the installed version.
        return FAISS.load_local(
            index_folder_path,
            embedder,
            distance_strategy=DistanceStrategy.COSINE,
        )

    print("Index not found, generating it...")
    chunks = split_documents(chunk_size, langchain_docs, embedding_model_name)
    knowledge_index = FAISS.from_documents(
        chunks,
        embedder,
        distance_strategy=DistanceStrategy.COSINE,
    )
    knowledge_index.save_local(index_folder_path)
    return knowledge_index



In [53]:
from langchain_core.vectorstores import VectorStore
from langchain_core.language_models.llms import LLM
from typing import List, Tuple

def answer_with_rag(
    question: str,
    llm: "LLM",
    knowledge_index: "VectorStore",
    num_retrieved_docs: int = 30,
    num_docs_final: int = 7,
    reranker = None,
) -> Tuple[str, List[str]]:
    """Answer `question` from documents retrieved out of `knowledge_index`.

    Args:
        question: the user question; also used as the retrieval query.
        llm: called as `llm(prompt)` and expected to return the answer text.
        knowledge_index: object exposing `similarity_search(query, k=...)`
            that returns documents with a `page_content` attribute.
        num_retrieved_docs: number of candidates fetched from the index.
        num_docs_final: number of documents kept for the prompt.
        reranker: optional object used to reorder the candidates before
            truncation. It is called as `rerank(question, texts, k=...)` and
            must return items whose "content" key holds the document text
            (the RAGatouille reranker interface; confirm for other rerankers).

    Returns:
        `(answer, relevant_docs)` where `relevant_docs` is the list of document
        texts that were put in the prompt.
    """
    # retrieve
    retrieved = knowledge_index.similarity_search(question, k=num_retrieved_docs)

    # keep only text content
    relevant_docs = [doc.page_content for doc in retrieved]

    # rerank the wider candidate pool when a reranker is supplied; without this
    # step the argument would be silently ignored
    if reranker is not None and relevant_docs:
        reranked = reranker.rerank(question, relevant_docs, k=num_docs_final)
        relevant_docs = [doc["content"] for doc in reranked]

    # truncate to final doc count
    relevant_docs = relevant_docs[:num_docs_final]

    # build prompt
    context = "\n\nExtracted documents:\n"
    context += "\n".join(
        [f"[Doc {i}]:\n{doc}" for i, doc in enumerate(relevant_docs)]
    )

    prompt = f"""
Answer the following question using the documents provided.

Question: {question}

{context}

Answer:
"""

    answer = llm(prompt)
    return answer, relevant_docs


In [44]:
import json
from langchain_core.language_models import BaseChatModel
from typing import Dict, List, Optional

try:
    from ragatouille import RAGPretrainedModel
except ImportError:
    RAGPretrainedModel = None

def run_rag_tests(
    eval_dataset: List[Dict],
    llm: BaseChatModel,
    knowledge_index: VectorStore,
    output_file: str,
    reranker: Optional[object] = None,
    verbose: Optional[bool] = True,
    test_settings: Optional[str] = None,  # To document the test settings used
):
    """Runs RAG tests on the given dataset and saves the results to the given output file.

    Args:
        eval_dataset: iterable of examples, each with "question", "answer" and
            "source_doc" keys.
        llm: forwarded to `answer_with_rag`.
        knowledge_index: forwarded to `answer_with_rag`.
        output_file: JSON file holding the list of results. Results already in
            the file are kept, and their questions are not asked again.
        reranker: forwarded to `answer_with_rag`.
        verbose: when true, print question, generated answer and true answer.
        test_settings: optional label stored with each new result.

    The results gathered so far are written to `output_file` even when the run
    stops on an error, so a later call can resume where this one stopped.
    """
    # Load previous generations if they exist. A missing, unreadable or invalid
    # file means starting from scratch.
    try:
        with open(output_file, "r") as f:
            results = json.load(f)
    except (OSError, ValueError):
        results = []

    # Set of answered questions for O(1) duplicate checks.
    answered = {result["question"] for result in results}

    try:
        for example in tqdm(eval_dataset):
            question = example["question"]
            if question in answered:
                continue

            answer, relevant_docs = answer_with_rag(
                question, llm, knowledge_index, reranker=reranker
            )

            if verbose:
                print("===========================================================")
                print(f"Question: {question}")
                print(f"Answer: {answer}")
                print(f"True answer: {example['answer']}")

            result = {
                "question": question,
                "true_answer": example["answer"],
                "source_doc": example["source_doc"],
                "generated_answer": answer,
                "retrieved_docs": list(relevant_docs),
            }
            if test_settings:
                result["test_settings"] = test_settings

            results.append(result)
            answered.add(question)
    finally:
        # Always persist what has been generated, including after a failure
        # or an interrupt partway through the dataset.
        with open(output_file, "w") as f:
            json.dump(results, f)



In [51]:
import numpy as np
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_core.documents import Document as LangchainDocument
from typing import List


class MockVectorStore:
    """In-memory stand-in for a vector store: brute-force cosine search over stored vectors."""

    def __init__(self, docs: List[str], vectors: np.ndarray, embedder):
        # docs[i] is the text whose embedding is vectors[i]; embedder provides
        # embed_query() for incoming queries.
        self.embedder = embedder
        self.vectors = vectors
        self.docs = docs

    def _cosine_sim(self, a, b):
        """Cosine similarity of `a` and `b`; the 1e-9 term avoids division by zero."""
        unit_a = a / (np.linalg.norm(a) + 1e-9)
        unit_b = b / (np.linalg.norm(b) + 1e-9)
        return np.dot(unit_a, unit_b)

    def similarity_search(self, query: str, k: int = 4):
        """Return the `k` stored texts most similar to `query`, best first, as documents."""
        query_vec = self.embedder.embed_query(query)
        scores = [
            self._cosine_sim(query_vec, self.vectors[idx])
            for idx in range(len(self.docs))
        ]
        # Stable descending sort: equal scores keep their original order.
        ranked = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)
        return [LangchainDocument(page_content=self.docs[idx]) for idx in ranked[:k]]

def load_embeddings_mock(langchain_docs, chunk_size, embedding_model_name="thenlper/gte-small"):
    """Build a MockVectorStore over the raw document texts.

    Each document's `page_content` is embedded as-is with `embed_query`.
    `chunk_size` is accepted but not used by this function.
    """
    embedder = HuggingFaceEmbeddings(model_name=embedding_model_name)

    texts = [document.page_content for document in langchain_docs]
    embedded = np.array(
        [embedder.embed_query(text) for text in texts],
        dtype=np.float32,
    )

    return MockVectorStore(texts, embedded, embedder)


In [55]:
from langchain_core.documents import Document as LangchainDocument

# Two-document toy knowledge base.
langchain_docs = [
    LangchainDocument(
        page_content="Retrieval-Augmented Generation (RAG) retrieves context before generating answers."
    ),
    LangchainDocument(
        page_content="FAISS is a vector database for fast similarity search."
    ),
]
chunk_size = 512

# The in-memory mock index is used here. load_embeddings() takes the same three
# arguments and can be swapped in to build a FAISS index instead.
knowledge_index = load_embeddings_mock(
    langchain_docs=langchain_docs,
    chunk_size=chunk_size,
    embedding_model_name="thenlper/gte-small",
)

# Questions with their reference answers and source documents.
eval_dataset = [
    dict(
        question="What is RAG?",
        answer="A method that retrieves context before generation.",
        source_doc="rag_intro.txt",
    ),
    dict(
        question="What is FAISS?",
        answer="A vector index for similarity search.",
        source_doc="faiss_overview.txt",
    ),
]

outputs_path = "rag_outputs.json"

# The lambda adapts call_llm to the single-argument llm(prompt) call that
# answer_with_rag makes.
run_rag_tests(
    eval_dataset=eval_dataset,
    llm=lambda prompt: call_llm(llm_client, prompt),
    knowledge_index=knowledge_index,
    output_file=outputs_path,
    reranker=None,
    verbose=True,
    test_settings="first-run",
)


 50%|█████     | 1/2 [00:01<00:01,  1.41s/it]

Question: What is RAG?
Answer: Based on the documents provided, RAG (Retrieval-Augmented Generation) is a method or system that retrieves context before generating answers.
True answer: A method that retrieves context before generation.


100%|██████████| 2/2 [00:02<00:00,  1.19s/it]

Question: What is FAISS?
Answer: FAISS is a vector database for fast similarity search.
True answer: A vector index for similarity search.



