In [None]:
# Install the packages listed in requirements.txt; the %pip magic targets the environment of the running kernel.
%pip install -r requirements.txt

In [None]:
import pandas as pd
from os import path
from datasets import load_dataset
import rage_toolkit as rage
import re
import random

## Load Evaluation Datasets
- Load datasets from Hugging Face.
- Convert the datasets into five dataframes:
    - `queries_df`: Contains queries.
    - `corpus_df`: Contains a corpus of documents with relevant information for the queries.
    - `relevant_df`: Contains a mapping indicating which documents are relevant for a given query.
    - `irrelevant_df`: Contains a mapping indicating which documents are irrelevant for a given query.
    - `semirelevant_df`: Contains a mapping indicating which documents seem to be relevant for a given query but do not contain the information necessary to answer it.

In [None]:
# Hugging Face Hub repository that hosts the evaluation data.
dataset_path = 'othr-nlp/rage_nq'

# The repository is split into three configurations, each fetched on its own.
# token=True is passed on every call, so a Hugging Face login is presumably
# required on this machine -- NOTE(review): confirm whether the dataset is gated.
queries_ds = load_dataset(path=dataset_path, name="queries", token=True)
corpus_ds = load_dataset(path=dataset_path, name="corpus", token=True)
mapping_ds = load_dataset(path=dataset_path, name="mapping", token=True)

In [None]:
# Queries and corpus each come from the single split of their own configuration.
queries_df, corpus_df = (
    pd.DataFrame(split)
    for split in (queries_ds['queries'], corpus_ds['corpus'])
)

# The mapping configuration carries one split per relevance class.
relevant_df, irrelevant_df, semirelevant_df = (
    pd.DataFrame(mapping_ds[split_name])
    for split_name in ('relevant', 'irrelevant', 'seemingly_relevant')
)

## Define How a Prompt Should Be Generated

This is customizable. Below is the code that was used in the paper.

In [None]:
# Prompt sent to the LLM. It contains two named placeholders, {context} and {query},
# which are meant to be filled in with str.format(); any additional literal curly
# braces added to this text would therefore have to be doubled ({{ and }}).
prompt_template = """Instruction: Write an accurate, engaging and concise answer to the given question using only the documents provided (some of which may be irrelevant) and cite them properly using the format [<doc_id>]. 
For example, if a particular piece of information comes from document 3, cite it as [3]. 
Use an unbiased, journalistic tone. 
Always cite any factual statement.
Place citations at the end of the sentence, before the period.
Example: "Some random text citing documents 3 and 4 [3][4]". 
If you are citing multiple documents, use [1][2][3]. 
Cite at least one document and no more than three documents in each sentence. 
If multiple documents support the sentence, cite only a minimum sufficient subset of the documents.

Documents:

{context}

Question: {query}

Remember to answer as short as possible.
"""

class CustomPromptGenerator(rage.PromptGenerator):
    """Build the LLM prompt for a query from a template and a list of document texts.

    The documents are numbered from 1 in the order they are given and inserted
    into the template's ``{context}`` placeholder; the query goes into ``{query}``.
    Bracketed numeric markers (e.g. ``[12]``) already present inside a document are
    removed first so they cannot be confused with the citation IDs assigned here.
    """

    # Matches bracketed numeric markers such as "[3]" or "[12]".
    _CITATION_PATTERN = re.compile(r'\[\d+\]')

    def __init__(self, prompt_template: str):
        """Store the template; it must contain ``{query}`` and ``{context}`` placeholders."""
        self.prompt_template = prompt_template

    def generate_prompt(self, query_string: str, document_texts: list[str]) -> str:
        """Return the filled-in prompt for ``query_string`` and ``document_texts``.

        ``document_texts`` is not modified.
        """
        docs_string = self._generate_docs_string(document_texts)
        return self.prompt_template.format(query=query_string, context=docs_string)

    def _generate_docs_string(self, docs):
        """Return the numbered context block, one ``[n]: text`` entry per document."""
        # Work on a copy: _filter_cites rewrites its argument in place, and the
        # caller's list must not be altered as a side effect of building a prompt.
        cleaned_docs = list(docs)
        self._filter_cites(cleaned_docs)
        return "".join(
            f"[{position}]: {doc}\n\n"
            for position, doc in enumerate(cleaned_docs, start=1)
        )

    def _filter_cites(self, docs):
        """Strip bracketed numeric markers from every entry of ``docs``, in place."""
        for i in range(len(docs)):
            docs[i] = self._CITATION_PATTERN.sub("", docs[i])


## Define an LLM System
- A wrapper around the LLM you want to test.
- Again, this is fully custom.
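- The implementation below is a mock LLM that ignores the prompt and returns one to three randomly chosen citations, each to a document ID between 1 and 9 (the same ID may appear more than once).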

In [None]:
class CustomLLMSystem(rage.LlmSystem):
    """Mock LLM that ignores the prompt and answers with randomly chosen citations.

    Each call returns between one and ``max_citations`` citations of the form
    ``[<doc_id>]``, joined by ``", "``. Every ``doc_id`` is drawn independently
    from ``1..num_documents``, so the same document may be cited more than once.
    """

    def __init__(self, num_documents: int = 9, max_citations: int = 3, seed=None):
        """Configure the mock.

        Args:
            num_documents: Highest document ID that may be cited. The default of 9
                presumably matches the number of documents per query produced by
                ``proportions_mix=(3, 3, 3)`` -- adjust it when that mix changes.
            max_citations: Upper bound on the number of citations per answer.
            seed: Optional seed for a private random generator. When ``None``
                (default) the module-level ``random`` functions are used, exactly
                as before, so ``random.seed(...)`` still controls the output.

        Raises:
            ValueError: If ``num_documents`` or ``max_citations`` is smaller than 1.
        """
        if num_documents < 1:
            raise ValueError("num_documents must be at least 1")
        if max_citations < 1:
            raise ValueError("max_citations must be at least 1")
        self.num_documents = num_documents
        self.max_citations = max_citations
        self._rng = None if seed is None else random.Random(seed)

    def run_inference(self, prompt: str) -> str:
        """Return a random citation string; ``prompt`` is accepted but not used."""
        rng = random if self._rng is None else self._rng
        num_citations = rng.randint(1, self.max_citations)
        cited_ids = [rng.randint(1, self.num_documents) for _ in range(num_citations)]
        return ", ".join(f"[{doc_id}]" for doc_id in cited_ids)

## Define an Augmented Generation System

- This is the wrapper which is assumed by the evaluation script.
- It needs to implement the abstract methods `generate_answer()` and `delete()`:
    - During evaluation, `generate_answer()` receives a `query_string` and a mix of relevant, irrelevant, and seemingly relevant documents (see the paper) and is expected to produce an answer string containing citations in the format `[<doc_id>]`, where `doc_id` is the 1-based position of the cited document in the `document_texts` list (the first document is cited as `[1]`). The example implementation below returns the generated prompt together with the answer string.
    - `delete()` unloads a model from memory, which is useful when multiple models are tested in a row.
- Apart from this, the Augmented Generation System is also fully customizable.
- This is the only wrapper which is obligatory. The evaluation script expects an Augmented Generation System which inherits from `rage.AugmentedGenerationSystem`. The `rage.LlmSystem` and `rage.PromptGenerator` classes are only suggestions and do not have to be used.


In [None]:
class CustomAugmentedGenerationSystem(rage.AugmentedGenerationSystem):
    """Couples a prompt generator with an LLM system.

    ``generate_answer`` turns a query and its documents into a prompt, sends the
    prompt to the LLM system, and hands back both the prompt and the answer.
    """

    def __init__(self, llm_system: rage.LlmSystem, prompt_generator: rage.PromptGenerator):
        """Keep references to the two collaborating components."""
        self.prompt_generator: rage.PromptGenerator = prompt_generator
        self.llm_system: rage.LlmSystem = llm_system

    def generate_answer(self, query_string: str, document_texts: list[str]) -> tuple[str, str]:
        """Return ``(prompt, llm_answer)`` for the given query and documents."""
        built_prompt = self.prompt_generator.generate_prompt(
            query_string,
            document_texts=document_texts,
        )
        return built_prompt, self.llm_system.run_inference(built_prompt)

    def delete(self):
        """Drop the references to the LLM system and the prompt generator."""
        self.llm_system = self.prompt_generator = None

## Combine Everything

In [None]:
# Instantiate the two building blocks, then pass both to the wrapper that the
# evaluation call receives.
prompt_generator: rage.PromptGenerator = CustomPromptGenerator(prompt_template=prompt_template)
llm_system: rage.LlmSystem = CustomLLMSystem()
augmented_generation_system: rage.AugmentedGenerationSystem = CustomAugmentedGenerationSystem(
    llm_system=llm_system,
    prompt_generator=prompt_generator,
)

## Evaluate the Augmented Generation System with RAGE

In [None]:
# Seed Python's global RNG so that re-running this cell repeats the same random
# citations from the mock LLM (which draws from the `random` module).
# NOTE(review): whether rage.evaluate also draws from other random sources
# (e.g. NumPy) is not visible from this notebook -- confirm if full
# reproducibility of the document mix is required.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# Location handed to rage.evaluate for saving the results, and a label for this run.
evaluation_results_save_path = "results/"
run_id = "my_run_id"

# proportions_mix and num_queries are passed through unchanged; presumably the
# mix is (relevant, irrelevant, seemingly relevant) documents per query --
# TODO confirm against the rage_toolkit documentation.
queries_response_information, queries_evaluation_results = rage.evaluate(
    augmented_generation_system=augmented_generation_system,
    queries_df=queries_df,
    corpus_df=corpus_df,
    relevant_df=relevant_df,
    irrelevant_df=irrelevant_df,
    semirelevant_df=semirelevant_df,
    proportions_mix=(3, 3, 3),
    evaluation_results_save_path=evaluation_results_save_path,
    num_queries=10,
    run_id=run_id,
)

## Display Results

In [None]:
# Preview the first rows of the per-query response information (presumably a pandas DataFrame).
queries_response_information.head()

In [None]:
# Summarise the per-query evaluation results (presumably a pandas DataFrame, where describe() gives summary statistics).
queries_evaluation_results.describe()