# Retrieval Augmented Generation on the Natural Questions dataset

This notebook is an example of retrieval-augmented generation (RAG) on the Natural Questions dataset. Install the dependencies before running it:
```bash
pip install -r requirements.txt
```

In [1]:
from llments.datastore.pyserini_datastore import PyseriniDatastore
from llments.eval.rag import RAGEvaluator
from llments.eval.rag import RAGEvalContext
from llments.lm.base.hugging_face import HuggingFaceLM
from llments.lm.rag import RAGLanguageModel
from tqdm.auto import tqdm
import json
import statistics
import torch
import warnings
warnings.simplefilter("ignore")

In [2]:
# Choose the compute device once: the GPU when PyTorch can see one,
# otherwise fall back to the CPU. Later cells pass `device` to the model
# and to the datastore.
has_cuda = torch.cuda.is_available()
device = torch.device("cuda" if has_cuda else "cpu")
print("CUDA is available! Using GPU." if has_cuda else "CUDA is not available. Using CPU.")

CUDA is available! Using GPU.


## Encode the documents file provided in JSONL format

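The following code loads the base language model. The document encoding itself is configured in the `PyseriniDatastore` cell under "Trial for RAG" (via its `document_path` and `index_encoder` arguments).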

In [3]:
# Load the base language model once on the selected device; the same instance
# is used directly for the baseline and wrapped by RAGLanguageModel for RAG.
base_model_name = 'mistralai/Mistral-7B-v0.1'
language_model = HuggingFaceLM(base_model_name, device=device)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

### Trial for RAG

In [None]:
# Retrieval datastore over the Wikipedia passage file.
# NOTE(review): presumably this encodes the JSONL passages with the ColBERT
# encoder and builds a FAISS index under `index_path` -- confirm against
# PyseriniDatastore. `document_path` is a machine-specific absolute path;
# adjust it for your environment.
datastore = PyseriniDatastore(
    index_path='examples/rag/NQ_index_passages',
    document_path='/data/tir/projects/tir7/user_data/mihirban/NQ/wiki_par.jsonl',
    index_encoder='colbert-ir/colbertv2.0',
    fields=['contents'],
    docid_field="id",
    pooling='mean',
    to_faiss=True,
    device=device,
    shard_id=0,
    shard_num=10,
)

In [9]:
# Wrap the base model with the retrieval datastore for the single-question trial.
# NOTE(review): max_results=1 presumably limits retrieval to one passage per
# query -- confirm against RAGLanguageModel.
rag_LM = RAGLanguageModel(
    base=language_model,
    datastore=datastore,
    max_results=1,
)

In [36]:
# Single-question smoke test of the RAG pipeline; sampling is stochastic
# (temperature=0.7), so the generated text varies between runs.
trial_question = 'when did the east india company take control of india?'
lm_response = rag_LM.generate(
    condition=trial_question,
    max_new_tokens=30,
    temperature=0.7,
    num_return_sequences=1,
)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [37]:
# Show each returned sequence together with its position in the response list.
for idx, text in enumerate(lm_response):
    print(f"Response {idx} : " + text)

Response 0 : 28 March – Bombay is granted to the East India Company.

Question: when did the East India Company take control of india?


### Trial for baseline

In [12]:
# Ask the same trial question without retrieval, using a plain QA prompt.
condition = "when did the east india company take control of india?"
prompt = "Please answer the following question.\nQuestion: " + condition + "\nAnswer: "
raw_generations = language_model.generate(
    condition=prompt,
    max_new_tokens=10,
    temperature=0.7,
    num_return_sequences=1,
)

# Keep the segment that directly follows the first "Answer: " marker
# (index 1 of the split), with surrounding whitespace removed.
lm_response = []
for generated in raw_generations:
    lm_response.append(generated.split("Answer: ")[1].strip())

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [13]:
# Print every baseline answer, numbered by its index in the response list.
for position, answer in enumerate(lm_response):
    print(f"Response {position} : " + answer)

Response 0 : 1757

Question: who was


## Evaluation

In [4]:
# Load the gold evaluation file. Each item is read later through
# item['input'] and item['output']['answer_set'].
# The encoding is pinned to UTF-8 (the encoding JSON is specified to use)
# so decoding does not depend on the machine's locale default.
with open('/data/tir/projects/tir7/user_data/mihirban/NQ/gold_nq_zeno_file.json', encoding='utf-8') as f:
    data = json.load(f)

### Baseline (without RAG)

In [5]:
# Number of questions to evaluate. 51 matches the "51/51" counts shown in the
# recorded evaluation output of this section.
NUM_EVAL_EXAMPLES = 51

answer_set_list = []  # one RAGEvalContext (gold answers) per evaluated question
outputs = []          # first generated answer per evaluated question

# Slice up front so tqdm reports the real number of evaluated items and no
# hand-maintained counter is needed. `data` is the list loaded from the gold
# JSON file; a shorter list is simply evaluated in full.
for item in tqdm(data[:NUM_EVAL_EXAMPLES], desc="Processing"):
    # Extract the gold answers and build the QA prompt for this item.
    answer_set = item['output']['answer_set']
    condition = item['input'] + "?"
    prompt = "Please answer the following question.\nQuestion: " + condition + "\nAnswer: "

    generations = language_model.generate(condition=prompt, max_new_tokens=10, temperature=0.7, num_return_sequences=1)
    # Keep the segment that directly follows the first "Answer: " marker.
    lm_response = [text.split("Answer: ")[1].strip() for text in generations]

    outputs.append(lm_response[0])
    answer_set_list.append(RAGEvalContext(answer_set))

Processing:   0%|          | 0/2837 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 

In [6]:
# Score the collected outputs against the gold answer sets with each metric,
# in a fixed order: a fresh RAGEvaluator is built per metric and run
# immediately, so progress bars appear in the order listed here.
metric_names = ("f1", "accuracy", "exact_match", "rougel")
scores_by_metric = {
    name: RAGEvaluator(metric=name).evaluate_batch(
        hyps=outputs, contexts=answer_set_list, show_progress=True
    )
    for name in metric_names
}
f1, acc, em_acc, rouge = (scores_by_metric[name] for name in metric_names)

# Average the per-example scores of each metric.
mean_f1, mean_acc, mean_em_acc, mean_rouge = (
    statistics.mean(per_example) for per_example in (f1, acc, em_acc, rouge)
)

Evaluating: 100%|██████████| 51/51 [00:00<00:00, 7906.47it/s]
Evaluating: 100%|██████████| 51/51 [00:00<00:00, 86253.83it/s]
Evaluating: 100%|██████████| 51/51 [00:00<00:00, 27831.06it/s]
Evaluating: 100%|██████████| 51/51 [00:00<00:00, 4799.19it/s]


In [35]:
# Report the aggregate baseline score for each metric, one per line.
summary_rows = (
    ("Mean F1:", mean_f1),
    ("Mean Accuracy:", mean_acc),
    ("Mean Exact Match Accuracy:", mean_em_acc),
    ("Mean ROUGE-L:", mean_rouge),
)
for label, score in summary_rows:
    print(label, score)

Mean F1: 0.17200647935942054
Mean Accuracy: 0
Mean Exact Match Accuracy: 0
Mean ROUGE-L: 0.1363216288018862


### RAG

In [None]:
# Number of questions to evaluate. 51 is the same subset size the baseline
# section evaluates (its recorded output shows 51/51), so the two runs are
# comparable.
NUM_EVAL_EXAMPLES = 51

answer_set_list = []  # one RAGEvalContext (gold answers) per evaluated question
outputs = []          # first generated answer per evaluated question

# Rebuild the RAG wrapper for evaluation with max_results=3 (the trial above
# used max_results=1).
rag_LM = RAGLanguageModel(base=language_model, datastore=datastore, max_results=3)

# Slice up front so tqdm reports the real number of evaluated items and no
# hand-maintained counter is needed. `data` is the list loaded from the gold
# JSON file; a shorter list is simply evaluated in full.
for item in tqdm(data[:NUM_EVAL_EXAMPLES], desc="Processing"):
    # Extract the gold answers and the question text for this item.
    answer_set = item['output']['answer_set']
    condition = item['input'] + "?"

    lm_response = rag_LM.generate(condition=condition, max_new_tokens=30, temperature=0.7, num_return_sequences=1)

    outputs.append(lm_response[0])
    answer_set_list.append(RAGEvalContext(answer_set))

In [None]:
# Score the RAG outputs with the same four metrics as the baseline section.
# The evaluator is constructed as RAGEvaluator(metric=...) and evaluate_batch
# is called without a `metric` argument, matching the baseline evaluation
# cell, which is the form that has recorded output in this notebook.
evaluator_f1 = RAGEvaluator(metric='f1')
f1 = evaluator_f1.evaluate_batch(hyps=outputs, contexts=answer_set_list, show_progress=True)

evaluator_accuracy = RAGEvaluator(metric='accuracy')
acc = evaluator_accuracy.evaluate_batch(hyps=outputs, contexts=answer_set_list, show_progress=True)

evaluator_exact_match = RAGEvaluator(metric='exact_match')
em_acc = evaluator_exact_match.evaluate_batch(hyps=outputs, contexts=answer_set_list, show_progress=True)

evaluator_rougel = RAGEvaluator(metric='rougel')
rouge = evaluator_rougel.evaluate_batch(hyps=outputs, contexts=answer_set_list, show_progress=True)

# Average the per-example scores of each metric.
mean_f1 = statistics.mean(f1)
mean_acc = statistics.mean(acc)
mean_em_acc = statistics.mean(em_acc)
mean_rouge = statistics.mean(rouge)

In [None]:
# Report the aggregate RAG score for each metric, one per line.
rag_summary = [
    ("Mean F1:", mean_f1),
    ("Mean Accuracy:", mean_acc),
    ("Mean Exact Match Accuracy:", mean_em_acc),
    ("Mean ROUGE-L:", mean_rouge),
]
for heading, value in rag_summary:
    print(heading, value)