# Retrieval Augmented Generation on the Natural Questions dataset

This notebook is an example of retrieval-augmented generation (RAG) on the Natural Questions dataset. Install the dependencies first:
```bash
pip install -r requirements.txt
```

In [1]:
import json
import re
import statistics
import warnings

import torch
from tqdm.auto import tqdm

from llments.datastore.pyserini_datastore import PyseriniDatastore
from llments.eval.rag import QAEvalContext
from llments.eval.rag import QAEvaluator
from llments.lm.base.hugging_face import HuggingFaceLM
from llments.lm.rag import RAGLanguageModel
warnings.simplefilter("ignore")

In [2]:
# Choose the compute device once; later cells pass `device` to the model and
# the datastore. The CPU is the fallback when PyTorch reports no CUDA device.
if not torch.cuda.is_available():
    device = torch.device("cpu")
    print("CUDA is not available. Using CPU.")
else:
    device = torch.device("cuda")
    print("CUDA is available! Using GPU.")

CUDA is available! Using GPU.


## Encode the Documents file provided in jsonl format

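The encoded document index is loaded when the datastore is created further below; the following cell first loads the base language model that both the baseline and the RAG pipeline use.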

In [3]:
# Load the base language model once; the baseline cells call it directly and
# the RAG wrappers below receive it as their `base` model.
language_model = HuggingFaceLM(
    'mistralai/Mistral-7B-v0.1',
    device=device,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

### Trial for RAG

In [14]:
# Datastore for the trial run: points at the passage index and the JSONL
# document file on disk and uses the ColBERT v2 checkpoint as index encoder.
datastore = PyseriniDatastore(
    index_path='/data/tir/projects/tir7/user_data/mihirban/NQ/colbert/NQ_index_passage-0',
    document_path='/data/tir/projects/tir7/user_data/mihirban/NQ/wiki_par.jsonl',
    index_encoder='colbert-ir/colbertv2.0',
    fields=['contents'],
    docid_field="id",
    pooling='mean',
    to_faiss=True,
    device=device,
)

In [15]:
# Wrap the base language model and the datastore into a single RAG model.
rag_LM = RAGLanguageModel(
    base=language_model,
    datastore=datastore,
    max_results=3,
)

Loading the index...
Index loaded successfully!
Loading the document file...
Documents loaded successfully!


In [16]:
# Override the max_results=3 given at construction time with 1.
# NOTE(review): presumably this caps the number of retrieved passages per
# query — confirm against RAGLanguageModel.
rag_LM.set_max_results(max_results=1)

In [17]:
# Smoke test: ask the RAG model a single question and keep its generations.
lm_response = rag_LM.generate(
    condition='when did the east india company take control of india?',
    max_new_tokens=10,
    temperature=0.7,
    num_return_sequences=1,
)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [18]:
# Print only the first line of every generated response.
for i, response in enumerate(lm_response):
    first_line = response.partition("\n")[0]
    print(f"Response {i} : " + first_line)

Response 0 : 1757


### Trial for baseline

In [12]:
# Baseline smoke test: send the same question to the bare language model,
# wrapped in a question/answer prompt, without any retrieval.
condition = "when did the east india company take control of india?"
prompt = f"Please answer the following question.\nQuestion: {condition}\nAnswer: "
raw_generations = language_model.generate(
    condition=prompt,
    max_new_tokens=50,
    temperature=0.7,
    num_return_sequences=1,
)
# Keep the segment that directly follows the first "Answer: " marker.
lm_response = [generated.split("Answer: ")[1].strip() for generated in raw_generations]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [13]:
# Print only the first line of every baseline response.
for i, response in enumerate(lm_response):
    first_line = response.partition("\n")[0]
    print(f"Response {i} : " + first_line)

Response 0 : 1757


## Evaluation

In [8]:
# Load the gold evaluation items from the JSON file. The evaluation loops
# below iterate over `data` and read item['input'] and
# item['output']['answer_set'] from each entry.
# The encoding is pinned to UTF-8 because the gold answers contain non-ASCII
# text; without it, decoding depends on the machine's locale.
with open('/data/tir/projects/tir7/user_data/mihirban/NQ/gold_nq_zeno_file.json', encoding='utf-8') as f:
    data = json.load(f)

### Baseline (without RAG)

In [32]:
# Baseline evaluation (no retrieval): prompt the bare language model with the
# first NUM_EVAL_EXAMPLES questions and collect one hypothesis per question
# together with its gold answers.
NUM_EVAL_EXAMPLES = 6  # the previous counter-and-break loop processed items 0..5
ANSWER_MARKER = "Answer: "

answer_set_list = []
outputs = []

# Slicing up front replaces the manual counter/break and makes the progress
# bar total match the number of items that are actually processed.
for item in tqdm(data[:NUM_EVAL_EXAMPLES], desc="Processing"):
    # Extract answers and input for each item
    answer_set = item['output']['answer_set']
    condition = item['input'] + "?"
    prompt = "Please answer the following question.\nQuestion: " + condition + "\nAnswer: "
    generations = language_model.generate(condition=prompt, max_new_tokens=100, temperature=0.7, num_return_sequences=1)

    lm_response = []
    for generation in generations:
        # Keep the segment right after the first marker; fall back to the whole
        # generation when the marker is missing instead of raising IndexError.
        parts = generation.split(ANSWER_MARKER)
        lm_response.append((parts[1] if len(parts) > 1 else generation).strip())

    # An empty generation list yields an empty hypothesis so that `outputs`
    # stays aligned with `answer_set_list`.
    first_response = lm_response[0] if lm_response else ""
    # Drop any follow-up question the model kept generating after its answer.
    outputs.append(first_response.split("\nQuestion")[0])
    answer_set_list.append(QAEvalContext(answer_set))

Processing:   0%|          | 0/2837 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [33]:
# Score the baseline hypotheses against the gold answers with four metrics.
# Every evaluator receives the same batch arguments.
batch_kwargs = dict(hyps=outputs, contexts=answer_set_list, show_progress=True)

evaluator_f1 = QAEvaluator(metric='f1')
f1 = evaluator_f1.evaluate_batch(**batch_kwargs)

evaluator_accuracy = QAEvaluator(metric='accuracy')
acc = evaluator_accuracy.evaluate_batch(**batch_kwargs)

evaluator_exact_match = QAEvaluator(metric='exact_match')
em_acc = evaluator_exact_match.evaluate_batch(**batch_kwargs)

evaluator_rougel = QAEvaluator(metric='rougel')
rouge = evaluator_rougel.evaluate_batch(**batch_kwargs)

# Average the per-example scores of each metric.
mean_f1, mean_acc, mean_em_acc, mean_rouge = (
    statistics.mean(scores) for scores in (f1, acc, em_acc, rouge)
)

Evaluating: 100%|██████████| 6/6 [00:00<00:00, 5837.58it/s]
Evaluating: 100%|██████████| 6/6 [00:00<00:00, 29127.11it/s]
Evaluating: 100%|██████████| 6/6 [00:00<00:00, 8895.66it/s]
Evaluating: 100%|██████████| 6/6 [00:00<00:00, 1788.62it/s]


In [34]:
# Report the averaged baseline metrics, one per line.
metric_report = (
    ("Mean F1:", mean_f1),
    ("Mean Accuracy:", mean_acc),
    ("Mean Exact Match Accuracy:", mean_em_acc),
    ("Mean ROUGE-L:", mean_rouge),
)
for label, value in metric_report:
    print(label, value)

Mean F1: 0.27586659886431053
Mean Accuracy: 0.16666666666666666
Mean Exact Match Accuracy: 0.16666666666666666
Mean ROUGE-L: 0.3047501501134335


### RAG

In [4]:
# Datastore for the evaluation run: points at the passage index and the JSONL
# document file on disk and uses the ColBERT v2 checkpoint as index encoder.
datastore_nq = PyseriniDatastore(
    index_path='/data/tir/projects/tir7/user_data/mihirban/NQ/colbert/NQ_index_passage-0',
    document_path='/data/tir/projects/tir7/user_data/mihirban/NQ/wiki_par.jsonl',
    index_encoder='colbert-ir/colbertv2.0',
    fields=['contents'],
    docid_field="id",
    pooling='mean',
    to_faiss=True,
    device=device,
)

In [5]:
# Wrap the base language model and the evaluation datastore into a RAG model.
rag_LM_nq = RAGLanguageModel(
    base=language_model,
    datastore=datastore_nq,
    max_results=3,
)

Loading the index...
Index loaded successfully!
Loading the document file...
Documents loaded successfully!


In [6]:
# Override the max_results=3 given at construction time with 1.
# NOTE(review): presumably this caps the number of retrieved passages per
# query — confirm against RAGLanguageModel.
rag_LM_nq.set_max_results(max_results=1)

In [18]:
# RAG evaluation: ask the retrieval-augmented model the first
# NUM_EVAL_EXAMPLES questions and collect one hypothesis per question
# together with its gold answers.
NUM_EVAL_EXAMPLES = 6  # the previous counter-and-break loop processed items 0..5
ANSWER_MARKER = "Answer: "
# Text after any of these markers is a continuation, not part of the answer.
# The "." is escaped so it only matches a literal full stop.
ANSWER_END_PATTERN = re.compile(r"Please answer the following question\.|\nQuestion|\nContext")

answer_set_list = []
outputs = []

# Slicing up front replaces the manual counter/break and makes the progress
# bar total match the number of items that are actually processed.
for item in tqdm(data[:NUM_EVAL_EXAMPLES], desc="Processing"):
    # Extract answers and input for each item
    answer_set = item['output']['answer_set']
    condition = item['input'] + "?"

    generations = rag_LM_nq.generate(condition=condition, max_new_tokens=100, temperature=0.7, num_return_sequences=1)

    lm_response = []
    for generation in generations:
        # Keep the segment right after the first marker; fall back to the whole
        # generation when the marker is missing instead of raising IndexError.
        parts = generation.split(ANSWER_MARKER)
        lm_response.append((parts[1] if len(parts) > 1 else generation).strip())

    # An empty generation list yields an empty hypothesis so that `outputs`
    # stays aligned with `answer_set_list`.
    first_response = lm_response[0] if lm_response else ""
    # Truncate once and reuse the result for both the stored and printed text.
    answer = ANSWER_END_PATTERN.split(first_response)[0]
    outputs.append(answer)
    print(answer)
    answer_set_list.append(QAEvalContext(answer_set))

Processing:   0%|          | 0/2837 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


3 dots mean that the number is infinite.



Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


1. Paul McCartney
2. John Lennon
3. George Harrison
4. Ringo Starr



Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


1. Coldplay
2. Beyonce
3. Bruno Mars
4. Rihanna



Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


1. Davos, Switzerland.
2. Osaka, Japan.
3. New York, USA.
4. London, UK.



Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


1. Redwood National Park is located in the northwestern corner of California, along the coast.
2. The park is made up of four separate sections: Del Norte Coast, Jedediah Smith, Prairie Creek, and Redwood Creek.
3. The park is home to the tallest trees in the world, the giant redwoods.
4. The park is also home to a variety of other plants and animals, including black bears, elk, and


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


669




In [19]:
# Score the RAG hypotheses against the gold answers with four metrics.
# Every evaluator receives the same batch arguments.
batch_kwargs = dict(hyps=outputs, contexts=answer_set_list, show_progress=True)

evaluator_f1 = QAEvaluator(metric='f1')
f1 = evaluator_f1.evaluate_batch(**batch_kwargs)

evaluator_accuracy = QAEvaluator(metric='accuracy')
acc = evaluator_accuracy.evaluate_batch(**batch_kwargs)

evaluator_exact_match = QAEvaluator(metric='exact_match')
em_acc = evaluator_exact_match.evaluate_batch(**batch_kwargs)

evaluator_rougel = QAEvaluator(metric='rougel')
rouge = evaluator_rougel.evaluate_batch(**batch_kwargs)

# Average the per-example scores of each metric.
mean_f1, mean_acc, mean_em_acc, mean_rouge = (
    statistics.mean(scores) for scores in (f1, acc, em_acc, rouge)
)

Evaluating: 100%|██████████| 6/6 [00:00<00:00, 7509.94it/s]
Evaluating: 100%|██████████| 6/6 [00:00<00:00, 32181.36it/s]
Evaluating: 100%|██████████| 6/6 [00:00<00:00, 9221.63it/s]
Evaluating: 100%|██████████| 6/6 [00:00<00:00, 1904.91it/s]


In [20]:
# Report the averaged RAG metrics, one per line.
metric_report = (
    ("Mean F1:", mean_f1),
    ("Mean Accuracy:", mean_acc),
    ("Mean Exact Match Accuracy:", mean_em_acc),
    ("Mean ROUGE-L:", mean_rouge),
)
for label, value in metric_report:
    print(label, value)

Mean F1: 0.1711995733734864
Mean Accuracy: 0
Mean Exact Match Accuracy: 0.0
Mean ROUGE-L: 0.16416916158942982


In [21]:
# Display the hypotheses collected by the RAG evaluation loop for inspection.
outputs

['3 dots mean that the number is infinite.\n',
 '1. Paul McCartney\n2. John Lennon\n3. George Harrison\n4. Ringo Starr\n',
 '1. Coldplay\n2. Beyonce\n3. Bruno Mars\n4. Rihanna\n',
 '1. Davos, Switzerland.\n2. Osaka, Japan.\n3. New York, USA.\n4. London, UK.\n',
 '1. Redwood National Park is located in the northwestern corner of California, along the coast.\n2. The park is made up of four separate sections: Del Norte Coast, Jedediah Smith, Prairie Creek, and Redwood Creek.\n3. The park is home to the tallest trees in the world, the giant redwoods.\n4. The park is also home to a variety of other plants and animals, including black bears, elk, and',
 '669\n\n']

In [20]:
# Display the gold-answer contexts, in the same order as the hypotheses.
answer_set_list

[QAEvalContext(gold_answers=['the therefore sign ( ∴ ) is generally used before a logical consequence , such as the conclusion of a syllogism', 'a logical consequence , such as the conclusion of a syllogism', 'therefore sign', 'the therefore sign']),
 QAEvalContext(gold_answers=['George Harrison', 'Richard Starkey', 'Ringo Starr']),
 QAEvalContext(gold_answers=['Beyoncé', 'Coldplay with special guest performers Beyoncé and Bruno Mars', 'British rock group Coldplay with special guest performers Beyoncé and Bruno Mars', 'Bruno Mars', 'Coldplay']),
 QAEvalContext(gold_answers=['Davos', 'Davos , a mountain resort in Graubünden , in the eastern Alps region of Switzerland']),
 QAEvalContext(gold_answers=['from the northern California coast north to the southern Oregon Coast', 'the coast of northern California', 'Del Norte County', 'Humboldt County']),
 QAEvalContext(gold_answers=['Gareth Barry'])]