# RAG Pipeline on Wikipedia Dataset

In [None]:
!pip install opendatasets faiss-cpu sentence_transformers
import nltk
import faiss
import numpy as np
import re
import opendatasets as od
import torch
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM



In [None]:
# Fetch the Simple English Wikipedia plain-text dataset from Kaggle into
# ./plain-text-wikipedia-simpleenglish. When the files are already present the
# download is skipped (pass force=True to re-download).
od.download('https://www.kaggle.com/datasets/ffatty/plain-text-wikipedia-simpleenglish')

Skipping, found downloaded files in "./plain-text-wikipedia-simpleenglish" (use force=True to force download)


## Chunking the text

In [None]:
# Read the whole corpus into memory as a single string. The encoding is pinned
# to UTF-8 so decoding does not depend on the platform's default locale.
with open('plain-text-wikipedia-simpleenglish/AllCombined.txt', 'r', encoding='utf-8') as f:
  text = f.read()

# Tokenizer data needed by nltk's sent_tokenize, which the chunking step uses.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
# Interactive Hugging Face login. Authentication is optional for public
# checkpoints such as the one loaded below.
from huggingface_hub import notebook_login
notebook_login()

# Seq2seq checkpoint used both for counting tokens while chunking and for
# generating answers.
model_name = "google/flan-t5-base"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, dtype=torch.float32)

# Prefer the GPU, but fall back to CPU so the notebook still runs on a
# runtime without CUDA. gen_answer() moves its inputs to model.device, so
# nothing downstream needs to know which device was picked.
model = model.to("cuda" if torch.cuda.is_available() else "cpu")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading weights:   0%|          | 0/282 [00:00<?, ?it/s]



In [None]:
# Greedy sentence packing: walk the corpus sentence by sentence and emit
# chunks whose token length (measured with the answerer's tokenizer, without
# special tokens) lies between min_tokens and max_tokens.
sentences = sent_tokenize(text)

min_tokens = 200
max_tokens = 300

chunks = []
current = []
token_count = 0

for sent in sentences:
  sent_tokens = len(tokenizer.encode(sent, add_special_tokens=False))

  # A sentence that is over budget on its own can never fit in a chunk;
  # skip it and keep accumulating into the chunk in progress.
  if sent_tokens > max_tokens:
    continue

  # Still room: extend the chunk in progress.
  if token_count + sent_tokens <= max_tokens:
    current.append(sent)
    token_count += sent_tokens
    continue

  # Overflow: emit the chunk in progress only if it reached the minimum size
  # (a shorter accumulation is discarded), then restart from this sentence.
  if token_count >= min_tokens:
    chunks.append(" ".join(current))
  current, token_count = [sent], sent_tokens

# Flush the trailing chunk under the same minimum-size rule.
if token_count >= min_tokens:
  chunks.append(" ".join(current))

Token indices sequence length is longer than the specified maximum sequence length for this model (3553 > 512). Running this sequence through the model will result in indexing errors


## Semantic Retriever

In [None]:
# Sentence-embedding model; the same instance encodes the corpus chunks for
# the index and each incoming query at retrieval time.
embedder = SentenceTransformer('all-MiniLM-L6-v2')

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


In [None]:
# Embed every chunk once. Vectors are explicitly unit-normalised so that the
# inner product computed by IndexFlatIP is a cosine similarity, which is the
# scale the retriever's similarity threshold is expressed in. Without this the
# scores are only cosine if the embedding model happens to normalise itself.
embeddings = embedder.encode(chunks, normalize_embeddings=True)

# Exact (brute-force) inner-product index sized to the embedding dimension.
index = faiss.IndexFlatIP(embeddings.shape[1])
index.add(embeddings)

In [None]:
def retrieve(query, k=3, min_sim=0.3):
  """Return the chunks most similar to ``query`` that clear a similarity floor.

  Args:
      query: Natural-language question to search for.
      k: Maximum number of nearest neighbours requested from the index.
      min_sim: Minimum inner-product score a hit needs in order to be kept.

  Returns:
      A list of at most ``k`` chunk strings, in the order the index returned
      them. The list is empty when no hit reaches ``min_sim``.
  """
  # Unit-normalise the query so the inner-product score is a cosine
  # similarity regardless of whether the embedding model normalises itself.
  query_embedding = embedder.encode([query], normalize_embeddings=True)
  scores, idx = index.search(query_embedding, k)

  # FAISS pads the result with label -1 when it has fewer than k hits; the
  # explicit check stops such a slot from ever being read as chunks[-1].
  return [
      chunks[i]
      for score, i in zip(scores[0], idx[0])
      if i >= 0 and score >= min_sim
  ]

## Information Check

In [None]:
def info_check(retrieved):
  """Tell whether retrieval produced at least one passage.

  Args:
      retrieved: Sized collection of retrieved passages.

  Returns:
      True when ``retrieved`` has one or more items, False when it is empty.
  """
  passage_count = len(retrieved)
  return passage_count != 0

## Building the Answerer LLM

In [None]:
def gen_answer(prompt):
  """Run the seq2seq model on ``prompt`` and return the decoded text.

  Args:
      prompt: Complete prompt string (instructions, context and question).

  Returns:
      The generated text with special tokens stripped.
  """
  inputs = tokenizer(prompt, return_tensors="pt")
  # Move every input tensor onto whichever device holds the model weights.
  inputs = {k: v.to(model.device) for k, v in inputs.items()}

  outputs = model.generate(
      **inputs,
      max_new_tokens=150,
      do_sample=False,  # no sampling, so a given prompt yields the same answer
  )

  # A single prompt was submitted, so only the first sequence is decoded.
  return tokenizer.decode(outputs[0], skip_special_tokens=True)

## Post Processing

In [None]:
def post_process(text):
    """Tidy a raw model answer and cap it at three sentences.

    Steps, in order:
      1. Remove non-nested parenthesised asides ``(...)``. When the aside is
         immediately followed by punctuation, the whitespace in front of it is
         removed too, so ``"Paris (x)."`` becomes ``"Paris."`` rather than
         ``"Paris ."``.
      2. Collapse every run of whitespace to one space and trim the ends.
      3. Split into sentences and append any fragment of fewer than six words
         to the sentence before it.
      4. Keep at most the first three resulting sentences.

    Args:
        text: Raw decoded model output.

    Returns:
        The cleaned answer as a single space-joined string.
    """
    # Parentheticals that sit right before punctuation: drop the leading
    # whitespace as well, otherwise a stray space is left before the mark.
    text = re.sub(r"\s+\([^)]*\)(?=[.,;:!?])", "", text)

    # Remove every remaining parenthetical.
    text = re.sub(r"\([^)]*\)", "", text)

    # Fix spacing
    text = re.sub(r"\s+", " ", text).strip()

    sentences = sent_tokenize(text)

    merged = []
    for s in sentences:
        # Very short fragments read badly on their own; glue them onto the
        # previous sentence (the first sentence is always kept as-is).
        if merged and len(s.split()) < 6:
            merged[-1] += " " + s
        else:
            merged.append(s)

    # Keep at most three sentences
    merged = merged[:3]

    return " ".join(merged)


## Question Answering

In [None]:
def truncate_context(chunks, tokenizer, max=1500):
  """Keep the leading chunks that fit within a token budget.

  Chunks are taken in order. Selection stops at the first chunk that would
  push the running token total past ``max``; later chunks are not examined
  even if they are smaller.

  Args:
      chunks: Iterable of text passages, best first.
      tokenizer: Object whose ``encode(text)`` returns a sized token sequence.
      max: Token budget for the combined passages.

  Returns:
      List with the longest prefix of ``chunks`` whose summed token count does
      not exceed ``max``.
  """
  selected = []
  running_total = 0

  for passage in chunks:
    running_total += len(tokenizer.encode(passage))
    # Over budget: stop here and discard this passage and everything after.
    if running_total > max:
      break
    selected.append(passage)

  return selected

In [None]:
def answer_question(question):
    """Answer ``question`` from the indexed corpus, or refuse.

    Retrieves up to three passages with similarity of at least 0.3, trims them
    to a 500-token budget, and asks the model to answer strictly from that
    text. If no passage survives, the fixed refusal sentence is returned
    without calling the model.

    Args:
        question: Natural-language question.

    Returns:
        The post-processed model answer, or the refusal sentence.
    """
    passages = truncate_context(retrieve(question, 3, 0.3), tokenizer, 500)

    # Nothing usable was retrieved: refuse up front instead of generating.
    if not info_check(passages):
        return "Not enough information in the Simple Wikipedia dataset."

    context = "\n".join(passages)
    prompt = f"""
    Use only the text below to answer the question.
Do not add new facts.
If the text does not answer the question, say:
"Not enough information in the Simple Wikipedia dataset."

Text:
{context}

Question:
{question}

Answer:

    """
    return post_process(gen_answer(prompt))


## Evaluation

### Working check

In [None]:
answer_question("What is capital of France?")

'Paris.'

### Type 1: Single chunk Factual Question
Hard-coded, simple fact-based questions.

In [None]:
answer_question("What is water?")

'molecule made of two hydrogen atoms and one oxygen atom.'

In [None]:
answer_question("What is sun?")

'A star like many others in our Milky Way galaxy. The Sun is a type of star called a G-type main-sequence star based on its spectral class. The Sun has existed for a little over 4.5 billion years.'

In [None]:
answer_question("What is mount_everest")

'Mount Everest is the highest mountain on Earth.'

### Type 2: Rewriting Check
Ensure that the model isn't simply copying the retrieved text.

In [None]:
answer_question("Why are bees important?")

'Pollinators for many plants.'

In [None]:
answer_question("What is electricity?")

'Electrical energy is mostly generated in places called power stations.'

### Type 3: Multi Fact Questions

In [None]:
answer_question("What was World War II and when was it fought??")

'World War II began in 1939.'

In [None]:
answer_question("Who were alies in World War II? Name countires.")

'Russia, France, the British Empire and later the United States.'

It struggles a little with multi-fact questions.

### Type 4: Refusal Test.
Check whether it responds correctly when the requested information is not in the dataset.

In [None]:
answer_question("Who was the president of United States in 2023?")

'Not enough information in the Simple Wikipedia dataset.'

In [None]:
answer_question("What is population of Mars?")

'Not enough information in the Simple Wikipedia dataset.'

### Type 5: Guess Test

We will check whether the model guesses.

In [None]:
answer_question("Who won World War III?")

'The United States and Western Europe'

In [None]:
answer_question("What did Isacc Newton tweet?")

'Not enough information in the Simple Wikipedia dataset.'

It still guesses some of the information.

### Conclusion

The RAG pipeline performs really well on most queries and falls back to a refusal when the information is not present. Overall, this is a solid implementation of the whole pipeline.

## Research Questions

**Q.1** How often does the agent hallucinate facts not present in retrieved text?

**Ans.** As the Type 5 questions show, if the question is absurd, like 'What did Isaac Newton tweet?', the model will not hallucinate. But if the question is closer to real life, like 'World War III', which might have some arbitrary reference in the text, the model has a higher chance of hallucinating.
Also, as the Type 4 questions show, when asked for explicit facts (like the population of Mars or the name of the president in 2023), the model is much less likely to guess.

**Q.2** How sensitive is answer quality to retrieval errors?

**Ans.** It is quite sensitive for small models like flan-t5-small, which tended to treat wrong retrievals as the only truth (for example, if the retrieved text contained any city name, it would report it as the capital of France without knowing whether the mention was really about the capital or just incidental). However, when we switched to larger models like flan-t5-base, we observed that the model better understood the grammatical structure of the text and could differentiate between values.

**Q.3** How does reliability change across 25M, 80M, and 250M parameter models?

**Ans.** It changes significantly. Although I could not find an appropriate 25M model with a context window large enough to run the pipeline, I tried flan-t5-small with 80M parameters. The major change in reliability came from the context size: the 512-token model could not take in enough retrieved text to answer accurately and chose to stay silent.

**Q.4** Which matters more: strict prompting or rule-based post-editing?

**Ans.** Based on my observations, strict prompting is much more important than post-editing. In this case, giving the model a prompt that tells it to refuse was much more helpful than trying to implement refusal via rule-based editing. As the model sizes improved, I could achieve better grammar and reduced dependence on rule-based post-editing.

**Q.5** How accurately does the agent refuse when information is missing?

**Ans.** As shown by Types 4 and 5, the agent will refuse if it does not have concrete facts, but if the retrieved text is vaguely related or contains mentions like 'World War 3', it will try to guess/hallucinate.