<a href="https://colab.research.google.com/github/pathrikarsatwit/SecureRAGFramework/blob/main/Baseline_RAG_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Requirements

In [None]:
!pip install torch
!pip install transformers
!pip install sentence-transformers
!pip install faiss-cpu
!pip install datasets
!pip install accelerate



# Knowledge Corpus


In [None]:
from datasets import load_dataset

ds = load_dataset("sentence-transformers/wikipedia-sections", "pair", split="train[:1000]")
print("ds: ", ds)
documents = [row["anchor"] for row in ds]
print("Total sections:", len(documents))


ds:  Dataset({
    features: ['anchor', 'positive'],
    num_rows: 1000
})
Total sections: 1000


# Chunking

In [None]:
# from typing import List
# import tiktoken

# def chunk_text(text: str, chunk_tokens=400, overlap=50):
#     enc = tiktoken.get_encoding("cl100k_base")
#     tokens = enc.encode(text)

#     chunks = []
#     start = 0
#     while start < len(tokens):
#         end = start + chunk_tokens
#         chunk = enc.decode(tokens[start:end])
#         chunks.append(chunk)
#         start = end - overlap
#         if start < 0:
#             start = 0
#     return chunks

# documents = []
# for row in ds:
#     chunks = chunk_text(row["text"])
#     documents.extend(chunks)

# print("Total chunks:", len(documents))


# Embedding

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np

embed_model = SentenceTransformer("all-MiniLM-L6-v2", device="cuda")
embeddings = embed_model.encode(documents, batch_size=64, show_progress_bar=True)
embeddings = np.array(embeddings).astype("float32")
print("Embedding shape:", embeddings.shape)


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Embedding shape: (1000, 384)


# Indexing

In [None]:
import faiss

dim = embeddings.shape[1]
index = faiss.IndexFlatL2(dim)
index.add(embeddings)
print("FAISS index size:", index.ntotal)

FAISS index size: 1000


# Retrieval

In [None]:
def embed_query(query: str):
    vec = embed_model.encode([query])
    return np.array(vec).astype("float32")

def retrieve(query, k=5):
    qvec = embed_query(query)
    distances, indices = index.search(qvec, k)
    return [documents[i] for i in indices[0]]

In [None]:
query = "Where is Fort d'Embourg located?"
docs = retrieve(query, k=3)
for d in docs:
    print("\n---\n", d[:300])


---
 The Fort d'Embourg is located about 7km southeast of the center of Liège, on the heights above the community of Chaudfontaine, overlooking the Vesdre valley.

---
 Embourg's armament included a Grüsonwerke turret with a single 21 cm Krupp gun, a15cm Creusot turret with twin guns and a 12 cm Châtillon-Commentry turret with two Krupp guns, all for distant targets.

---
 Embourg's armament included a Grüsonwerke turret with a single 21 cm Krupp gun, a15cm Creusot turret with twin guns and a 12 cm Châtillon-Commentry turret with two Krupp guns, all for distant targets.


# Re Ranking

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

reranker_model = AutoModelForSequenceClassification.from_pretrained(
    "cross-encoder/ms-marco-MiniLM-L-6-v2"
)

reranker_tokenizer = AutoTokenizer.from_pretrained(
    "cross-encoder/ms-marco-MiniLM-L-6-v2"
)

# 🔧 IMPORTANT FIX FOR PAD TOKEN
if reranker_tokenizer.pad_token is None:
    reranker_tokenizer.pad_token = reranker_tokenizer.eos_token

def rerank(query, docs, batch_size=8):
    scored = []
    for i in range(0, len(docs), batch_size):
        batch = docs[i:i+batch_size]
        inputs = reranker_tokenizer(
            [query] * len(batch),
            batch,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512
        )
        with torch.no_grad():
            scores = reranker_model(**inputs).logits.squeeze(-1)

        for doc, score in zip(batch, scores):
            scored.append((doc, score.item()))

    scored.sort(key=lambda x: x[1], reverse=True)
    return scored


Loading weights:   0%|          | 0/105 [00:00<?, ?it/s]

BertForSequenceClassification LOAD REPORT from: cross-encoder/ms-marco-MiniLM-L-6-v2
Key                          | Status     |  | 
-----------------------------+------------+--+-
bert.embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


In [None]:
ranked = rerank(query, docs)
best_context = ranked[0][0]
print("\nBest Context:\n", best_context[:800])



Best Context:
 The Fort d'Embourg is located about 7km southeast of the center of Liège, on the heights above the community of Chaudfontaine, overlooking the Vesdre valley.


# Model

In [None]:
from huggingface_hub import login
from google.colab import userdata

HF_TOKEN = userdata.get('HF_TOKEN')


if HF_TOKEN:
    login(HF_TOKEN)
    print("Successfully logged in to Hugging Face!")
else:
    print("HF_TOKEN not found in Colab secrets. Please add it.")


Successfully logged in to Hugging Face!


In [None]:
# !hf auth login

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

model_id = "meta-llama/Llama-3.2-3B-Instruct"

llama_tokenizer = AutoTokenizer.from_pretrained(model_id, token=True)
llama_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    token=True,
    device_map="auto",
    torch_dtype=torch.float16
)


llama_tokenizer.pad_token = llama_tokenizer.eos_token


config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Loading weights:   0%|          | 0/254 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

# Generation

In [None]:
def generate_answer(context, query):
    prompt = f"""You are a helpful assistant.
Answer the question using only the information in the provided context.


Context:
{context}

Question:
{query}

Answer:"""

    inputs = llama_tokenizer(
        prompt,
        return_tensors="pt",
        padding=True
    ).to(llama_model.device)

    output = llama_model.generate(
        **inputs,
        max_new_tokens=150,
        do_sample=False
    )

    return llama_tokenizer.decode(output[0], skip_special_tokens=True)


In [None]:
print(generate_answer(best_context, query))


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


You are a helpful assistant.
Answer the question using only the information in the provided context.


Context:
The Fort d'Embourg is located about 7km southeast of the center of Liège, on the heights above the community of Chaudfontaine, overlooking the Vesdre valley.

Question:
Where is Fort d'Embourg located?

Answer: Fort d'Embourg is located about 7km southeast of the center of Liège, on the heights above the community of Chaudfontaine, overlooking the Vesdre valley.


# End to End Pipeline

In [None]:
def rag_pipeline(query, k=5):

    retrieved = retrieve(query, k=k)

    ranked = rerank(query, retrieved)


    best_context = ranked[0][0]


    answer = generate_answer(best_context, query)

    return answer


In [None]:
query = "Where is Fort d'Embourg located?"
answer = rag_pipeline(query, k=5)
print("\nRAG Answer:\n", answer)


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



RAG Answer:
 You are a helpful assistant.
Answer the question using only the information in the provided context.


Context:
The Fort d'Embourg is located about 7km southeast of the center of Liège, on the heights above the community of Chaudfontaine, overlooking the Vesdre valley.

Question:
Where is Fort d'Embourg located?

Answer: Fort d'Embourg is located about 7km southeast of the center of Liège, on the heights above the community of Chaudfontaine, overlooking the Vesdre valley.


In [None]:
query = "Where is France?"
answer = rag_pipeline(query, k=5)
print("\nRAG Answer:\n", answer)


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



RAG Answer:
 You are a helpful assistant.
Answer the question using only the information in the provided context.


Context:
The Fort d'Embourg is located about 7km southeast of the center of Liège, on the heights above the community of Chaudfontaine, overlooking the Vesdre valley.

Question:
Where is France?

Answer: France is not mentioned in the context. The context only mentions the location of the Fort d'Embourg in relation to Liège, Chaudfontaine, and the Vesdre valley. It does not provide any information about France.


# Testing

In [None]:
# Evaluation Dataset
eval_samples = ds.shuffle(seed=42).select(range(100))  # 100 random samples

queries = [row["anchor"] for row in eval_samples]
gold_texts = [row["positive"] for row in eval_samples]


In [None]:
def recall_at_k(queries, gold_texts, k=5):
    hits = 0
    logs = []

    for q, gold in zip(queries, gold_texts):
        retrieved = retrieve(q, k=k)

        hit = any(gold.strip() in doc for doc in retrieved)
        hits += int(hit)

        logs.append({
            "query": q,
            "hit": hit
        })

    return hits / len(queries), logs


In [None]:
recall5, recall_logs = recall_at_k(queries, gold_texts, k=5)
print("Recall@5:", recall5)


Recall@5: 0.0


In [None]:
def semantic_similarity_eval(queries, gold_texts):
    sims = []
    logs = []

    for q, gold in zip(queries, gold_texts):
        answer = rag_pipeline(q)

        ans_emb = embed_model.encode(answer)
        gold_emb = embed_model.encode(gold)

        sim = float(
            np.dot(ans_emb, gold_emb) /
            (np.linalg.norm(ans_emb) * np.linalg.norm(gold_emb))
        )

        sims.append(sim)
        logs.append({
            "query": q,
            "similarity": sim
        })

    return float(np.mean(sims)), logs


In [None]:
avg_similarity, sim_logs = semantic_similarity_eval(queries, gold_texts)
print("Avg semantic similarity:", avg_similarity)


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for

Avg semantic similarity: 0.3366499989107251


In [None]:
def hallucination_rate(queries, threshold=0.6):
    hallucinations = 0
    logs = []

    for q in queries:
        retrieved = retrieve(q, k=1)
        context = retrieved[0]
        answer = rag_pipeline(q)

        ans_emb = embed_model.encode(answer)
        ctx_emb = embed_model.encode(context)

        sim = float(
            np.dot(ans_emb, ctx_emb) /
            (np.linalg.norm(ans_emb) * np.linalg.norm(ctx_emb))
        )

        hallucinated = sim < threshold
        hallucinations += int(hallucinated)

        logs.append({
            "query": q,
            "similarity": sim,
            "hallucinated": hallucinated
        })

    return hallucinations / len(queries), logs


In [None]:
hall_rate, hall_logs = hallucination_rate(queries)
print("Hallucination rate:", hall_rate)


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for

Hallucination rate: 0.04


In [None]:
results = {
    "recall@5": recall5,
    "avg_semantic_similarity": avg_similarity,
    "hallucination_rate": hall_rate
}

print(
    f"Recall@5={recall5:.4f} | "
    f"SemanticSim={avg_similarity:.4f} | "
    f"HallucinationRate={hall_rate:.4f}"
)


Recall@5=0.0000 | SemanticSim=0.3366 | HallucinationRate=0.0400
