# Imports

In [1]:
!pip install wikipedia
!pip install sentence-transformers
!pip install faiss-cpu


Collecting wikipedia
  Downloading wikipedia-1.4.0.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wikipedia
  Building wheel for wikipedia (setup.py) ... [?25l[?25hdone
  Created wheel for wikipedia: filename=wikipedia-1.4.0-py3-none-any.whl size=11678 sha256=d9065baed7049044da15b44c22bbd7372cf646cdc9a259097663c55c271bfc40
  Stored in directory: /root/.cache/pip/wheels/63/47/7c/a9688349aa74d228ce0a9023229c6c0ac52ca2a40fe87679b8
Successfully built wikipedia
Installing collected packages: wikipedia
Successfully installed wikipedia-1.4.0
Collecting faiss-cpu
  Downloading faiss_cpu-1.13.1-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (7.6 kB)
Downloading faiss_cpu-1.13.1-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (23.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m83.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Suc

In [2]:
import re
from datetime import datetime, timezone

import faiss
import numpy as np
import wikipedia
from sentence_transformers import SentenceTransformer



# Vector db

In [3]:
# Pull the full article text for every seed topic from English Wikipedia.
wikipedia.set_lang("en")

topics = ["Football", "Basketball", "Olympic Games"]

# Maps topic title -> raw article content.
documents = {topic: wikipedia.page(topic).content for topic in topics}


In [4]:
def clean_text(text):
    """Collapse all whitespace in ``text`` to single spaces and trim the ends.

    Newlines are whitespace, so a single ``\\s+`` substitution already covers
    them; a separate newline pass is not needed.

    Args:
        text: Raw string, possibly containing newlines, tabs or repeated spaces.

    Returns:
        The text with every whitespace run replaced by one space and no
        leading or trailing whitespace.
    """
    return re.sub(r"\s+", " ", text).strip()

def chunk_text(text, chunk_size=100, overlap=20):
    """Split ``text`` into overlapping windows of whitespace-separated words.

    Consecutive chunks start ``chunk_size - overlap`` words apart. Chunking
    stops as soon as a window reaches the end of the text, so no trailing chunk
    is produced that is wholly contained in the previous one.

    Args:
        text: The text to split; words are obtained with ``str.split()``.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared by neighbouring chunks. Must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings (words re-joined with single spaces); an empty
        list when ``text`` contains no words.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range. Without
            this check the window step would be zero or negative.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # This window already includes the last word; any further window
        # would only repeat the tail of this one.
        if start + chunk_size >= len(words):
            break
    return chunks

# Parallel lists: metadata[i] is the topic that all_chunks[i] was cut from.
all_chunks = []
metadata = []

for topic, raw_text in documents.items():
    topic_chunks = chunk_text(clean_text(raw_text))
    all_chunks.extend(topic_chunks)
    metadata.extend([topic] * len(topic_chunks))

print("Total chunks:", len(all_chunks))


Total chunks: 446


In [5]:
# Sentence-embedding model; the retrieval cells below reuse it to embed queries.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Embed every chunk, then L2-normalize the rows in place (the call's return
# value is not used, the array itself is modified).
embeddings = model.encode(all_chunks, show_progress_bar=True)
faiss.normalize_L2(embeddings)

# Size the index from the embedding width.
dim = embeddings.shape[1]
index = faiss.IndexFlatIP(dim)  # inner-product index; on the unit-length rows above this is cosine similarity
index.add(np.array(embeddings))

# Sanity check: the count should equal the number of chunks.
print("Vectors stored:", index.ntotal)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/14 [00:00<?, ?it/s]

Vectors stored: 446


# Retrieval

In [6]:
def retrieve_with_scores(query, top_k=10):
    """Return the chunks most similar to ``query`` together with their scores.

    The query is embedded with the module-level ``model``, L2-normalized, and
    searched against the module-level FAISS ``index``.

    Args:
        query: Natural-language query string.
        top_k: Maximum number of neighbours to request from the index.

    Returns:
        A list of dicts in the order returned by the index, each with keys
        ``"score"`` (float), ``"text"`` (chunk string), ``"topic"`` (source
        topic) and ``"chunk_id"`` (int position in ``all_chunks``). The list
        holds fewer than ``top_k`` entries when the index cannot supply that
        many neighbours.
    """
    query_vec = model.encode([query])
    faiss.normalize_L2(query_vec)

    scores, indices = index.search(query_vec, top_k)

    results = []
    for score, idx in zip(scores[0], indices[0]):
        # FAISS pads missing neighbours with index -1. Without this guard,
        # all_chunks[-1] would silently return the last chunk as a bogus hit.
        if idx < 0:
            continue
        chunk_id = int(idx)
        results.append({
            "score": float(score),
            "text": all_chunks[chunk_id],
            "topic": metadata[chunk_id],
            "chunk_id": chunk_id
        })
    return results

# Hits scoring below this similarity are discarded by guarded_retrieval.
MIN_SCORE = 0.45


def guarded_retrieval(query, top_k=10):
    """Retrieve chunks for ``query`` and keep only those scoring at least MIN_SCORE.

    Args:
        query: Natural-language query string.
        top_k: Number of candidates requested from ``retrieve_with_scores``.

    Returns:
        The surviving result dicts in their original order, or ``None`` when
        no candidate reaches the threshold.
    """
    kept = []
    for hit in retrieve_with_scores(query, top_k=top_k):
        if hit["score"] >= MIN_SCORE:
            kept.append(hit)

    if not kept:
        return None
    return kept



In [7]:
def build_context_with_sources(results, max_chunks=3):
    """Break the leading retrieved chunks into sentences, each tagged with its origin.

    Args:
        results: Retrieval result dicts with ``"text"``, ``"topic"``,
            ``"score"`` and ``"chunk_id"`` keys.
        max_chunks: Only the first ``max_chunks`` results are used.

    Returns:
        A ``(context_sentences, sources)`` pair of equal-length lists.
        ``sources[i]`` describes the chunk that ``context_sentences[i]`` came
        from: its topic, the retrieval score rounded to three decimals, the
        chunk id and the full chunk text.
    """
    sentence_boundary = re.compile(r'(?<=[.!?])\s+')
    context_sentences, sources = [], []

    for hit in results[:max_chunks]:
        stripped_pieces = (piece.strip() for piece in sentence_boundary.split(hit["text"]))
        # filter(None, ...) drops fragments that are empty after stripping.
        for sentence in filter(None, stripped_pieces):
            context_sentences.append(sentence)
            sources.append({
                "topic": hit["topic"],
                "retrieval_score": round(hit["score"], 3),
                "chunk_id": hit["chunk_id"],
                "chunk": hit["text"]
            })

    return context_sentences, sources

In [8]:
def generic_sentence_guardrail(sentence):
    """Return True when ``sentence`` contains none of the vague marker substrings.

    Matching is case-insensitive and substring-based, so a marker embedded in a
    longer word (e.g. "some" inside "awesome") also rejects the sentence.
    """
    lowered = sentence.lower()
    for marker in ("usually", "between", "include", "some", "variations"):
        if marker in lowered:
            return False
    return True

def generate_answer_with_citation(query, context_sentences, sources, min_sim=0.65):
    """Pick the context sentence that best matches ``query`` and cite its source.

    Sentences are embedded with the module-level ``model`` and ranked by
    cosine similarity to the query. The highest-ranked sentence that reaches
    ``min_sim`` and passes ``generic_sentence_guardrail`` is returned verbatim.

    Args:
        query: Natural-language query string.
        context_sentences: Candidate answer sentences.
        sources: Source dicts aligned index-for-index with ``context_sentences``.
        min_sim: Minimum query/sentence cosine similarity for an answer.

    Returns:
        ``(None, None)`` when ``context_sentences`` is empty. Otherwise
        ``(sentence, citation)``, where ``citation`` is a copy of the matching
        source dict with an added ``"sentence_similarity"`` key; or
        ``(fallback_message, None)`` when no sentence qualifies.
    """
    if not context_sentences:
        return None, None

    query_emb = model.encode([query])
    sent_embs = model.encode(context_sentences)

    faiss.normalize_L2(query_emb)
    faiss.normalize_L2(sent_embs)

    # reshape(-1) always yields a 1-D array. squeeze() collapses the
    # single-sentence case to a 0-d array, which enumerate() cannot iterate.
    sims = np.dot(sent_embs, query_emb.T).reshape(-1)
    ranked = sorted(enumerate(sims), key=lambda pair: pair[1], reverse=True)

    for idx, sim in ranked:
        sent = context_sentences[idx]
        if sim >= min_sim and generic_sentence_guardrail(sent):
            # Copy so the caller's source dict is not mutated.
            citation = dict(sources[idx])
            citation["sentence_similarity"] = round(float(sim), 3)
            return sent, citation

    return "I cannot answer this question using the provided context.", None

In [9]:
def compute_confidence(results):
    """Return the mean retrieval score of ``results``, rounded to three decimals.

    Args:
        results: Iterable-sized collection of dicts that each carry a numeric
            ``"score"`` entry.

    Returns:
        The rounded mean score, or ``0.0`` when ``results`` is empty (avoids a
        division by zero).
    """
    if not results:
        return 0.0
    return round(sum(r["score"] for r in results) / len(results), 3)


In [10]:
def rag_pipeline(query, top_k=10, max_chunks=3):
    """Run retrieval, sentence selection and citation for a single query.

    Args:
        query: Natural-language query string.
        top_k: Number of candidates requested from retrieval.
        max_chunks: Number of top chunks split into candidate sentences.

    Returns:
        A dict with keys ``"answer"``, ``"confidence"``, ``"citations"`` and
        ``"status"``. ``status`` is ``"Low retrieval confidence"`` when nothing
        passes the retrieval guard, ``"No grounded answer found"`` when no
        sentence yields a citation, and ``"OK"`` otherwise.
    """
    def _response(answer, confidence, citations, status):
        # All three outcomes share the same response shape.
        return {
            "answer": answer,
            "confidence": confidence,
            "citations": citations,
            "status": status
        }

    hits = guarded_retrieval(query, top_k=top_k)
    if hits is None:
        return _response(None, 0.0, None, "Low retrieval confidence")

    sentences, sentence_sources = build_context_with_sources(hits, max_chunks)
    answer, citation = generate_answer_with_citation(query, sentences, sentence_sources)
    if citation is None:
        return _response(answer, 0.0, None, "No grounded answer found")

    return _response(answer, compute_confidence(hits), [citation], "OK")

In [11]:
# Smoke test: run one fixed question through the pipeline and show the result.
query = "How many players are there in a football team?"
out = rag_pipeline(query)

print("Answer:", out["answer"])
print("Confidence:", out["confidence"])
print("Status:", out["status"])

print("\nCitations:")
citations = out["citations"]
if not citations:
    print("No citations available.")
else:
    for c in citations:
        print("- Source topic:", c["topic"])
        print("  Retrieval score:", c["retrieval_score"])
        print("  Sentence similarity:", c["sentence_similarity"])
        print("  Chunk id:", c["chunk_id"])
        print("  Evidence chunk:", c["chunk"])

Answer: I cannot answer this question using the provided context.
Confidence: 0.0
Status: No grounded answer found

Citations:
No citations available.


# **LLM**
Extractive / Evidence-based Generation

Next, we add self-learning layers on top of this pipeline.

In [12]:
# Memory: in-process tables backing the self-learning layer.

import pandas as pd

# One row per answered query, as written by log_interaction.
interaction_log = pd.DataFrame(
    columns=[
        "original_query",
        "reformulated_query",
        "retrieved_docs",
        "answer",
        "confidence",
        "feedback",  # 1 = 👍, 0 = 👎, None
        "timestamp",
    ]
)

# Query rewrites remembered by update_query_memory and read by reformulate_query.
query_memory = pd.DataFrame(
    columns=["original_query", "reformulated_query", "confidence", "feedback"]
)

# Answers queued by maybe_expand_knowledge.
kb_expansion_buffer = pd.DataFrame(
    columns=["content", "source_query", "confidence", "timestamp"]
)

In [13]:
# functions of self learning module

def estimate_confidence(answer: str, retrieved_docs: list) -> float:
    """Return the share of retrieved documents that lexically overlap the answer.

    A document counts as overlapping when any of its first 50
    whitespace-separated tokens (lower-cased) occurs as a substring of the
    lower-cased answer.

    Args:
        answer: Generated answer text.
        retrieved_docs: Document strings the answer was drawn from.

    Returns:
        The overlapping fraction, capped at 1.0 and rounded to three decimals;
        ``0.0`` when either argument is empty.
    """
    if not (answer and retrieved_docs):
        return 0.0

    answer_lower = answer.lower()
    supporting = 0
    for doc in retrieved_docs:
        leading_tokens = doc.lower().split()[:50]
        if any(token in answer_lower for token in leading_tokens):
            supporting += 1

    ratio = supporting / max(1, len(retrieved_docs))
    return round(min(1.0, ratio), 3)


def reformulate_query(query: str, confidence_threshold=0.7):
    """Return a remembered rewrite of ``query``, or ``query`` itself if none exists.

    A ``query_memory`` row qualifies when its ``original_query`` equals
    ``query``, its ``confidence`` is at least ``confidence_threshold`` and its
    ``feedback`` is 1. The most recently stored qualifying row wins.

    Args:
        query: The user's query text.
        confidence_threshold: Minimum stored confidence for a row to be reused.

    Returns:
        The stored ``reformulated_query`` of the last qualifying row, else
        ``query`` unchanged.
    """
    same_query = query_memory["original_query"] == query
    confident = query_memory["confidence"] >= confidence_threshold
    liked = query_memory["feedback"] == 1

    candidates = query_memory[same_query & confident & liked]
    if candidates.empty:
        return query
    return candidates.iloc[-1]["reformulated_query"]


def collect_feedback(user_input):
    """Translate a typed feedback answer into a numeric label.

    Matching ignores letter case and surrounding whitespace, so inputs such as
    ``"Yes "`` or ``" n"`` are recognised.

    Args:
        user_input: The raw string typed by the user, or ``None``.

    Returns:
        ``1`` for "yes" / "y" / "👍", ``0`` for "no" / "n" / "👎", and ``None``
        for ``None``, an empty answer, or anything unrecognised.
    """
    if user_input is None:
        return None

    normalized = user_input.strip().lower()
    if normalized in ("yes", "y", "👍"):
        return 1
    if normalized in ("no", "n", "👎"):
        return 0
    return None

from datetime import datetime

def log_interaction(
    original_query,
    reformulated_query,
    retrieved_docs,
    answer,
    confidence,
    feedback
):
    """Append one query/answer record to the module-level ``interaction_log``.

    Args:
        original_query: The query as typed by the user.
        reformulated_query: The query actually sent through the pipeline.
        retrieved_docs: Evidence chunk texts backing the answer.
        answer: The answer shown to the user (may be ``None``).
        confidence: Confidence value reported with the answer.
        feedback: ``1``, ``0`` or ``None``, as produced by ``collect_feedback``.

    Returns:
        None. The row is added to ``interaction_log`` in place, stamped with
        the current timezone-aware UTC time.
    """
    # .loc mutates the existing frame, so no ``global`` rebinding is needed.
    interaction_log.loc[len(interaction_log)] = {
        "original_query": original_query,
        "reformulated_query": reformulated_query,
        "retrieved_docs": retrieved_docs,
        "answer": answer,
        "confidence": confidence,
        "feedback": feedback,
        # datetime.utcnow() is deprecated and returns a naive value.
        "timestamp": datetime.now(timezone.utc)
    }


def update_query_memory(original_query, reformulated_query, confidence, feedback,
                        confidence_threshold=0.7):
    """Remember a query rewrite that was both confident and liked by the user.

    Args:
        original_query: The query as typed by the user.
        reformulated_query: The query actually sent through the pipeline.
        confidence: Confidence value reported with the answer.
        feedback: ``1``, ``0`` or ``None``; only ``1`` qualifies.
        confidence_threshold: Minimum confidence required to store the row.
            Defaults to 0.7, the same default ``reformulate_query`` uses when
            reading the memory back.

    Returns:
        None. A qualifying row is appended to the module-level
        ``query_memory`` in place; otherwise nothing happens.
    """
    if confidence < confidence_threshold or feedback != 1:
        return

    query_memory.loc[len(query_memory)] = {
        "original_query": original_query,
        "reformulated_query": reformulated_query,
        "confidence": confidence,
        "feedback": feedback
    }


def maybe_expand_knowledge(answer, query, confidence, threshold=0.8):
    """Queue a high-confidence answer as a candidate knowledge-base entry.

    Args:
        answer: Answer text to store as the entry's content.
        query: The query that produced the answer.
        confidence: Confidence value reported with the answer.
        threshold: Minimum confidence required to queue the answer.

    Returns:
        None. When ``confidence >= threshold`` a row is appended in place to
        the module-level ``kb_expansion_buffer``, stamped with the current
        timezone-aware UTC time.
    """
    if confidence < threshold:
        return

    kb_expansion_buffer.loc[len(kb_expansion_buffer)] = {
        "content": answer,
        "source_query": query,
        "confidence": confidence,
        # datetime.utcnow() is deprecated and returns a naive value.
        "timestamp": datetime.now(timezone.utc)
    }

In [14]:
# One interactive round: ask, answer, collect feedback, then update the memories.
original_query = input("Enter Query: ")

# Reuse a remembered rewrite of this query when one exists.
reformulated_query = reformulate_query(original_query)

out = rag_pipeline(reformulated_query)
answer, confidence = out["answer"], out["confidence"]

# Evidence chunks behind the answer (empty when there are no citations).
retrieved_docs = [c["chunk"] for c in (out.get("citations") or [])]

print("Answer:", answer)
print("Confidence:", confidence)
print("Status:", out["status"])
print("\nCitations:")
if not out["citations"]:
    print("No citations available.")
else:
    for c in out["citations"]:
        print("- Source topic:", c["topic"])
        print("  Retrieval score:", c["retrieval_score"])
        print("  Sentence similarity:", c["sentence_similarity"])
        print("  Chunk id:", c["chunk_id"])
        print("  Evidence chunk:", c["chunk"])

# Feedback from the user (yes / no / skipped).
feedback = collect_feedback(input("\nIs this answer helpful? (yes/no or Enter): "))

# Record the full interaction.
log_interaction(
    original_query,
    reformulated_query,
    retrieved_docs,
    answer,
    confidence,
    feedback,
)

# Remember the rewrite when the answer was confident and liked.
update_query_memory(original_query, reformulated_query, confidence, feedback)

# Queue the answer for knowledge-base expansion when confidence is high enough.
maybe_expand_knowledge(answer, original_query, confidence)

Enter Query: how many players in a football match?
Answer: I cannot answer this question using the provided context.
Confidence: 0.0
Status: No grounded answer found

Citations:
No citations available.

Is this answer helpful? (yes/no or Enter): no


  "timestamp": datetime.utcnow()
