In [1]:
# Install dependencies
!pip install -q faiss-cpu rank-bm25 sentence-transformers pandas numpy


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/23.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/23.8 MB[0m [31m231.2 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━[0m [32m16.3/23.8 MB[0m [31m244.7 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m23.8/23.8 MB[0m [31m244.5 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m23.8/23.8 MB[0m [31m244.5 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m23.8/23.8 MB[0m [31m244.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.8/23.8 MB[0m [31m83.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# Imports & Logging
import pandas as pd
import numpy as np
import logging
import faiss
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer

In [3]:
import pandas as pd
import logging

# Logging Setup
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s"
)

logger = logging.getLogger("RAG")

#Load Kaggle Dataset
DATA_PATH = "/content/arxiv_ai.csv"

try:
    df = pd.read_csv(DATA_PATH, engine='python', on_bad_lines='skip')
    logger.info(f"Dataset loaded with {len(df)} rows")
except Exception as e:
    logger.error("Failed to load dataset", exc_info=True)

In [4]:
# Data Preprocessing
def preprocess_dataframe(df):
    try:
        df = df.fillna("")
        df["document"] = (
            df["title"].str.lower() + ". " +
            df["summary"].str.lower() + ". " +
            "categories: " + df["categories"].str.lower()
        )
        return df["document"].tolist()
    except Exception as e:
        logger.error("Preprocessing failed", exc_info=True)
        return []


In [5]:
documents = preprocess_dataframe(df)
logger.info(f"Prepared {len(documents)} documents")

In [6]:
# BM25 Index
tokenized_docs = [doc.split() for doc in documents]
bm25 = BM25Okapi(tokenized_docs)


In [7]:
def bm25_search(query, top_k=5):
    tokens = query.lower().split()
    scores = bm25.get_scores(tokens)
    top_idx = np.argsort(scores)[::-1][:top_k]
    return [(documents[i], scores[i]) for i in top_idx]


In [8]:
# Embeddings + FAISS
embedder = SentenceTransformer("all-MiniLM-L6-v2")


Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [9]:
doc_embeddings = embedder.encode(
    documents,
    show_progress_bar=True,
    batch_size=64
)


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

In [10]:
dimension = doc_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(doc_embeddings)

logger.info("FAISS index built successfully")


In [11]:
def vector_search(query, top_k=5):
    query_emb = embedder.encode([query])
    distances, indices = index.search(query_emb, top_k)
    return [(documents[i], distances[0][pos]) for pos, i in enumerate(indices[0])]


In [12]:
# Hybrid Search
def hybrid_search(query, top_k=5, alpha=0.6):
    """
    alpha -> weight for BM25
    (1 - alpha) -> weight for vector similarity
    """
    bm25_results = bm25_search(query, top_k)
    vector_results = vector_search(query, top_k)

    combined_scores = {}

    for doc, score in bm25_results:
        combined_scores[doc] = combined_scores.get(doc, 0) + alpha * score

    for doc, dist in vector_results:
        sim_score = 1 / (1 + dist)   # convert distance → similarity
        combined_scores[doc] = combined_scores.get(doc, 0) + (1 - alpha) * sim_score

    ranked = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)
    return ranked[:top_k]


In [13]:
# Generator
def generate_answer(query, retrieved_docs):
    if not retrieved_docs:
        return "No relevant research papers found."

    context = "\n\n".join([doc[:500] for doc, _ in retrieved_docs])

    answer = f"""
Question:
{query}

Retrieved Research Context:
{context}

Answer:
Based on the retrieved arXiv AI research papers, the topic mainly discusses the above themes and findings.
"""
    return answer.strip()


In [14]:
# Full RAG Pipeline
def rag_pipeline(query):
    try:
        logger.info(f"Processing query: {query}")
        retrieved_docs = hybrid_search(query)
        answer = generate_answer(query, retrieved_docs)
        logger.info("RAG pipeline completed successfully")
        return answer
    except Exception as e:
        logger.error("RAG pipeline failed", exc_info=True)
        return "An error occurred while processing your query."


In [15]:
# Test Queries
test_queries = [
    "Recent advances in transformer models",
    "Reinforcement learning for robotics",
    "Explain self supervised learning methods",
    "Challenges in large language model evaluation",
    "Ethical issues in artificial intelligence research"
]

for q in test_queries:
    print("="*100)
    print(rag_pipeline(q))


Question:
Recent advances in transformer models

Retrieved Research Context:
leveraging skill-to-skill supervision for knowledge tracing. knowledge tracing plays a pivotal role in intelligent tutoring systems. this
task aims to predict the probability of students answering correctly to
specific questions. to do so, knowledge tracing systems should trace the
knowledge state of the students by utilizing their problem-solving history and
knowledge about the problems. recent advances in knowledge tracing models have
enabled better exploitation of problem solving history. how

stress test evaluation of transformer-based models in natural language understanding tasks. there has been significant progress in recent years in the field of natural
language processing thanks to the introduction of the transformer architecture.
current state-of-the-art models, via a large number of parameters and
pre-training on massive text corpus, have shown impressive results on several
downstream tasks. many re

## Add HuggingFace LLM

In [16]:
from transformers import pipeline

llm = pipeline(
    "text2text-generation",
    model="google/flan-t5-base",
    max_length=256
)

def generate_llm_answer(query, retrieved_docs):
    context = " ".join([doc[:300] for doc, _ in retrieved_docs])
    prompt = f"""
    Answer the question using the context.

    Context:
    {context}

    Question:
    {query}
    """
    return llm(prompt)[0]["generated_text"]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0


In [17]:
# Updating RAG pipeline to use LLM
def rag_pipeline(query):
    retrieved_docs = hybrid_search(query)
    answer = generate_llm_answer(query, retrieved_docs)
    return answer

In [18]:
print(rag_pipeline("What are transformer models?"))


transformer encoder-based architecture with syntactical knowledge encoded for intent detection and slot filling


In [19]:
#Evaluation Cell
def precision_at_k(retrieved, relevant, k):
    retrieved_k = retrieved[:k]
    return len(set(retrieved_k) & set(relevant)) / k

def recall_at_k(retrieved, relevant, k):
    retrieved_k = retrieved[:k]
    return len(set(retrieved_k) & set(relevant)) / len(relevant)

In [20]:
query = "transformer models"

relevant_docs = [
    doc for doc in documents if "transformer" in doc
][:10]

retrieved_docs = [doc for doc, _ in hybrid_search(query, top_k=10)]

print("Precision@5:", precision_at_k(retrieved_docs, relevant_docs, 5))
print("Recall@5:", recall_at_k(retrieved_docs, relevant_docs, 5))

Precision@5: 0.6
Recall@5: 0.3
