In [1]:
# Install dependencies
!pip install -q faiss-cpu rank-bm25 sentence-transformers pandas numpy


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.8/23.8 MB[0m [31m65.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# Imports & Logging
import pandas as pd
import numpy as np
import logging
import faiss
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer

In [3]:
import pandas as pd
import logging

# Logging Setup
from src.logging_config import setup_logging
setup_logging()
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s"
)

logger = logging.getLogger("RAG")

#Load Kaggle Dataset
DATA_PATH = "/content/arxiv_ai.csv"

try:
    df = pd.read_csv(DATA_PATH, engine='python', on_bad_lines='skip')
    logger.info(f"Dataset loaded with {len(df)} rows")
except Exception as e:
    logger.error("Failed to load dataset", exc_info=True)

ModuleNotFoundError: No module named 'src'

In [None]:
# Data Preprocessing
def preprocess_dataframe(df):
    try:
        df = df.fillna("")
        df["document"] = (
            df["title"].str.lower() + ". " +
            df["summary"].str.lower() + ". " +
            "categories: " + df["categories"].str.lower()
        )
        return df["document"].tolist()
    except Exception as e:
        logger.error("Preprocessing failed", exc_info=True)
        return []


In [None]:
documents = preprocess_dataframe(df)
logger.info(f"Prepared {len(documents)} documents")

In [None]:
# BM25 Index
tokenized_docs = [doc.split() for doc in documents]
bm25 = BM25Okapi(tokenized_docs)


In [None]:
def bm25_search(query, top_k=5):
    tokens = query.lower().split()
    scores = bm25.get_scores(tokens)
    top_idx = np.argsort(scores)[::-1][:top_k]
    return [(documents[i], scores[i]) for i in top_idx]


In [None]:
# Embeddings + FAISS
embedder = SentenceTransformer("all-MiniLM-L6-v2")


In [None]:
doc_embeddings = embedder.encode(
    documents,
    show_progress_bar=True,
    batch_size=64
)


In [None]:
dimension = doc_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(doc_embeddings)

logger.info("FAISS index built successfully")


In [None]:
def vector_search(query, top_k=5):
    query_emb = embedder.encode([query])
    distances, indices = index.search(query_emb, top_k)
    return [(documents[i], distances[0][pos]) for pos, i in enumerate(indices[0])]


In [None]:
# Hybrid Search
def hybrid_search(query, top_k=5, alpha=0.6):
    """
    alpha -> weight for BM25
    (1 - alpha) -> weight for vector similarity
    """
    bm25_results = bm25_search(query, top_k)
    vector_results = vector_search(query, top_k)

    combined_scores = {}

    for doc, score in bm25_results:
        combined_scores[doc] = combined_scores.get(doc, 0) + alpha * score

    for doc, dist in vector_results:
        sim_score = 1 / (1 + dist)   # convert distance → similarity
        combined_scores[doc] = combined_scores.get(doc, 0) + (1 - alpha) * sim_score

    ranked = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)
    return ranked[:top_k]


In [None]:
# Generator
def generate_answer(query, retrieved_docs):
    if not retrieved_docs:
        return "No relevant research papers found."

    context = "\n\n".join([doc[:500] for doc, _ in retrieved_docs])

    answer = f"""
Question:
{query}

Retrieved Research Context:
{context}

Answer:
Based on the retrieved arXiv AI research papers, the topic mainly discusses the above themes and findings.
"""
    return answer.strip()


In [None]:
# Full RAG Pipeline
def rag_pipeline(query):
    try:
        logger.info(f"Processing query: {query}")
        retrieved_docs = hybrid_search(query)
        answer = generate_answer(query, retrieved_docs)
        logger.info("RAG pipeline completed successfully")
        return answer
    except Exception as e:
        logger.error("RAG pipeline failed", exc_info=True)
        return "An error occurred while processing your query."


In [None]:
# Test Queries
test_queries = [
    "Recent advances in transformer models",
    "Reinforcement learning for robotics",
    "Explain self supervised learning methods",
    "Challenges in large language model evaluation",
    "Ethical issues in artificial intelligence research"
]

for q in test_queries:
    print("="*100)
    print(rag_pipeline(q))


## Add HuggingFace LLM

In [None]:
from transformers import pipeline

llm = pipeline(
    "text2text-generation",
    model="google/flan-t5-base",
    max_length=256
)

def generate_llm_answer(query, retrieved_docs):
    context = " ".join([doc[:300] for doc, _ in retrieved_docs])
    prompt = f"""
    Answer the question using the context.

    Context:
    {context}

    Question:
    {query}
    """
    return llm(prompt)[0]["generated_text"]

In [None]:
# Updating RAG pipeline to use LLM
def rag_pipeline(query):
    retrieved_docs = hybrid_search(query)
    answer = generate_llm_answer(query, retrieved_docs)
    return answer

In [None]:
print(rag_pipeline("What are transformer models?"))


In [None]:
#Evaluation Cell
def precision_at_k(retrieved, relevant, k):
    retrieved_k = retrieved[:k]
    return len(set(retrieved_k) & set(relevant)) / k

def recall_at_k(retrieved, relevant, k):
    retrieved_k = retrieved[:k]
    return len(set(retrieved_k) & set(relevant)) / len(relevant)

In [None]:
query = "transformer models"

relevant_docs = [
    doc for doc in documents if "transformer" in doc
][:10]

retrieved_docs = [doc for doc, _ in hybrid_search(query, top_k=10)]

print("Precision@5:", precision_at_k(retrieved_docs, relevant_docs, 5))
print("Recall@5:", recall_at_k(retrieved_docs, relevant_docs, 5))

Then in notebook and main.py:

from src.logging_config import setup_logging
setup_logging()