In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/dataset/overleaf_guides(151).json


**Intsalling and Importing Libraries**

In [2]:
!pip install --upgrade langchain langchain-community
!pip install langchain langchain-community faiss-cpu sentence-transformers



In [3]:
import os
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema.document import Document

from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import pipeline
import warnings
warnings.filterwarnings("ignore")
import textwrap

2025-04-20 18:16:05.186231: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745172965.209363     149 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745172965.216265     149 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


**Loading Web-Scraped Data**

In [4]:
import json

# Load the scraped data
with open("/kaggle/input/dataset/overleaf_guides(151).json", "r", encoding="utf-8") as f:
    guides = json.load(f)

print(f"✅ Loaded {len(guides)} guides.")


✅ Loaded 151 guides.


**Chunking Documents**

In [5]:
def chunk_text(text, max_words=100, overlap=30):
    """
    Splits text into overlapping chunks based on word count.
    
    Args:
        text (str): The input text to chunk.
        max_words (int): Maximum number of words per chunk.
        overlap (int): Number of overlapping words between chunks.
    
    Returns:
        List[str]: A list of text chunks.
    """
    words = text.split()
    chunks = []
    start = 0

    while start < len(words):
        end = min(start + max_words, len(words))
        chunk = " ".join(words[start:end])
        chunks.append(chunk)
        start += max_words - overlap

    return chunks

**BM25 Retreival**

In [6]:
!pip install rank_bm25



In [7]:
from rank_bm25 import BM25Okapi

def bm25_search(query, document_chunks, top_k=10):
    """
    Performs BM25 search over text chunks.
    
    Args:
        query (str): Search query string.
        document_chunks (list): List of dicts with 'text', 'title', 'url'.
        top_k (int): Number of top results to return.

    Returns:
        List of top_k matched chunks with scores.
    """
    corpus = [chunk["text"] for chunk in document_chunks]
    tokenized_corpus = [doc.split() for doc in corpus]

    bm25 = BM25Okapi(tokenized_corpus)
    tokenized_query = query.split()
    scores = bm25.get_scores(tokenized_query)

    import numpy as np
    top_n = np.argsort(scores)[::-1][:top_k]

    results = []
    for idx in top_n:
        results.append({
            "score": scores[idx],
            "text": document_chunks[idx]["text"],
            "title": document_chunks[idx]["title"],
            "url": document_chunks[idx]["url"]
        })

    return results


**Semantic Search using FAISS and Transformer Embeddings**

In [8]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

def build_dense_index(document_chunks, model_name='all-MiniLM-L6-v2'):
    """
    Builds FAISS index for dense retrieval.
    
    Returns:
        index, embeddings, model (for later querying)
    """
    model = SentenceTransformer(model_name)
    texts = [chunk["text"] for chunk in document_chunks]
    embeddings = model.encode(texts, show_progress_bar=True)
    
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(np.array(embeddings))

    return index, embeddings, model

def dense_search(query, index, model, document_chunks, top_k=5):
    """
    Searches top_k relevant chunks using vector similarity.
    """
    query_embedding = model.encode([query])
    D, I = index.search(np.array(query_embedding), top_k)

    results = []
    for idx in I[0]:
        results.append({
            "text": document_chunks[idx]["text"],
            "title": document_chunks[idx]["title"],
            "url": document_chunks[idx]["url"]
        })

    return results


**Semantic Search using FAISS and Doc2Vec Embeddings**

In [9]:
!pip install gensim



In [10]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument


In [11]:
import gensim 
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

def build_dense_index_doc2vec(document_chunks, vector_size=100, epochs=40):
    """
    Trains a Doc2Vec model on the document chunks and builds a FAISS index.
    
    Returns:
        index, model, document_chunks
    """
    tagged_data = [TaggedDocument(words=chunk["text"].split(), tags=[str(i)]) 
                   for i, chunk in enumerate(document_chunks)]

    model = Doc2Vec(vector_size=vector_size, window=5, min_count=1, workers=4, epochs=epochs)
    model.build_vocab(tagged_data)
    model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)

    embeddings = np.array([model.dv[str(i)] for i in range(len(document_chunks))])

    index = faiss.IndexFlatL2(vector_size)
    index.add(embeddings)

    return index, model

def dense_search_doc2vec(query, index, model, document_chunks, top_k=5):
    """
    Uses a trained Doc2Vec model to infer vector and search FAISS index.
    """
    query_vector = model.infer_vector(query.split()).reshape(1, -1)
    D, I = index.search(query_vector, top_k)

    results = []
    for idx in I[0]:
        results.append({
            "text": document_chunks[idx]["text"],
            "title": document_chunks[idx]["title"],
            "url": document_chunks[idx]["url"]
        })

    return results


**Looading Model and Tokenizer**

In [12]:
import torch
model_id = "HuggingFaceH4/zephyr-7b-beta"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="cuda:0")

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

**Prompting Query combined with Retreived Context to generate final RAG pipeline response**

In [13]:
def generate_answer_huggingface(query, context_chunks):
    """Generates an answer using Hugging Face LLM with tokenization, generation, and decoding."""
    
    # Combine context chunks into one string
    context = "\n\n".join([c["text"] for c in context_chunks])
    
    # Construct the prompt with context and question
    prompt = f"Answer the following based on context:\n\nContext: {context}\n\nQuestion: {query}"

    # Tokenize the prompt with proper truncation and padding
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True, max_length=1024).to(model.device)
    
    # Generate the output using the model
    outputs = model.generate(
        **inputs,
        max_new_tokens=512,
        do_sample=False,  # Use deterministic output
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id  # Set pad token id to eos token
    )
    
    # Decode the generated tokens
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    return response.strip()

In [14]:
from huggingface_hub import login

# Use your actual token string here
login(token="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")


**RAG Pipeline**

In [15]:
def run_rag_pipeline(guides, query, retrieval="bm25", llm="huggingface", top_k=3):
    # Step 1: Chunking
    document_chunks = []
    for guide in guides:
        full_text = ""
        for block in guide["content"]:
            full_text += "\n".join(block["data"]) + "\n" if isinstance(block["data"], list) else block["data"] + "\n"
        chunks = chunk_text(full_text)
        for chunk in chunks:
            document_chunks.append({
                "text": chunk,
                "title": guide["title"],
                "url": guide["url"]
            })

    # Step 2: Retrieval (BM25, FAISS, Doc2Vec, etc.)
    if retrieval == "faiss":
        index, _, model = build_dense_index(document_chunks)
        context_chunks = dense_search(query, index, model, document_chunks, top_k=top_k)
    elif retrieval == "bm25":
        context_chunks = bm25_search(query, document_chunks, top_k=top_k)
    elif retrieval == "dense_simple":
        context_chunks = dense_search_no_faiss(query, document_chunks, top_k=top_k)
    elif retrieval == "doc2vec":
        index, model = build_dense_index_doc2vec(document_chunks)
        context_chunks = dense_search_doc2vec(query, index, model, document_chunks, top_k=top_k)
    else:
        raise ValueError(f"Unsupported retrieval method: {retrieval}")

    # 🔍 Print retrieved chunks before LLM call
    print("\n🔍 Retrieved Context Chunks:")
    for i, chunk in enumerate(context_chunks):
        print(f"\n--- Chunk {i+1} ---")
        print(f"Title: {chunk['title']}")
        print(f"URL: {chunk['url']}")
        print(f"Text:\n{chunk['text'][:500]}...")  # show first 500 chars max for brevity

    # Step 3: LLM Answer Generation
    answer = generate_answer_huggingface(query, context_chunks)

    return answer


**Testing Query**

In [16]:
query = "How to add an image in latex?"

# Run the pipeline with BM25 search
answer = run_rag_pipeline(guides, query, retrieval="bm25", llm="huggingface", top_k=6)

print(f"Answer: {answer}")


🔍 Retrieved Context Chunks:

--- Chunk 1 ---
Title: Fixing and preventing compile timeouts
URL: https://www.overleaf.com/learn/how-to/Why_do_I_keep_getting_the_compile_timeout_error_message%3F
Text:
2 but not in version 3. The use of EPS or SVG images can require extra processing to convert them to PDF format. This extra processing will add to the time needed to compile your project. While thelatexcompiler will support EPS images directly, thepdfLaTeXcompiler does not support EPS images, so an extra step is required to convert these to pdf images when that compiler is used. The processing is handled by theepstopdfpackage which usesGhostscriptto convert the EPS files to PDF. This conversion ...

--- Chunk 2 ---
Title: Fixing and preventing compile timeouts
URL: https://www.overleaf.com/learn/how-to/Debugging_Compilation_timeout_errors
Text:
2 but not in version 3. The use of EPS or SVG images can require extra processing to convert them to PDF format. This extra processing will add to 

**Computing Relevancy Score**

In [17]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import torch

def compute_embeddings(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512, padding=True)
    # Move inputs to the same device as the model
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    
    with torch.no_grad():
        # Get the model outputs
        outputs = model(**inputs, return_dict=True)  # Get logits for generative models
        embeddings = outputs.logits.mean(dim=1)  # Using the mean of the logits for embeddings
    
    return embeddings.cpu().numpy()

# Function to generate artificial questions based on the response using the model
def generate_questions(response, model, tokenizer, num_questions=3):
    prompt = f"Generate {num_questions} questions based on the following response:\n{response}"
    inputs = tokenizer(prompt, return_tensors="pt", padding=True).to(model.device)
    
    # Generate text (questions)
    outputs = model.generate(inputs["input_ids"], max_new_tokens=100, num_return_sequences=num_questions, do_sample=True, temperature=0.7)
    
    # Decode generated questions
    questions = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    return questions

# Function to calculate the relevancy score from the query and response
def calculate_answer_relevancy_from_query_and_response(query, response):
    # Generate artificial questions based on the response using the model
    generated_questions = generate_questions(response, model, tokenizer)
    
    # Compute embeddings for the query and each generated question
    query_embedding = compute_embeddings(query, tokenizer, model)
    question_embeddings = np.array([compute_embeddings(q, tokenizer, model) for q in generated_questions])
    
    # Compute cosine similarity between the query and each generated question
    similarities = [cosine_similarity(query_embedding, question_embedding)[0][0] for question_embedding in question_embeddings]
    
    # Return the average cosine similarity as the relevancy score
    return np.mean(similarities)

# Example usage:


# Calculate the relevancy score
relevancy_score = calculate_answer_relevancy_from_query_and_response(query, answer)

print(f"Answer Relevancy Score: {relevancy_score}")


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Answer Relevancy Score: 0.9440719634897254
