In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
pip install requests beautifulsoup4


Note: you may need to restart the kernel to use updated packages.


In [3]:
import requests
from bs4 import BeautifulSoup

base_url = "https://www.overleaf.com"
learn_url = base_url + "/learn"

# Request main Learn page
headers = {"User-Agent": "Mozilla/5.0"}
response = requests.get(learn_url, headers=headers)
soup = BeautifulSoup(response.text, "html.parser")

# Find guide links
links = []
for a in soup.select("a[href^='/learn/']"):  # All internal /learn/ links
    full_link = base_url + a["href"]
    links.append(full_link)

print(f"✅ Found {len(links)} Overleaf guides!")
print(links[:5])  # Print a few sample links


✅ Found 151 Overleaf guides!
['https://www.overleaf.com/learn/latex/Learn_LaTeX_in_30_minutes', 'https://www.overleaf.com/learn/latex/Creating_a_document_in_LaTeX', 'https://www.overleaf.com/learn/latex/Paragraphs_and_new_lines', 'https://www.overleaf.com/learn/latex/Bold%2C_italics_and_underlining', 'https://www.overleaf.com/learn/latex/Lists']


In [4]:
import json
from tqdm import tqdm

def scrape_guide(url):
    """Scrape headings, paragraphs, lists, and code blocks from a guide page."""
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")

    content_div = soup.find("div", {"class": "mw-parser-output"})
    if not content_div:
        return None

    guide_data = {"title": soup.find("h1").get_text(strip=True), "url": url, "sections": []}
    current_section = None

    for element in content_div.find_all(["h2", "h3", "p", "ul", "ol", "pre"]):
        if element.name in ["h2", "h3"]:  # New section
            current_section = {"heading": element.get_text(strip=True), "content": []}
            guide_data["sections"].append(current_section)
        elif element.name == "p":
            if current_section:
                current_section["content"].append({"type": "text", "data": element.get_text(strip=True)})
        elif element.name in ["ul", "ol"]:
            if current_section:
                items = [li.get_text(strip=True) for li in element.find_all("li")]
                current_section["content"].append({"type": "list", "data": items})
        elif element.name == "pre":  # Code block
            if current_section:
                current_section["content"].append({"type": "code", "data": element.get_text(strip=True)})

    return guide_data

# Scrape all guides
all_guides = []
for link in tqdm(links, desc="Scraping Overleaf Guides"):
    guide_data = scrape_guide(link)
    if guide_data:
        all_guides.append(guide_data)

# Save JSON
with open("overleaf_guides.json", "w", encoding="utf-8") as f:
    json.dump(all_guides, f, ensure_ascii=False, indent=4)

print(f"✅ Scraped {len(all_guides)} guides and saved to overleaf_guides.json")


Scraping Overleaf Guides: 100%|██████████| 151/151 [00:29<00:00,  5.10it/s]

✅ Scraped 53 guides and saved to overleaf_guides.json





In [5]:
import pandas as pd
df=pd.read_json('/kaggle/working/overleaf_guides.json')

In [6]:
pip install faiss-cpu openai tiktoken requests beautifulsoup4 tqdm


Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp310-cp310-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m47.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0
Note: you may need to restart the kernel to use updated packages.


In [7]:
pip install -U langchain-community

Collecting langchain-community
  Downloading langchain_community-0.3.21-py3-none-any.whl.metadata (2.4 kB)
Collecting langchain-core<1.0.0,>=0.3.51 (from langchain-community)
  Downloading langchain_core-0.3.51-py3-none-any.whl.metadata (5.9 kB)
Collecting langchain<1.0.0,>=0.3.23 (from langchain-community)
  Downloading langchain-0.3.23-py3-none-any.whl.metadata (7.8 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.8.1-py3-none-any.whl.metadata (3.5 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting langchain-text-splitters<1.0.0,>=0.3.8 (from langchain<1.0.0,>=0.3.23->langchain-community)
  Downloading langchain_text_splitters-0.3.8-py3-none-any.whl.metadata (1.9 kB)
Collecting async-timeout<6.0,>=4.0 (from aiohttp<4.0.0,>=3.8.3->langchain-community)
  Downloading async_timeout-4.0.3-py3-none-any.whl.metadata (4.2 kB)
Collecting python-

In [8]:
import json

# Load the scraped data
with open("overleaf_guides.json", "r", encoding="utf-8") as f:
    guides = json.load(f)

print(f"✅ Loaded {len(guides)} guides.")


✅ Loaded 53 guides.


Chuking(with overlapping)

In [9]:
import tiktoken

def chunk_text(text, max_tokens=512, overlap=50):
    """Split text into overlapping chunks based on token length."""
    tokenizer = tiktoken.get_encoding("cl100k_base")
    tokens = tokenizer.encode(text)

    chunks = []
    start = 0
    while start < len(tokens):
        end = start + max_tokens
        chunk_tokens = tokens[start:end]
        chunk_text = tokenizer.decode(chunk_tokens)
        chunks.append(chunk_text)
        
        # Move start pointer forward by (max_tokens - overlap)
        start += max_tokens - overlap
    
    return chunks


document_chunks = []
for guide in guides:
    for section in guide["sections"]:
        # Ensure data is always a string
        full_text = section["heading"] + "\n" + "\n".join(
            [ " ".join(c["data"]) if isinstance(c["data"], list) else c["data"] for c in section["content"] ]
        )
        chunks = chunk_text(full_text)

        for chunk in chunks:
            document_chunks.append({
                "text": chunk,
                "title": guide["title"],
                "url": guide["url"]
            })

print(f"✅ Created {len(document_chunks)} text chunks for FAISS.")


✅ Created 635 text chunks for FAISS.


embedding

In [10]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")  # Lightweight & fast

texts = [doc["text"] for doc in document_chunks]
embeddings = model.encode(texts, show_progress_bar=True)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/20 [00:00<?, ?it/s]

In [16]:
embeddings = np.array(embeddings).astype("float32")

Vector DB

In [18]:
import tiktoken
from sentence_transformers import SentenceTransformer
import faiss

In [19]:
# Step 3: Create FAISS Index
dim = embeddings.shape[1]
index = faiss.IndexFlatL2(dim)
index.add(embeddings)
print("✅ FAISS index created and populated.")


✅ FAISS index created and populated.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


🔍 Top matching chunks:

📘 Using the Overleaf project menu - https://www.overleaf.com/learn/how-to/Using_the_Overleaf_project_menu
Help

------------------------------------------------------------
📘 Using the Overleaf project menu - https://www.overleaf.com/learn/how-to/Using_the_Overleaf_project_menu
Help

------------------------------------------------------------
📘 Creating a project from a template - https://www.overleaf.com/learn/how-to/Creating_a_project_from_a_template
Introduction
To start using Overleaf go towww.overleaf.com.
If you don't have an account enter your e-mail address and set a password, clickRegisterand that's it, you will be redirected to the project management page where you will be guided into how to create a new project.

If you already have an account, clickLoginin the upper right corner, then type in your email and password and click theLoginbutton.
Once you are logged in, you should see the Overleaf Project Management page.
-------------------------------

In [22]:

# Step 4: Search
query = "How to insert an image in latex?"
query_embedding = model.encode([query]).astype("float32")
D, I = index.search(query_embedding, k=5)

# Step 5: Display results
print("\n🔍 Top matching chunks:\n")
for i in I[0]:
    doc = document_chunks[i]
    print(f"📘 {doc['title']} - {doc['url']}\n{doc['text'][:500]}\n{'-'*60}")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


🔍 Top matching chunks:

📘 Creating a document in Overleaf - https://www.overleaf.com/learn/how-to/Creating_a_document_in_Overleaf
Further reading
Creating a document in LaTeX Inserting Images Bibliography management in LaTeX
------------------------------------------------------------
📘 Creating a document in Overleaf - https://www.overleaf.com/learn/how-to/Creating_a_document_in_Overleaf
Further reading
Creating a document in LaTeX Inserting Images Bibliography management in LaTeX
------------------------------------------------------------
📘 Including images on Overleaf - https://www.overleaf.com/learn/how-to/Including_images_on_Overleaf
Simple image upload
To upload an image, in the editor go to the upper left corner and click the upload icon
a dialogue box will pop up for you to upload your files
there you can either drag and drop your files or clickSelect files(s)to open a file browser
navigate to the right folder and select the images to upload. You can upload several files at o