In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [5]:
pip install requests beautifulsoup4


Note: you may need to restart the kernel to use updated packages.


In [6]:
import requests
from bs4 import BeautifulSoup

base_url = "https://www.overleaf.com"
learn_url = base_url + "/learn"

# Request main Learn page
headers = {"User-Agent": "Mozilla/5.0"}
response = requests.get(learn_url, headers=headers)
soup = BeautifulSoup(response.text, "html.parser")

# Find guide links
links = []
for a in soup.select("a[href^='/learn/']"):  # All internal /learn/ links
    full_link = base_url + a["href"]
    links.append(full_link)

print(f"✅ Found {len(links)} Overleaf guides!")
print(links[:5])  # Print a few sample links


✅ Found 151 Overleaf guides!
['https://www.overleaf.com/learn/latex/Learn_LaTeX_in_30_minutes', 'https://www.overleaf.com/learn/latex/Creating_a_document_in_LaTeX', 'https://www.overleaf.com/learn/latex/Paragraphs_and_new_lines', 'https://www.overleaf.com/learn/latex/Bold%2C_italics_and_underlining', 'https://www.overleaf.com/learn/latex/Lists']


In [7]:
import json
from tqdm import tqdm

def scrape_guide(url):
    """Scrape headings, paragraphs, lists, and code blocks from a guide page."""
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")

    content_div = soup.find("div", {"class": "mw-parser-output"})
    if not content_div:
        return None

    guide_data = {"title": soup.find("h1").get_text(strip=True), "url": url, "sections": []}
    current_section = None

    for element in content_div.find_all(["h2", "h3", "p", "ul", "ol", "pre"]):
        if element.name in ["h2", "h3"]:  # New section
            current_section = {"heading": element.get_text(strip=True), "content": []}
            guide_data["sections"].append(current_section)
        elif element.name == "p":
            if current_section:
                current_section["content"].append({"type": "text", "data": element.get_text(strip=True)})
        elif element.name in ["ul", "ol"]:
            if current_section:
                items = [li.get_text(strip=True) for li in element.find_all("li")]
                current_section["content"].append({"type": "list", "data": items})
        elif element.name == "pre":  # Code block
            if current_section:
                current_section["content"].append({"type": "code", "data": element.get_text(strip=True)})

    return guide_data

# Scrape all guides
all_guides = []
for link in tqdm(links, desc="Scraping Overleaf Guides"):
    guide_data = scrape_guide(link)
    if guide_data:
        all_guides.append(guide_data)

# Save JSON
with open("overleaf_guides.json", "w", encoding="utf-8") as f:
    json.dump(all_guides, f, ensure_ascii=False, indent=4)

print(f"✅ Scraped {len(all_guides)} guides and saved to overleaf_guides.json")


Scraping Overleaf Guides: 100%|██████████| 151/151 [00:53<00:00,  2.83it/s]

✅ Scraped 108 guides and saved to overleaf_guides.json





In [9]:
import pandas as pd
df=pd.read_json('/kaggle/working/overleaf_guides.json')

In [10]:
df.head()

Unnamed: 0,title,url,sections
0,Learn LaTeX in 30 minutes,https://www.overleaf.com/learn/latex/Learn_LaT...,"[{'heading': 'Contents', 'content': [{'type': ..."
1,Learn LaTeX in 30 minutes,https://www.overleaf.com/learn/latex/Creating_...,"[{'heading': 'Contents', 'content': [{'type': ..."
2,Paragraphs and new lines,https://www.overleaf.com/learn/latex/Paragraph...,"[{'heading': 'Contents', 'content': [{'type': ..."
3,"Bold, italics and underlining",https://www.overleaf.com/learn/latex/Bold%2C_i...,"[{'heading': 'Contents', 'content': [{'type': ..."
4,Lists,https://www.overleaf.com/learn/latex/Lists,"[{'heading': 'Contents', 'content': [{'type': ..."


In [11]:
pip install faiss-cpu openai tiktoken requests beautifulsoup4 tqdm


Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp310-cp310-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m48.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0
Note: you may need to restart the kernel to use updated packages.


In [18]:
pip install -U langchain-community

Collecting langchain-community
  Downloading langchain_community-0.3.20-py3-none-any.whl.metadata (2.4 kB)
Collecting langchain-core<1.0.0,>=0.3.45 (from langchain-community)
  Downloading langchain_core-0.3.50-py3-none-any.whl.metadata (5.9 kB)
Collecting langchain<1.0.0,>=0.3.21 (from langchain-community)
  Downloading langchain-0.3.22-py3-none-any.whl.metadata (7.8 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.8.1-py3-none-any.whl.metadata (3.5 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting langchain-text-splitters<1.0.0,>=0.3.7 (from langchain<1.0.0,>=0.3.21->langchain-community)
  Downloading langchain_text_splitters-0.3.7-py3-none-any.whl.metadata (1.9 kB)
Collecting async-timeout<6.0,>=4.0 (from aiohttp<4.0.0,>=3.8.3->langchain-community)
  Downloading async_timeout-4.0.3-py3-none-any.whl.metadata (4.2 kB)
Collecting python-

In [22]:
import json

# Load the scraped data
with open("overleaf_guides.json", "r", encoding="utf-8") as f:
    guides = json.load(f)

print(f"✅ Loaded {len(guides)} guides.")


✅ Loaded 108 guides.


In [23]:
import tiktoken

def chunk_text(text, max_tokens=512):
    """Split text into smaller chunks based on token length."""
    tokenizer = tiktoken.get_encoding("cl100k_base")  # OpenAI tokenizer
    tokens = tokenizer.encode(text)

    chunks = []
    for i in range(0, len(tokens), max_tokens):
        chunk_tokens = tokens[i:i+max_tokens]
        chunk_text = tokenizer.decode(chunk_tokens)
        chunks.append(chunk_text)
    
    return chunks

document_chunks = []
for guide in guides:
    for section in guide["sections"]:
        # Ensure data is always a string
        full_text = section["heading"] + "\n" + "\n".join(
            [ " ".join(c["data"]) if isinstance(c["data"], list) else c["data"] for c in section["content"] ]
        )
        chunks = chunk_text(full_text)

        for chunk in chunks:
            document_chunks.append({
                "text": chunk,
                "title": guide["title"],
                "url": guide["url"]
            })

print(f"✅ Created {len(document_chunks)} text chunks for FAISS.")


✅ Created 1311 text chunks for FAISS.


In [25]:
import json
import faiss
import numpy as np
from langchain.embeddings import HuggingFaceEmbeddings


# Step 2: Load HuggingFace Embeddings
embedding_model = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en")

# Step 3: Generate embeddings for each chunk
texts = [chunk["text"] for chunk in document_chunks]  # Extract only text
embeddings = embedding_model.embed_documents(texts)

# Step 4: Convert embeddings into NumPy array
embeddings_array = np.array(embeddings).astype('float32')

# Step 5: Create FAISS index
d = embeddings_array.shape[1]  # Dimension of embeddings
index = faiss.IndexFlatL2(d)
index.add(embeddings_array)

# Step 6: Save FAISS index
faiss.write_index(index, "faiss_index.bin")

print(f"✅ Stored {len(document_chunks)} text chunks in FAISS.")


✅ Stored 1311 text chunks in FAISS.


In [27]:
def search_faiss(query, k=3):
    """Search FAISS index for the most relevant chunks."""
    
    # Step 1: Generate embedding for the query
    query_embedding = embedding_model.embed_query(query)  # Correct embedding method
    query_embedding = np.array(query_embedding).astype("float32").reshape(1, -1)

    # Step 2: Perform FAISS search
    distances, indices = index.search(query_embedding, k)

    # Step 3: Retrieve results
    results = []
    for idx in indices[0]:
        if idx != -1:  # Ensure valid index
            results.append(document_chunks[idx])  # Retrieve metadata

    return results

# Test search
query = "How to create a project in Overleaf?"
results = search_faiss(query)

# Display search results
print("\n🔍 Search Results:")
for i, res in enumerate(results):
    print(f"{i+1}. {res['title']} ({res['url']})\n{res['text'][:200]}...\n")



🔍 Search Results:
1. Typesetting exams in LaTeX (https://www.overleaf.com/learn/latex/Typesetting_exams_in_LaTeX)
Overleafexamproject example
The above examples have been combined to create a basic project template that you can use as a starting point for your work. You can create a new project by clicking/select...

2. Creating a project from a template (https://www.overleaf.com/learn/how-to/Creating_a_project_from_a_template)
Introduction
To start using Overleaf go towww.overleaf.com.
If you don't have an account enter your e-mail address and set a password, clickRegisterand that's it, you will be redirected to the project...

3. Creating a project from a template (https://www.overleaf.com/learn/how-to/Creating_a_project_from_a_template)
Introduction
To start using Overleaf go towww.overleaf.com.
If you don't have an account enter your e-mail address and set a password, clickRegisterand that's it, you will be redirected to the project...

