# Set up the Gemini API key and ChromaDB

In [12]:
from dotenv import load_dotenv
import os

# Pull variables from a local .env file (if one exists) into the environment.
load_dotenv()
api_key = os.getenv("GOOGLE_API_KEY")

# Fail fast with a clear message instead of an opaque authentication error
# from a later cell that passes api_key=None to a client.
if not api_key:
    raise RuntimeError(
        "GOOGLE_API_KEY is not set; add it to your environment or .env file."
    )


In [13]:
from google import genai
import chromadb.utils.embedding_functions as embedding_functions
import chromadb


# Gemini client for direct calls to the generative API.
client = genai.Client(api_key=api_key)
# Embedding function handed to Chroma collections; uses the same API key.
google_ef = embedding_functions.GoogleGenerativeAiEmbeddingFunction(api_key=api_key)
# Chroma client whose storage path is the relative "db/" directory.
chroma_client = chromadb.PersistentClient(path="db/")


In [14]:
# Fetch the "mkdocsGPT" collection (creating it if it does not exist yet) and
# attach the Google embedding function to it.
collection = chroma_client.get_or_create_collection(
    name="mkdocsGPT",
    embedding_function=google_ef,
)

# Split the markdown files into chunks

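Chunks are split on Markdown headers (`#`, `##`, `###`), so each chunk is the body of one section and the titles of its enclosing headers are kept in the chunk's metadata.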

In [None]:
from langchain_text_splitters import MarkdownHeaderTextSplitter

# (heading marker, label) pairs for heading levels 1-3, e.g. ("##", "Header 2").
headers_to_split_on = [
    ("#" * level, f"Header {level}") for level in range(1, 4)
]

# Splitter configured with the heading levels listed in headers_to_split_on.
splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)


In [None]:
import os
import pickle
from langchain_community.document_loaders import TextLoader
from tqdm import tqdm

path = "mkdocs/docs/user-guide"

# Three parallel lists: entry i of each one describes the same chunk.
all_texts = []
all_metadatas = []
all_ids = []

# os.listdir() returns entries in arbitrary order; sort so that chunk order
# (and positional lookups such as all_texts[2]) is the same on every run.
# Filtering before tqdm keeps the progress total equal to the number of
# Markdown files actually processed.
md_files = sorted(name for name in os.listdir(path) if name.endswith(".md"))

for md_file in tqdm(md_files, desc="Loading Markdown files"):
    loader = TextLoader(os.path.join(path, md_file), encoding="utf-8")
    docs = loader.load()
    chunks = splitter.split_text(docs[0].page_content)
    stem = os.path.splitext(md_file)[0]

    for i, c in enumerate(chunks):
        # Copy so that adding "source" does not modify the chunk's own metadata dict.
        meta = c.metadata.copy()
        meta["source"] = md_file

        all_texts.append(c.page_content)
        all_metadatas.append(meta)
        # IDs look like "<file stem>-c<chunk index>"; they are unique as long
        # as no two files in the directory share a stem.
        all_ids.append(f"{stem}-c{i}")

# Save to file
with open("split_data.pkl", "wb") as f:
    pickle.dump((all_texts, all_metadatas, all_ids), f)

print("✅ Data saved to split_data.pkl")

Loading Markdown files: 100%|██████████| 9/9 [00:00<00:00, 966.18it/s]

✅ Data saved to split_data.pkl





# Load chunks

In [25]:
import threading
import pickle

# === Load split data ===
# NOTE: pickle.load can execute arbitrary code while deserialising; only open
# a split_data.pkl that this notebook wrote itself.
with open("split_data.pkl", "rb") as f:
    all_texts, all_metadatas, all_ids = pickle.load(f)

# The three lists are parallel (one entry per chunk). Catch a stale or
# hand-edited pickle here rather than when the data is used later.
if not (len(all_texts) == len(all_metadatas) == len(all_ids)):
    raise ValueError(
        "split_data.pkl is inconsistent: "
        f"{len(all_texts)} texts, {len(all_metadatas)} metadatas, {len(all_ids)} ids"
    )
print(f"✅ Loaded {len(all_texts)} chunks.")

# === Thread and batch settings ===
NUM_WORKERS = 4
BATCH_SIZE = 30
total = len(all_texts)


✅ Loaded 74 chunks.


In [45]:
# Spot-check one chunk: its text, metadata and id, all taken at the same index.
all_texts[2], all_metadatas[2], all_ids[2]

("A clone of the default theme used by the [Read the Docs] service, which offers\nthe same restricted feature set as its parent theme. Like its parent theme, only\ntwo levels of navigation are supported.  \n![ReadTheDocs](../img/readthedocs.png)  \nIn addition to the default [theme configuration options][theme], the `readthedocs`\ntheme supports the following options:  \n*   **`highlightjs`**: Enables highlighting of source code in code blocks using\nthe [highlight.js] JavaScript library. Default: `True`.  \n*   **`hljs_languages`**: By default, highlight.js only supports 23 common\nlanguages. List additional languages here to include support for them.  \n```yaml\ntheme:\nname: readthedocs\nhighlightjs: true\nhljs_languages:\n- yaml\n- rust\n```  \n*   **`analytics`**: Defines configuration options for an analytics service.  \n*   **`gtag`**: To enable Google Analytics, set to a Google Analytics v4\ntracking ID, which uses the `G-` format. See Google's documentation to\n[Set up Analyti

# Clean chunks 

In [41]:
import re

def clean_chunk_content(text: str) -> str:
    """
    Applies cleaning rules to Markdown text specifically for MkDocs.

    Steps, in order: drop leading YAML frontmatter, turn admonition headers
    into bold labels, remove content-tab marker lines, reduce inline links and
    images to their text, collapse runs of blank lines, and strip both ends.

    Args:
        text: Markdown text of one chunk.

    Returns:
        The cleaned text (may be an empty string).
    """
    # 1. Remove YAML Frontmatter (--- ... --- at the start)
    # No MULTILINE flag here, so ^ only matches at the very start of the text.
    text = re.sub(r'^---\n.*?\n---\n', '', text, flags=re.DOTALL)

    # 2. Normalize Admonitions (MkDocs specific syntax)
    # Converts: !!! note "Important Title" -> **Note: Important Title**
    def replace_admonition(match):
        type_name = match.group(1).capitalize()
        # group(2) still carries its surrounding quotes, or is None if absent.
        title = (match.group(2) or '').strip('"\'')
        # A missing or explicitly empty ("") title yields just the type label,
        # never a dangling "**Note: **".
        if title:
            return f"\n**{type_name}: {title}**\n"
        return f"\n**{type_name}**\n"

    # Anchored to the start of a line and limited to horizontal whitespace, so
    # prose such as "Wait!!! really" is left untouched and a quoted string on
    # the following line is never mistaken for the admonition title.
    text = re.sub(
        r'^[ \t]*!!![ \t]+(\w+)(?:[ \t]+(".*?"))?',
        replace_admonition,
        text,
        flags=re.MULTILINE,
    )

    # 3. Remove MkDocs Content Tabs (e.g., === "Tab Title")
    # These are structural and often add noise to the embedding
    text = re.sub(r'^[ \t]*===\s+".*?"\s*$', '', text, flags=re.MULTILINE)

    # 4. Simplify Links and Images
    # Images first, so the leading "!" is not left behind by the link rule.
    # Images: ![Alt Text](url) -> Alt Text
    text = re.sub(r'!\[([^\]]*)\]\([^)]+\)', r'\1', text)
    # Links: [Link Text](url) -> Link Text
    text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text)

    # 5. Collapse excessive newlines
    text = re.sub(r'\n{3,}', '\n\n', text)

    # 6. Strip leading/trailing whitespace
    text = text.strip()

    return text

In [42]:
# Chunks whose cleaned text has this many characters or fewer are dropped.
MIN_CHUNK_CHARS = 10

cleaned_texts = []
# Metadata and ids are filtered together with the texts, so index i keeps
# referring to the same chunk in all three cleaned lists even when some
# chunks are dropped.
cleaned_metadatas = []
cleaned_ids = []

# all_texts, all_metadatas and all_ids are parallel lists (one entry per chunk).
for original_text, metadata, chunk_id in zip(all_texts, all_metadatas, all_ids):
    cleaned_text = clean_chunk_content(original_text)

    # Only keep the chunk if it still has content after cleaning
    if len(cleaned_text) > MIN_CHUNK_CHARS:
        cleaned_texts.append(cleaned_text)
        cleaned_metadatas.append(metadata)
        cleaned_ids.append(chunk_id)

print("Cleaning complete.")
print(f"Original chunks: {len(all_texts)}")
print(f"Cleaned chunks:  {len(cleaned_texts)}")

Cleaning complete.
Original chunks: 74
Cleaned chunks:  74


In [44]:
# Compare the raw and cleaned text at index 2. The cleaning loop skips chunks
# whose cleaned text has 10 or fewer characters, so the two lists share
# indices only while no earlier chunk has been dropped.
all_texts[2], cleaned_texts[2]

("A clone of the default theme used by the [Read the Docs] service, which offers\nthe same restricted feature set as its parent theme. Like its parent theme, only\ntwo levels of navigation are supported.  \n![ReadTheDocs](../img/readthedocs.png)  \nIn addition to the default [theme configuration options][theme], the `readthedocs`\ntheme supports the following options:  \n*   **`highlightjs`**: Enables highlighting of source code in code blocks using\nthe [highlight.js] JavaScript library. Default: `True`.  \n*   **`hljs_languages`**: By default, highlight.js only supports 23 common\nlanguages. List additional languages here to include support for them.  \n```yaml\ntheme:\nname: readthedocs\nhighlightjs: true\nhljs_languages:\n- yaml\n- rust\n```  \n*   **`analytics`**: Defines configuration options for an analytics service.  \n*   **`gtag`**: To enable Google Analytics, set to a Google Analytics v4\ntracking ID, which uses the `G-` format. See Google's documentation to\n[Set up Analyti