In [12]:
from dotenv import load_dotenv
import os

# Load variables via python-dotenv, then look up the Google API key.
# `api_key` is None when GOOGLE_API_KEY is not set in the environment.
load_dotenv()
api_key = os.environ.get("GOOGLE_API_KEY")


In [13]:
from google import genai
import chromadb.utils.embedding_functions as embedding_functions
import chromadb


# Persistent Chroma client whose storage path is "db/".
chroma_client = chromadb.PersistentClient(path="db/")

# The embedding function and the GenAI client are both configured with the
# same API key read earlier in the notebook.
google_ef = embedding_functions.GoogleGenerativeAiEmbeddingFunction(
    api_key=api_key,
)
client = genai.Client(api_key=api_key)


In [14]:
# Fetch the "mkdocsGPT" collection (creating it if it does not exist yet),
# with the Google embedding function attached to it.
collection = chroma_client.get_or_create_collection(
    name="mkdocsGPT",
    embedding_function=google_ef,
)

In [23]:
from langchain_text_splitters import MarkdownHeaderTextSplitter, MarkdownTextSplitter, RecursiveCharacterTextSplitter

# (markdown heading marker, metadata key) pairs for heading levels 1-3.
headers_to_split_on = [("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3")]

# Splitter that breaks a markdown document at the heading levels listed above.
splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

# splitter = RecursiveCharacterTextSplitter(
#     separators=["\n\n", "\n", " ", ""],
#     chunk_size=500,
#     chunk_overlap=50
# )

In [24]:
import os
import pickle
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from tqdm import tqdm

# Directory that is scanned for markdown files.
path = "mkdocs/docs/user-guide"

# Parallel accumulators: entry i of each list describes the same chunk.
all_texts, all_metadatas, all_ids = [], [], []

# loader = DirectoryLoader(
#     path,
#     glob="**/*.md",
#     loader_cls=TextLoader,
#     loader_kwargs={"encoding": "utf-8"}
# )

# raw_docs = loader.load()
# print(f"Loaded {len(raw_docs)} files from User Guide.")

# Collect the markdown files up front:
#   * os.listdir() returns entries in arbitrary order, so sort them to make
#     the order of the accumulated lists reproducible across runs/machines;
#   * filtering before the loop makes the tqdm total reflect only the files
#     that are actually processed.
md_files = sorted(name for name in os.listdir(path) if name.endswith(".md"))

for md_file in tqdm(md_files, desc="Loading Markdown files"):
    loader = TextLoader(os.path.join(path, md_file), encoding="utf-8")
    docs = loader.load()
    # Only the first loaded document is split.
    chunks = splitter.split_text(docs[0].page_content)

    # File name without its extension; used as the prefix of every chunk id.
    stem = os.path.splitext(md_file)[0]

    # Append text, metadata and id together so the three lists stay aligned.
    for i, c in enumerate(chunks):
        meta = c.metadata.copy()  # copy so the chunk's own metadata is untouched
        meta["source"] = md_file

        all_texts.append(c.page_content)
        all_metadatas.append(meta)
        all_ids.append(f"{stem}-c{i}")

# Persist the three parallel lists as a single pickled tuple.
# NOTE: pickle files can execute code when loaded; only unpickle this file
# from a trusted location.
split_payload = (all_texts, all_metadatas, all_ids)
with open("split_data.pkl", "wb") as pkl_file:
    pickle.dump(split_payload, pkl_file)

print("✅ Data saved to split_data.pkl")

Loading Markdown files: 100%|██████████| 9/9 [00:00<00:00, 966.18it/s]

✅ Data saved to split_data.pkl



