In [1]:
import os, glob, hashlib
from langchain_community.document_loaders import TextLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

# Directory passed to Chroma as its persist_directory.
PERSIST_DIR = "./index/chroma"
# Name of the Chroma collection the chunks are written to.
COLLECTION = "artificial_intelligence"
# Model name passed to HuggingFaceEmbeddings.
EMBED = "intfloat/multilingual-e5-small"  # good for Finnish
# Default root folder that load_docs() searches recursively.
CORPUS_DIR = "./corpus"

def load_docs(corpus_dir=CORPUS_DIR):
    """Load every PDF, text and Markdown file found under *corpus_dir*.

    The directory is searched recursively. Files are matched by their
    case-insensitive suffix: ``.pdf`` goes through ``PyPDFLoader`` and
    ``.txt`` / ``.md`` go through ``TextLoader`` (decoded as UTF-8).
    Anything else is ignored.

    Args:
        corpus_dir: Root folder to scan. Defaults to ``CORPUS_DIR``.

    Returns:
        A list with the documents returned by the loaders, in sorted
        path order. Empty when nothing matches.
    """
    pattern = os.path.join(corpus_dir, "**", "*")
    docs = []
    # glob yields entries in filesystem order; sorting keeps the document
    # (and therefore chunk) order reproducible between runs and machines.
    for path in sorted(glob.glob(pattern, recursive=True)):
        # "**/*" also matches directories, which may carry a matching
        # suffix (e.g. a folder named "notes.md"); loaders need real files.
        if not os.path.isfile(path):
            continue
        lowered = path.lower()
        if lowered.endswith(".pdf"):
            docs.extend(PyPDFLoader(path).load())
        elif lowered.endswith((".txt", ".md")):
            docs.extend(TextLoader(path, encoding="utf-8").load())
    return docs

def make_id(text: str) -> str:
    """Return a short, deterministic identifier for *text*.

    Leading and trailing whitespace is ignored. The identifier is the
    first 16 hexadecimal characters of the SHA-1 digest of the UTF-8
    encoded text, so equal inputs always map to the same id.
    """
    normalized = text.strip()
    digest = hashlib.sha1(normalized.encode("utf-8"))
    return digest.hexdigest()[:16]

def _build_records(chunks):
    """Convert split chunks into parallel (texts, metadatas, ids) lists.

    Ids are unique within the returned batch; exact duplicate chunks are
    dropped so the store never receives the same id twice in one call.
    """
    texts, metas, ids = [], [], []
    seen = set()
    for chunk in chunks:
        txt = chunk.page_content
        src = chunk.metadata.get("source", "unknown")
        page = chunk.metadata.get("page")

        # Primary id uses the original scheme (first 200 characters) so that
        # re-indexing an already populated store still overwrites in place.
        cid = make_id(f"{src}-{page}-{txt[:200]}")
        if cid in seen:
            # Another chunk of the same source/page starts with the same
            # 200 characters: disambiguate with the full text instead.
            cid = make_id(f"{src}-{page}-{txt}")
            if cid in seen:
                # Same source, page and text: a true duplicate, skip it.
                continue
        seen.add(cid)

        # Leave "page" out when the loader did not provide one, rather than
        # storing None, which not every Chroma version accepts as a value.
        meta = {"source": src}
        if page is not None:
            meta["page"] = page

        texts.append(txt)
        metas.append(meta)
        ids.append(cid)
    return texts, metas, ids


def main():
    """Build or refresh the Chroma index from the corpus directory.

    Loads the corpus with ``load_docs()``, splits it into overlapping
    chunks, embeds them with the ``EMBED`` model and writes them to the
    ``COLLECTION`` collection stored under ``PERSIST_DIR``. Prints a
    one-line summary. Returns ``None``.
    """
    raw_docs = load_docs()
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=800, chunk_overlap=120
    )
    chunks = splitter.split_documents(raw_docs)
    if not chunks:
        # Nothing to embed: avoid loading the model and sending empty
        # lists to the vector store.
        print(f"No indexable documents found under {CORPUS_DIR}; nothing to index.")
        return

    texts, metas, ids = _build_records(chunks)

    emb = HuggingFaceEmbeddings(model_name=EMBED)

    vs = Chroma(
        collection_name=COLLECTION,
        persist_directory=PERSIST_DIR,
        embedding_function=emb,
    )

    # Re-adding an id that is already stored replaces that entry, so
    # repeated runs over an unchanged corpus do not grow the index.
    vs.add_texts(texts=texts, metadatas=metas, ids=ids)

    # Explicit persistence is only needed by older Chroma setups; newer
    # ones write automatically, so call persist() only if it is available.
    persist = getattr(vs, "persist", None)
    if callable(persist):
        persist()
    print(f"Indexed {len(texts)} chunks into '{COLLECTION}' at {PERSIST_DIR}")

# Run the indexing pipeline only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


  from .autonotebook import tqdm as notebook_tqdm
  emb = HuggingFaceEmbeddings(model_name=EMBED)
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip in

Indexed 508 chunks into 'artificial_intelligence' at ./index/chroma


  vs.persist()


['q', 'w', 'e', 'r']
