In [50]:
import os

from dotenv import load_dotenv
from langchain_anthropic import ChatAnthropic
from langchain_chroma import Chroma
from langchain_core.messages import SystemMessage, HumanMessage
from langchain_huggingface import HuggingFaceEmbeddings


In [51]:
import os
import glob
from langchain_community.document_loaders import DirectoryLoader, TextLoader

# Load every Markdown file under the knowledge base into `documents`, tagging
# each one with the name of the top-level folder it came from.

# Path to your local knowledge base (relative to the notebook's working directory)
KNOWLEDGE_BASE_PATH = "knowledge-base"

# A wrong path would otherwise just report "Loaded 0 documents" with no hint why.
if not os.path.isdir(KNOWLEDGE_BASE_PATH):
    print(
        "Warning: knowledge base directory not found: "
        f"{os.path.abspath(KNOWLEDGE_BASE_PATH)}"
    )

# Find all entries inside the knowledge base. glob returns matches in arbitrary
# order, so sort them to make the document order reproducible across runs.
folders = sorted(glob.glob(os.path.join(KNOWLEDGE_BASE_PATH, "*")))

documents = []

# Loop through each folder and load Markdown files
for folder in folders:
    if not os.path.isdir(folder):
        continue  # skip files, only process folders

    doc_type = os.path.basename(folder)
    loader = DirectoryLoader(
        folder,
        glob="**/*.md",              # load all markdown files recursively
        loader_cls=TextLoader,       # use simple text loader
        loader_kwargs={"encoding": "utf-8"},
    )

    # Sort by source path so the order inside a folder is stable as well.
    folder_docs = sorted(
        loader.load(),
        key=lambda doc: doc.metadata.get("source", ""),
    )

    # Tag each document with folder name as doc_type
    for doc in folder_docs:
        doc.metadata["doc_type"] = doc_type
        documents.append(doc)

print(f"Loaded {len(documents)} documents")

Loaded 76 documents


In [52]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
# Split the loaded documents into overlapping chunks, embed them, and expose
# the persisted Chroma collection as a retriever.

# Directory where Chroma persists the index. This name was referenced but never
# defined anywhere in the notebook, which raised NameError on a fresh kernel.
# NOTE(review): "vector_db" is an assumed name — point it at your existing index if you have one.
DB_NAME = "vector_db"

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1_000,    # maximum characters per chunk
    chunk_overlap=200,   # characters shared between neighbouring chunks
)
chunks = text_splitter.split_documents(documents)

embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Constructing Chroma with a persist directory only opens (or creates) the
# collection on disk; it does not index anything by itself.
vector_store = Chroma(
    persist_directory=DB_NAME,
    embedding_function=embeddings,
)

# Index the chunks only when the collection is still empty. Previously `chunks`
# was computed but never stored, so retrieval silently depended on an index
# built in some earlier session. The emptiness check keeps re-runs of this cell
# from inserting duplicate copies of every chunk.
# NOTE: if the knowledge base changes, delete DB_NAME to force a rebuild.
if chunks and not vector_store.get(limit=1)["ids"]:
    vector_store.add_documents(chunks)

# Retriever with the library's default search settings.
retriever = vector_store.as_retriever()

In [54]:


# Answer a single question with retrieval-augmented generation: fetch the most
# relevant chunks, place them in the system prompt, and ask the chat model.

# Load credentials from a local .env file if one exists. `load_dotenv` was
# imported at the top of the notebook but never called, so on a fresh kernel
# the key was only available if it had been exported some other way.
# NOTE(review): ChatAnthropic is expected to read ANTHROPIC_API_KEY from the environment — confirm.
load_dotenv()

llm = ChatAnthropic(
    model="claude-haiku-4-5",
    temperature=0.2,
)

# The {context} placeholder is filled with the retrieved chunk text below.
SYSTEM_PROMPT_TEMPLATE = """
You are a knowledgeable, friendly assistant representing the company Insurellm.
You are chatting with a user about Insurellm.

Use ONLY the provided context to answer the question.
If the answer is not present in the context, say: "I don't know based on the provided context."

Context:
{context}
"""
QUERY = "Who is Vice President of Sales."

# Retrieve supporting chunks and join their text with blank lines between them.
retrieved_docs = retriever.invoke(QUERY)
context = "\n\n".join(doc.page_content for doc in retrieved_docs)

system_prompt = SYSTEM_PROMPT_TEMPLATE.format(context=context)

# The context goes in the system message; the user message carries only the question.
messages = [
    SystemMessage(content=system_prompt.strip()),
    HumanMessage(content=QUERY),
]

response = llm.invoke(messages)

# Only the first retrieved document's metadata is shown; the answer itself may
# come from any of the retrieved chunks.
print("\nTop retrieved document metadata:")
print(retrieved_docs[0].metadata if retrieved_docs else "No results found")

print("\nLLM response:")
print(response.content)


Top retrieved document metadata:
{'source': "/content/drive/MyDrive/knowledge-base/employees/Michael O'Brien.md", 'doc_type': 'employees'}

LLM response:
Based on the provided context, **Sarah Chen** is the Vice President of Sales at Insurellm, Inc.
