In [31]:
from pathlib import Path
from typing import List, Optional, Tuple

from dotenv import load_dotenv
from pydantic import BaseModel, Field
from chromadb import PersistentClient
from tqdm import tqdm

from langchain_anthropic import ChatAnthropic
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.messages import SystemMessage, HumanMessage, AIMessage, BaseMessage


load_dotenv(override=True)

MODEL = "claude-haiku-4-5"
DB_NAME = "preprocessed_db"
collection_name = "docs"
embedding_model = "all-MiniLM-L6-v2"
KNOWLEDGE_BASE_PATH = Path("knowledge-base")
AVERAGE_CHUNK_SIZE = 500
RETRIEVAL_K = 10


llm = ChatAnthropic(
    model=MODEL,
    temperature=0,
    max_tokens=5000,
)

In [3]:
class Result(BaseModel):
    """Represents a retrievable chunk plus metadata."""

    page_content: str
    metadata: dict


class Chunk(BaseModel):
    """Represents one generated chunk from an input document."""

    headline: str = Field(
        description="A brief heading for this chunk, typically a few words, that is most likely to be surfaced in a query"
    )
    summary: str = Field(
        description="A few sentences summarizing the content of this chunk to answer common questions"
    )
    original_text: str = Field(
        description="The original text of this chunk from the provided document, exactly as is, not changed in any way"
    )

    def as_result(self, document: dict) -> Result:
        """Convert a generated chunk into a retrievable result object."""
        metadata = {"source": document["source"], "type": document["type"]}
        page_content = f"{self.headline}\n\n{self.summary}\n\n{self.original_text}"
        return Result(page_content=page_content, metadata=metadata)


class Chunks(BaseModel):
    """Container model used for structured chunk extraction."""

    chunks: list[Chunk]


class RankOrder(BaseModel):
    """Represents reranked chunk ids in descending relevance order."""

    order: list[int] = Field(
        description="The order of relevance of chunks, from most relevant to least relevant, by chunk id number"
    )

In [4]:
def fetch_documents() -> list[dict]:
    """Load markdown documents from the knowledge base directory."""
    documents = []

    for folder in KNOWLEDGE_BASE_PATH.iterdir():
        doc_type = folder.name
        for file in folder.rglob("*.md"):
            with open(file, "r", encoding="utf-8") as file_handle:
                documents.append(
                    {
                        "type": doc_type,
                        "source": file.as_posix(),
                        "text": file_handle.read(),
                    }
                )

    print(f"Loaded {len(documents)} documents")
    return documents

In [28]:
def make_prompt(document: dict) -> str:
    """Build the prompt used to split one source document into overlapping chunks."""
    how_many = (len(document["text"]) // AVERAGE_CHUNK_SIZE) + 1
    return f"""
You take a document and you split the document into overlapping chunks for a KnowledgeBase.

The document is from the shared drive of a company called Insurellm.
The document is of type: {document["type"]}
The document has been retrieved from: {document["source"]}

A chatbot will use these chunks to answer questions about the company.
You should divide up the document as you see fit, being sure that the entire document is returned in the chunks - don't leave anything out.
This document should probably be split into {how_many} chunks, but you can have more or less as appropriate.
There should be overlap between the chunks as appropriate; typically about 25% overlap or about 50 words, so you have the same text in multiple chunks for best retrieval results.

For each chunk, you should provide a headline, a summary, and the original text of the chunk.
Together your chunks should represent the entire document with overlap.

Here is the document:

{document["text"]}

Respond with the chunks.
"""


def make_messages(document: dict) -> list[dict]:
    """Create message payload for the chunking LLM call."""
    return [{"role": "user", "content": make_prompt(document)}]


def process_document(document: dict) -> list[Result]:
    """Generate structured chunks for a single document and return retrievable results."""
    structured_llm = llm.with_structured_output(Chunks)
    messages = make_messages(document)
    doc_as_chunks_obj = structured_llm.invoke(messages)
    return [chunk.as_result(document) for chunk in doc_as_chunks_obj.chunks]


def create_chunks(documents: list[dict]) -> list[Result]:
    """Process all loaded documents into chunk results."""
    chunks = []
    for document in tqdm(documents):
        chunks.extend(process_document(document))
    return chunks



In [18]:
documents = fetch_documents()
print(create_chunks(documents[:1]))

Loaded 76 documents


100%|██████████| 1/1 [00:07<00:00,  7.22s/it]

[Result(page_content='Company Overview and Structure\n\nInsurellm is an innovative insurance tech firm founded in 2015 with 32 employees operating primarily remotely across the US. The company has offices in San Francisco (HQ), New York, Austin, Chicago, and Denver, and has evolved from a high-growth startup to a lean, profitable operation focused on sustainable growth and operational excellence.\n\n# Overview of Insurellm\n\nInsurellm is an innovative insurance tech firm with 32 employees operating primarily remotely across the US, with offices in San Francisco (HQ), New York, Austin, Chicago, and Denver.\n\nFounded in 2015, the company has evolved from a high-growth startup to a lean, profitable operation focused on sustainable growth and operational excellence.', metadata={'source': 'knowledge-base/company/overview.md', 'type': 'company'})]





In [19]:
def create_embeddings(chunks: list[Result]) -> None:
    """Embed chunk texts and store them in a persistent ChromaDB collection."""
    chroma = PersistentClient(path=DB_NAME)
    if collection_name in [collection.name for collection in chroma.list_collections()]:
        chroma.delete_collection(collection_name)

    texts = [chunk.page_content for chunk in chunks]
    embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
    vectors = embeddings.embed_documents(texts)

    collection = chroma.get_or_create_collection(collection_name)
    ids = [str(index) for index in range(len(chunks))]
    metadatas = [chunk.metadata for chunk in chunks]

    collection.add(ids=ids, embeddings=vectors, documents=texts, metadatas=metadatas)
    print(f"Vectorstore created with {collection.count()} documents")

In [20]:
def fetch_context_unranked(question: str) -> list[Result]:
    """Retrieve top-k nearest chunks from ChromaDB before reranking."""
    embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
    query_vector = embeddings.embed_query(question)

    chroma = PersistentClient(path=DB_NAME)
    collection = chroma.get_or_create_collection(collection_name)
    results = collection.query(query_embeddings=[query_vector], n_results=RETRIEVAL_K)

    chunks = []
    for document, metadata in zip(results["documents"][0], results["metadatas"][0]):
        chunks.append(Result(page_content=document, metadata=metadata))
    return chunks


def rerank(question: str, chunks: List[Result]) -> List[Result]:
    """Rerank retrieved chunks by relevance using Claude structured output."""
    system_prompt = """
You are a document re-ranker.
You are provided with a question and a list of relevant chunks of text from a query of a knowledge base.
The chunks are provided in the order they were retrieved; this should be approximately ordered by relevance,
but you may be able to improve on that.

You must rank order the provided chunks by relevance to the question, with the most relevant chunk first.

Reply ONLY with valid JSON in the following format:
{"order": [<chunk_id_1>, <chunk_id_2>, ...]}

Include all the chunk ids you are provided with, reranked.
"""

    user_prompt_lines = [
        f"The user has asked the following question:\n\n{question}\n",
        "Order all the chunks of text by relevance to the question, from most relevant to least relevant.",
        "Here are the chunks:\n",
    ]

    for index, chunk in enumerate(chunks, start=1):
        user_prompt_lines.append(f"# CHUNK ID: {index}\n\n{chunk.page_content}\n")

    user_prompt = "\n".join(user_prompt_lines)
    messages = [
        SystemMessage(content=system_prompt.strip()),
        HumanMessage(content=user_prompt.strip()),
    ]

    structured_llm = llm.with_structured_output(RankOrder)
    rank_order = structured_llm.invoke(messages)
    order = rank_order.order
    return [chunks[index - 1] for index in order]


def fetch_context(question: str) -> list[Result]:
    """Retrieve and rerank chunks for a user question."""
    chunks = fetch_context_unranked(question)
    return rerank(question, chunks)

In [21]:
def rewrite_query(question: str, history: Optional[List[dict]] = None) -> str:
    """Rewrite a user question into a focused knowledge-base retrieval query."""
    if history is None:
        history = []

    history_str = ""
    if history:
        lines = []
        for message in history:
            role = "User" if message.get("role", "").lower() == "user" else "Assistant"
            content = message.get("content", "").strip()
            if content:
                lines.append(f"{role}: {content}")
        history_str = "\n".join(lines)
    else:
        history_str = "(no history)"

    system_content = f"""
You are in a conversation with a user, answering questions about the company Insurellm.
You are about to look up information in a Knowledge Base to answer the user's question.

This is the history of your conversation so far with the user:
{history_str}

And this is the user's current question:
{question}

Respond only with a single, refined question that you will use to search the Knowledge Base.
It should be a VERY short specific question most likely to surface content. Focus on the question details.
Don't mention the company name unless it's a general question about the company.
IMPORTANT: Respond ONLY with the knowledgebase query, nothing else.
"""

    messages = [
        SystemMessage(content=system_content),
        HumanMessage(content="Please rewrite now."),
    ]

    response = llm.invoke(messages)
    query = response.content.strip()

    for bad_start in ["Query:", "Search:", '"', "Refined query:", "Here is the query:"]:
        if query.startswith(bad_start):
            query = query[len(bad_start):].strip()

    return query.strip('"').strip()

In [22]:
SYSTEM_PROMPT = """
You are a knowledgeable, friendly assistant representing the company Insurellm.
You are chatting with a user about Insurellm.
Your answer will be evaluated for accuracy, relevance and completeness, so make sure it only answers the question and fully answers it.
If you don't know the answer, say so.
For context, here are specific extracts from the Knowledge Base that might be directly relevant to the user's question:
{context}

With this context, please answer the user's question. Be accurate, relevant and complete.
"""


def make_rag_messages(question: str, history: List[dict], chunks: List[Result]) -> List[dict]:
    """Create chat messages for final answer generation using retrieved context."""
    context = "\n\n".join(
        f"Extract from {chunk.metadata['source']}:\n{chunk.page_content}" for chunk in chunks
    )
    system_prompt = SYSTEM_PROMPT.format(context=context)
    return [{"role": "system", "content": system_prompt}] + history + [{"role": "user", "content": question}]


def answer_question(question: str, history: Optional[List[dict]] = None) -> Tuple[str, List[Result]]:
    """Run full RAG flow: rewrite, retrieve, rerank, and answer."""
    if history is None:
        history = []

    query = rewrite_query(question, history)
    chunks = fetch_context(query)
    message_dicts = make_rag_messages(question, history, chunks)

    messages: List[BaseMessage] = []
    for message in message_dicts:
        role = message.get("role", "").lower()
        content = message.get("content", "")
        if role == "system":
            messages.append(SystemMessage(content=content))
        elif role == "user":
            messages.append(HumanMessage(content=content))
        elif role == "assistant":
            messages.append(AIMessage(content=content))

    response = llm.invoke(messages)
    answer = response.content.strip()
    return answer, chunks

In [32]:
def run_pipeline_demo() -> None:
    """Build the knowledge base and run two example RAG questions."""
    documents = fetch_documents()
    print(documents[:5])
    chunks = create_chunks(documents[:5])
    create_embeddings(chunks)

    demo_questions = [
        "Tell me about the Insurellm?",
        "Who went to Manchester University?",
    ]

    for question in demo_questions:
        answer, _ = answer_question(question, [])
        print(f"\nQuestion: {question}")
        print(f"Answer: {answer}\n")


run_pipeline_demo()

Loaded 76 documents
[{'type': 'company', 'source': 'knowledge-base/company/overview.md', 'text': "# Overview of Insurellm\n\nInsurellm is an innovative insurance tech firm with 32 employees operating primarily remotely across the US, with offices in San Francisco (HQ), New York, Austin, Chicago, and Denver.\n\nFounded in 2015, the company has evolved from a high-growth startup to a lean, profitable operation focused on sustainable growth and operational excellence.\n\n## Products\n\nInsurellm offers 8 insurance software products across multiple insurance lines:\n\n### Core Insurance Portals\n- **Carllm** - Auto insurance platform for insurers\n- **Homellm** - Home insurance platform for insurers\n- **Lifellm** - Life insurance platform with AI-powered underwriting\n- **Healthllm** - Comprehensive health insurance platform\n- **Bizllm** - Commercial insurance platform for business coverage\n\n### Marketplace & Infrastructure\n- **Markellm** - Marketplace connecting consumers with insura

  0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 5/5 [00:48<00:00,  9.79s/it]


Vectorstore created with 41 documents

Question: Tell me about the Insurellm?
Answer: # About Insurellm

Insurellm is an innovative insurance technology company that's revolutionizing the insurance industry through cutting-edge software solutions.

## Company Overview

**Founded:** 2015 by Avery Lancaster

**Current Size:** 32 employees operating primarily remotely across the US

**Headquarters:** San Francisco, with satellite offices in New York, Austin, Chicago, and Denver

## Vision & Mission

**Vision:** To revolutionize the insurance industry through innovative technology that makes insurance accessible, transparent, and effortless for everyone.

**Mission:** We empower insurance providers and consumers with cutting-edge software solutions that streamline processes, enhance customer experiences, and drive meaningful connections in the insurance marketplace.

## Products & Services

Insurellm offers **8 comprehensive insurance software products** across multiple insurance lines:

-