### Setup

In [None]:
import os
# import litellm
from pathlib import Path
from openai import OpenAI
from dotenv import load_dotenv
from pydantic import BaseModel, Field
from chromadb import PersistentClient
from tqdm import tqdm
from litellm import completion
import numpy as np
from sklearn.manifold import TSNE
import plotly.graph_objects as go
from ollama import Client
from utils import fetch_documents


# Load variables from a .env file into the process environment; override=True
# asks python-dotenv to let .env values replace variables that are already set.
load_dotenv(override=True)
# litellm.enable_json_schema_validation = True

# Settings for an Ollama-hosted model. In the code visible in this notebook
# section no active statement reads these three names.
MODEL="ollama/gpt-oss:120b"
api_base="https://ollama.com"
# NOTE(review): os.environ.get returns None when OLLAMA_API_KEY is unset, and
# the key is passed as-is (no "Bearer " prefix) - confirm this matches what the
# Ollama endpoint expects before relying on these headers.
extra_headers={
    "Authorization": os.environ.get('OLLAMA_API_KEY')
}


In [None]:
# Inspired by LangChain's Document - let's have something similar
#
# Result pairs a piece of text (page_content) with a metadata dict.
# Documented with comments rather than a class docstring so the pydantic
# model (and any JSON schema derived from it) stays exactly as it was.

class Result(BaseModel):
    page_content: str  # Chunk.as_result fills this with headline, summary and original text
    metadata: dict  # Chunk.as_result stores the document's "source" and "type" here

# One chunk of a document: a headline, a summary and the untouched original text.
# The Field descriptions below are part of the schema handed to the LLM
# (process_document passes Chunks as response_format), so they are runtime text.
# No class docstring on purpose: pydantic would add it to that schema as well.

class Chunk(BaseModel):
    headline: str = Field(description="A brief heading for this chunk, typically a few words, that is most likely to be surfaced in a query")
    summary: str = Field(description="A few sentences summarizing the content of this chunk to answer common questions")
    original_text: str = Field(description="The original text of this chunk from the provided document, exactly as is, not changed in any way")

    def as_result(self, document) -> "Result":
        """Convert this chunk into a Result tagged with its parent document.

        Args:
            document: Mapping that provides the "source" and "type" keys of
                the document this chunk was cut from.

        Returns:
            A Result whose page_content is headline, summary and original
            text separated by blank lines, and whose metadata carries the
            document's "source" and "type".

        Raises:
            KeyError: If *document* lacks "source" or "type".
        """
        provenance = {key: document[key] for key in ("source", "type")}
        sections = (self.headline, self.summary, self.original_text)
        return Result(page_content="\n\n".join(sections), metadata=provenance)


# Wrapper model around a list of Chunk objects. process_document passes this
# class to completion() as response_format and parses the reply with it.
# (Comment instead of a docstring so the generated JSON schema is unchanged.)
class Chunks(BaseModel):
    chunks: list[Chunk]

In [None]:
# Load the documents with fetch_documents (imported from utils). Later cells
# index the result and read each document's "text", "type" and "source" keys.
documents = fetch_documents()

In [None]:
# Fallback chunk size (in characters of document text) used when the notebook
# does not define AVERAGE_CHUNK_SIZE itself.
# NOTE(review): 500 is an assumed default - adjust to the intended chunk size.
_DEFAULT_AVERAGE_CHUNK_SIZE = 500


def make_prompt(document, average_chunk_size=None):
    """Build the LLM prompt that asks for *document* to be split into chunks.

    Args:
        document: Mapping with the keys "text", "type" and "source".
        average_chunk_size: Target number of characters per chunk, used only
            to suggest a chunk count in the prompt. When None, a module-level
            AVERAGE_CHUNK_SIZE is used if one has been defined; otherwise
            _DEFAULT_AVERAGE_CHUNK_SIZE applies.

    Returns:
        The prompt text, including the full document text.

    Raises:
        KeyError: If *document* lacks one of the required keys.
        ValueError: If the effective chunk size is not positive.
    """
    if average_chunk_size is None:
        # Looked up at call time so a constant defined in another cell is
        # honoured, while a missing constant no longer raises NameError.
        average_chunk_size = globals().get("AVERAGE_CHUNK_SIZE", _DEFAULT_AVERAGE_CHUNK_SIZE)
    if average_chunk_size <= 0:
        raise ValueError("average_chunk_size must be a positive number")

    # Always suggest at least one chunk, even for an empty document.
    how_many = (len(document["text"]) // average_chunk_size) + 1
    return f"""
    You take a document and you split the document into overlapping chunks for a KnowledgeBase.

    The document is from the shared drive of a company called Insurellm.
    The document is of type: {document["type"]}
    The document has been retrieved from: {document["source"]}

    A chatbot will use these chunks to answer questions about the company.
    You should divide up the document as you see fit, being sure that the entire document is returned in the chunks - don't leave anything out.
    This document should probably be split into {how_many} chunks, but you can have more or less as appropriate.
    There should be overlap between the chunks as appropriate; typically about 25% overlap or about 50 words, so you have the same text in multiple chunks for best retrieval results.

    For each chunk, you should provide a headline, a summary, and the original text of the chunk.
    Together your chunks should represent the entire document with overlap.

    Here is the document:

    {document["text"]}

    Respond with the chunks.
    """

In [None]:
# Preview the prompt that make_prompt builds for the first document.
print(make_prompt(documents[0]))

In [None]:
def make_messages(document):
    """Return the chat messages for *document*.

    The list holds a single "user" message whose content is the prompt
    produced by make_prompt for the same document.
    """
    user_message = {"role": "user", "content": make_prompt(document)}
    return [user_message]

In [None]:
# Preview the message list that make_messages builds for the first document.
print(make_messages(documents[0]))

In [None]:
def process_document(document):
    """Have the LLM split *document* into chunks and return them as Results.

    The messages from make_messages are sent to 'gpt-4.1-nano' through
    litellm's completion() with Chunks as the response format. The reply's
    JSON content is validated against Chunks, and every chunk is converted
    with Chunk.as_result.

    Args:
        document: Mapping with the "text", "type" and "source" keys used by
            make_prompt and Chunk.as_result.

    Returns:
        A list with one Result per chunk in the model's reply.
    """
    # Alternative Ollama-hosted calls tried earlier, kept for reference:
    # response = completion(model=MODEL, messages=messages, api_base=api_base, extra_headers=extra_headers, response_format=Chunks)
    # response = completion(model='ollama/deepseek-v3.1:671b-cloud', messages=messages, api_base=api_base, extra_headers=extra_headers, response_format=Chunks, format=Chunks.model_json_schema())
    llm_response = completion(
        model='gpt-4.1-nano',
        messages=make_messages(document),
        response_format=Chunks,
    )
    raw_json = llm_response.choices[0].message.content
    parsed = Chunks.model_validate_json(raw_json)

    results = []
    for chunk in parsed.chunks:
        results.append(chunk.as_result(document))
    return results

In [None]:
# Try the chunking pipeline on the first document (this issues an LLM request).
process_document(documents[0])