In [8]:
from typing import List, Dict

def chunk_document(
    document_content: str,
    chunk_size: int,
    overlap: int,
    characters_per_page: int = 2000,
) -> List[Dict]:
    """Split text into fixed-size character chunks that overlap their neighbours.

    Consecutive chunks start ``chunk_size - overlap`` characters apart, so each
    chunk repeats the last ``overlap`` characters of the previous one. The final
    chunk may be shorter than ``chunk_size``.

    Args:
        document_content: Text to split. An empty string yields an empty list.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared between consecutive chunks; must
            satisfy ``0 <= overlap < chunk_size`` so every step moves forward.
        characters_per_page: Number of characters treated as one page when
            estimating ``page_number``. Defaults to 2000.

    Returns:
        A list of dicts, one per chunk, with keys ``"content"`` (the chunk
        text), ``"chunk_index"`` (0-based position in the list) and
        ``"page_number"`` (1-based, derived from the chunk's start offset).

    Raises:
        ValueError: If ``chunk_size``/``overlap`` violate the constraints above
            or ``characters_per_page`` is not positive.
    """
    if chunk_size <= 0 or overlap < 0 or overlap >= chunk_size:
        raise ValueError("Invalid chunk_size or overlap parameters")
    if characters_per_page <= 0:
        raise ValueError("characters_per_page must be a positive integer")

    if not document_content:
        return []

    content_length = len(document_content)
    step = chunk_size - overlap  # >= 1 thanks to the validation above

    chunks = []
    for chunk_index, start_idx in enumerate(range(0, content_length, step)):
        end_idx = min(start_idx + chunk_size, content_length)
        chunks.append({
            "content": document_content[start_idx:end_idx],
            "chunk_index": chunk_index,
            "page_number": (start_idx // characters_per_page) + 1,
        })
        # Once a chunk reaches the end of the text, any further window would
        # only repeat part of the overlap, so stop here.
        if end_idx == content_length:
            break

    return chunks

In [9]:
# Demo input: 20 characters, chunked with a 10-character window and a 3-character overlap.
text = "1234567890ABCDEFGHIJ"
result = chunk_document(text, chunk_size=10, overlap=3)


In [10]:
# Display the chunk dicts returned by chunk_document.
result

[{'content': '1234567890', 'chunk_index': 0, 'page_number': 1},
 {'content': '890ABCDEFG', 'chunk_index': 1, 'page_number': 1},
 {'content': 'EFGHIJ', 'chunk_index': 2, 'page_number': 1}]

In [11]:
from langchain.text_splitter import CharacterTextSplitter
from typing import List, Dict

def chunk_document_TS(
    document_content: str,
    chunk_size: int,
    overlap: int,
    characters_per_page: int = 2000,
) -> List[Dict]:
    """Chunk text with LangChain's ``CharacterTextSplitter`` and tag each chunk.

    The splitter is configured with an empty separator, ``len`` as the length
    function, and the given ``chunk_size`` / ``overlap``.

    Args:
        document_content: Text to split.
        chunk_size: Passed to the splitter as ``chunk_size``.
        overlap: Passed to the splitter as ``chunk_overlap``.
        characters_per_page: Number of characters treated as one page when
            estimating ``page_number``. Defaults to 2000.

    Returns:
        A list of dicts, one per chunk, with keys ``"content"`` (the chunk
        text), ``"chunk_index"`` (0-based position in the list) and
        ``"page_number"`` (1-based, derived from where the chunk is located in
        ``document_content``).

    Raises:
        ValueError: If ``characters_per_page`` is not positive. The splitter
            may raise for parameter combinations it rejects.
    """
    if characters_per_page <= 0:
        raise ValueError("characters_per_page must be a positive integer")

    text_splitter = CharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=overlap,
        length_function=len,
        separator=""
    )
    chunks = text_splitter.split_text(document_content)

    # Locate every chunk in the original text instead of assuming each one
    # advances by exactly len(chunk) - overlap. The splitter can return chunks
    # shorter than the span they came from (it strips surrounding whitespace by
    # default), which would make arithmetic offsets drift and eventually put
    # chunks on the wrong page.
    results = []
    search_from = 0
    for idx, chunk in enumerate(chunks):
        found_at = document_content.find(chunk, search_from)
        # Fall back to the lower bound if the chunk cannot be located verbatim.
        start_idx = found_at if found_at != -1 else search_from
        results.append({
            "content": chunk,
            "chunk_index": idx,
            "page_number": (start_idx // characters_per_page) + 1
        })
        # The next chunk starts after this one's start and no earlier than
        # `overlap` characters before this one's end.
        # NOTE(review): with highly repetitive text, find() may match an earlier
        # identical span than the one the splitter actually emitted.
        search_from = max(start_idx + 1, start_idx + len(chunk) - overlap)
    return results

In [14]:
# Same demo input and settings as the chunk_document example, run through the LangChain-based variant.
text = "1234567890ABCDEFGHIJ"
result2 = chunk_document_TS(text, chunk_size=10, overlap=3)

In [15]:
# Display the chunk dicts returned by chunk_document_TS.
result2

[{'content': '1234567890', 'chunk_index': 0, 'page_number': 1},
 {'content': '890ABCDEFG', 'chunk_index': 1, 'page_number': 1},
 {'content': 'EFGHIJ', 'chunk_index': 2, 'page_number': 1}]