In [1]:
from langchain_text_splitters import (
    RecursiveCharacterTextSplitter,
    CharacterTextSplitter
)
from langchain_community.document_loaders import PyPDFLoader
# Open the PDF with PyPDFLoader and read it into `documents`.
# In the recorded run this produced 13 entries for the 13-page file, so it
# appears to yield one entry per PDF page.
loader = PyPDFLoader("test.pdf")
documents = loader.load()

# Sanity check: report how many entries were loaded.
print(f"document loader with success number of pages: {len(documents)} pages")

document loader with success number of pages: 13 pages


In [2]:
# Stitch the text of every loaded page together, separated by single spaces,
# to get a feel for the size of the whole document.
full_text = " ".join(page.page_content for page in documents)

# Character count plus a rough token estimate (about 4 characters per token).
print(f"Total characters: {len(full_text)}")
print(f"Approximate tokens (chars/4): {len(full_text) / 4}")
print("\nüí° This is why we need to split documents into chunks")

Total characters: 3219
Approximate tokens (chars/4): 804.75

üí° This is why we need to split documents into chunks


In [3]:
# Baseline splitter: cut only on blank lines ("\n\n"), with a chunk_size of
# 1000 and a chunk_overlap of 200.
text_splitter = CharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    separator="\n\n",
)

# Run the splitter over the loaded pages.
chunks = text_splitter.split_documents(documents)

# Report the chunk count, then peek at the first two chunks.
print(f"‚úÇÔ∏è  Split into {len(chunks)} chunks")
print("\nüì¶ First chunk:")
print(chunks[0].page_content)
print("\nüì¶ Second chunk:")
print(chunks[1].page_content)

‚úÇÔ∏è  Split into 13 chunks

üì¶ First chunk:
Nagle's algorithm
Delay in the client side
husseinnasser

üì¶ Second chunk:
Nigel Algorithm
‚óè In the telnet days sending a single byte in a segment is a waste
‚óè Combine small segments and send them in a single one
‚óè The client can wait for a full MSS before sending the segment
‚óè No wasted 40 bytes header (IP + TCP) for few bytes of data
husseinnasser


In [4]:
# General-purpose splitter (the best choice for most use cases).
# The separators are tried in the order listed.
recursive_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,  # chunk length is measured with len()
)

# Run the recursive splitter over the same loaded pages.
recursive_chunks = recursive_splitter.split_documents(documents)

# Report the chunk count and show the first chunk as a sample.
print(f"‚úÇÔ∏è  Split into {len(recursive_chunks)} chunks")
print("\nüì¶ Sample chunk:")
print(recursive_chunks[0].page_content)

‚úÇÔ∏è  Split into 13 chunks

üì¶ Sample chunk:
Nagle's algorithm
Delay in the client side
husseinnasser


In [5]:
# Take the first chunk from the recursive split to inspect its metadata.
chunk = recursive_chunks[0]

# Print the first source document's metadata next to the chunk's metadata;
# in the recorded run the two dicts are identical.
print("Original document metadata:", documents[0].metadata)
print("Chunk metadata:", chunk.metadata)

Original document metadata: {'producer': 'PyPDF', 'creator': 'Google', 'creationdate': '', 'title': "Nagle's algorithm", 'source': 'test.pdf', 'total_pages': 13, 'page': 0, 'page_label': '1'}
Chunk metadata: {'producer': 'PyPDF', 'creator': 'Google', 'creationdate': '', 'title': "Nagle's algorithm", 'source': 'test.pdf', 'total_pages': 13, 'page': 0, 'page_label': '1'}


In [6]:
def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    """Chunk ``documents`` with a RecursiveCharacterTextSplitter.

    Args:
        documents: Documents to split; handed unchanged to the splitter's
            ``split_documents`` method.
        chunk_size: Forwarded as the splitter's ``chunk_size`` (default 1000).
        chunk_overlap: Forwarded as the splitter's ``chunk_overlap``
            (default 200).

    Returns:
        Whatever the splitter's ``split_documents`` call returns.
    """
    # Separators handed to the splitter, in priority order.
    fallback_separators = ["\n\n", "\n", " ", ""]
    return RecursiveCharacterTextSplitter(
        separators=fallback_separators,
        chunk_overlap=chunk_overlap,
        chunk_size=chunk_size,
    ).split_documents(documents)

In [None]:
chunks = split_documents(