In [None]:
# Chonkie Python Examples - Text Chunking for RAG Applications

# First, install Chonkie:
# pip install chonkie
# pip install chonkie[tokenizers]  # For tokenizer support

# Basic example with TokenChunker
from chonkie import TokenChunker
from tokenizers import Tokenizer

# Load the pretrained "gpt2" tokenizer that is handed to the chunker below
tokenizer = Tokenizer.from_pretrained("gpt2")

# Build the token chunker (chunk_size=512, chunk_overlap=50)
chunker = TokenChunker(
    tokenizer=tokenizer,
    chunk_size=512,
    chunk_overlap=50,
)

# Sample passage; the later chunking demos in this script reuse it
text = """
Chonkie is a revolutionary text chunking library designed for RAG applications.
It provides lightning-fast performance with minimal overhead, making it perfect
for production environments. The library offers multiple chunking strategies
including token-based, sentence-based, and semantic chunking approaches.
"""

# Chunk the passage, then print the chunk count and a 100-character preview of each
chunks = chunker.chunk(text)
print(f"Number of chunks: {len(chunks)}")
for position, piece in enumerate(chunks, start=1):
    preview = piece.text[:100]
    print(f"Chunk {position}: {preview}...")

# ============================================================================
# Different Chunking Strategies
# ============================================================================

# 1. Sentence-based chunking
from chonkie import SentenceChunker

# NOTE: SentenceChunker's chunk_size is a per-chunk size budget measured by the
# chunker's tokenizer, not a number of sentences. Grouping by sentence count is
# requested through min_sentences_per_chunk instead.
sentence_chunker = SentenceChunker(
    chunk_size=128,
    # Set explicitly so the overlap never exceeds chunk_size, whatever the
    # library default is in the installed version.
    chunk_overlap=0,
    min_sentences_per_chunk=2,  # at least 2 sentences per chunk
)
sentence_chunks = sentence_chunker.chunk(text)

print("\n--- Sentence Chunking ---")
for i, chunk in enumerate(sentence_chunks):
    print(f"Sentence Chunk {i+1}: {chunk.text}")

# 2. Semantic chunking (optional)
# The import, construction, chunking and printing all live inside the try, so an
# ImportError raised by any of those steps ends up at the fallback message.
try:
    from chonkie import SemanticChunker

    semantic_chunker = SemanticChunker()
    semantic_chunks = semantic_chunker.chunk(text)

    print("\n--- Semantic Chunking ---")
    for number, piece in enumerate(semantic_chunks, start=1):
        snippet = piece.text[:100]  # preview only the first 100 characters
        print(f"Semantic Chunk {number}: {snippet}...")
except ImportError:
    print("\nSemantic chunking requires additional dependencies")

# 3. Hierarchical chunking (optional)
# NOTE(review): confirm that the installed chonkie actually exports
# HierarchicalChunker. If it does not, the import below raises ImportError and
# only the fallback message is ever printed.
try:
    from chonkie import HierarchicalChunker

    hierarchical_chunker = HierarchicalChunker()
    hierarchical_chunks = hierarchical_chunker.chunk(text)

    print("\n--- Hierarchical Chunking ---")
    for number, piece in enumerate(hierarchical_chunks, start=1):
        snippet = piece.text[:100]  # preview only the first 100 characters
        print(f"Hierarchical Chunk {number}: {snippet}...")
except ImportError:
    print("\nHierarchical chunking requires additional dependencies")

# ============================================================================
# Working with longer texts and files
# ============================================================================

def process_document(file_path, chunk_size=512, overlap=50):
    """Read a UTF-8 text file and split its contents with a TokenChunker.

    Args:
        file_path: Path of the text file to read.
        chunk_size: Forwarded to ``TokenChunker`` as ``chunk_size``.
        overlap: Forwarded to ``TokenChunker`` as ``chunk_overlap``.

    Returns:
        The result of ``TokenChunker.chunk`` applied to the file's contents.
    """
    # Pull the whole document into memory first
    with open(file_path, encoding='utf-8') as handle:
        document_text = handle.read()

    # A "gpt2" tokenizer is loaded on every call and handed to the chunker
    gpt2_tokenizer = Tokenizer.from_pretrained("gpt2")
    splitter = TokenChunker(
        tokenizer=gpt2_tokenizer,
        chunk_size=chunk_size,
        chunk_overlap=overlap,
    )

    return splitter.chunk(document_text)

# Example usage (uncomment if you have a text file):
# chunks = process_document("sample_document.txt")
# print(f"Processed document into {len(chunks)} chunks")

# ============================================================================
# Advanced Configuration
# ============================================================================

# Custom tokenizer configuration: use the "bert-base-uncased" tokenizer
# instead of "gpt2".
custom_tokenizer = Tokenizer.from_pretrained("bert-base-uncased")

# TokenChunker is configured through tokenizer, chunk_size and chunk_overlap.
# It has no min_chunk_size option, so passing one fails at construction time
# with a TypeError; the argument has therefore been dropped.
advanced_chunker = TokenChunker(
    tokenizer=custom_tokenizer,
    chunk_size=256,
    chunk_overlap=25,
)

# Process with custom settings
sample_text = """
Natural Language Processing (NLP) is a field of artificial intelligence 
that focuses on the interaction between computers and humans using natural language.
The ultimate objective of NLP is to read, decipher, understand, and make sense 
of human languages in a manner that is valuable. Most NLP techniques rely on 
machine learning to derive meaning from human languages.
"""

advanced_chunks = advanced_chunker.chunk(sample_text)
print("\n--- Advanced Configuration ---")
print(f"Created {len(advanced_chunks)} chunks with custom settings")

# ============================================================================
# Integration with RAG Pipeline
# ============================================================================

def prepare_for_rag(text, chunk_size=400, overlap=40):
    """Chunk ``text`` and return one metadata dict per chunk for a RAG pipeline.

    Args:
        text: The text to split.
        chunk_size: Forwarded to ``TokenChunker`` as ``chunk_size``.
        overlap: Forwarded to ``TokenChunker`` as ``chunk_overlap``.

    Returns:
        A list of dicts, in chunk order, each with the keys ``'id'``
        (``"chunk_<position>"``, zero-based), ``'text'``, ``'start_index'``,
        ``'end_index'`` and ``'token_count'``.
    """
    tokenizer = Tokenizer.from_pretrained("gpt2")
    chunker = TokenChunker(tokenizer=tokenizer, chunk_size=chunk_size, chunk_overlap=overlap)

    # Each chunk already carries the token count computed while chunking, so
    # reuse it rather than running the tokenizer a second time over every
    # chunk's text (extra work whose result may not match the chunker's count).
    return [
        {
            'id': f"chunk_{i}",
            'text': chunk.text,
            'start_index': chunk.start_index,
            'end_index': chunk.end_index,
            'token_count': chunk.token_count,
        }
        for i, chunk in enumerate(chunker.chunk(text))
    ]

# Run the RAG preparation on the sample passage and print one summary line per record
rag_chunks = prepare_for_rag(sample_text)
print("\n--- RAG Preparation ---")
for record in rag_chunks:
    chunk_id = record['id']
    tokens = record['token_count']
    excerpt = record['text'][:60]  # show only the first 60 characters
    print(f"ID: {chunk_id}, Tokens: {tokens}, Text: {excerpt}...")

print("\n🦛 Chonkie makes text chunking simple and fast!")