## Overview

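The `TextChunker` converts parsed documents into section-level semantic chunks:
- Token-based splitting with configurable overlap (token counts are estimated at roughly 4 characters per token)
- Respects section boundaries: a chunk never spans more than one section
- Collects the demo file paths and image sources referenced by each section
- Walks the section hierarchy recursively (`children` or `subsections`), recording each section's path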

## Implementation

In [None]:
from dataclasses import dataclass
from typing import List, Optional
import re


@dataclass
class Chunk:
    """A piece of documentation text taken from a single section.

    A section normally yields one chunk; a section whose text exceeds the
    chunker's token budget yields several consecutive chunks that share the
    same title, path, demo files and images.
    """
    # Component the source document describes.
    component: str
    # Title of the section the text came from, as found in the document.
    section_title: Optional[str]
    # Ancestor titles and this section's title joined with " > ".
    section_path: Optional[str]
    # The chunk text itself.
    content: str
    # Demo file references found in the section.
    demo_files: List[str]
    # Image sources found in the section.
    images: List[str]
    # Estimated token count of ``content`` (see TextChunker.estimate_tokens).
    token_count: int


class TextChunker:
    """Semantic text chunking based on document sections.

    Every section with text becomes at least one chunk. Text whose estimated
    token count exceeds ``max_chunk_tokens`` is split into several chunks,
    with consecutive chunks sharing up to ``overlap_tokens`` tokens of text.
    """

    # Heuristic used throughout: roughly four characters per token.
    CHARS_PER_TOKEN = 4

    def __init__(self, max_chunk_tokens: int = 512, overlap_tokens: int = 50):
        """
        Initialize chunker.

        Args:
            max_chunk_tokens: Maximum estimated tokens per chunk. A value of
                zero or less disables splitting.
            overlap_tokens: Estimated tokens shared by consecutive chunks of
                the same section. Capped at half of ``max_chunk_tokens`` so
                that splitting always advances by a useful amount.
        """
        self.max_chunk_tokens = max_chunk_tokens
        self.overlap_tokens = overlap_tokens

    def estimate_tokens(self, text: str) -> int:
        """
        Estimate token count using 4 characters per token.

        Args:
            text: Text to count tokens for

        Returns:
            Estimated token count, never less than 1
        """
        return max(1, len(text) // self.CHARS_PER_TOKEN)

    def chunk_document(self, document: dict) -> List[Chunk]:
        """
        Chunk a single parsed document.

        Args:
            document: Document dict from generate_ast.ts output

        Returns:
            List of Chunk objects, in document order
        """
        # Handle both old ("name") and new ("component") formats
        component = document.get("component") or document.get("name", "unknown")
        chunks: List[Chunk] = []
        # A missing or null "sections" entry simply yields no chunks.
        self._process_sections(component, document.get("sections"), chunks)
        return chunks

    def _process_sections(
        self,
        component: str,
        sections: list,
        chunks: List[Chunk],
        parent_path: str = ""
    ):
        """
        Recursively process sections and create chunks.

        Args:
            component: Component name
            sections: List of section dicts; entries that are not dicts
                are ignored
            chunks: List to accumulate chunks
            parent_path: Path to parent section for building hierarchy
        """
        if not isinstance(sections, (list, tuple)):
            return

        for section in sections:
            # Only dict entries carry a title/content; skip anything else
            # instead of failing on a missing .get().
            if not isinstance(section, dict):
                continue

            section_title = section.get("title", "")
            section_path = self._join_path(parent_path, section_title)

            content_parts, demo_files, images = self._collect_section(section)
            combined_content = " ".join(content_parts).strip()

            # Sections without text produce no chunk of their own
            if combined_content:
                for piece in self._split_content(combined_content):
                    chunks.append(Chunk(
                        component=component,
                        section_title=section_title,
                        section_path=section_path,
                        content=piece,
                        # Copies keep sibling chunks from sharing list objects
                        demo_files=list(demo_files),
                        images=list(images),
                        token_count=self.estimate_tokens(piece)
                    ))

            # Children (new format) take precedence over subsections (old format)
            nested = section.get("children") or section.get("subsections")
            if nested:
                self._process_sections(component, nested, chunks, section_path)

    @staticmethod
    def _join_path(parent_path: str, title) -> str:
        """Append a section title to its parent path using " > "."""
        label = str(title).strip() if title else ""
        if parent_path and label:
            return f"{parent_path} > {label}"
        # Untitled sections inherit the parent path; top-level ones use the title
        return label or parent_path

    @staticmethod
    def _collect_refs(entries, key: str) -> List[str]:
        """Return the truthy ``key`` values of the dicts in ``entries``."""
        if not isinstance(entries, list):
            return []
        return [
            value
            for entry in entries
            if isinstance(entry, dict) and (value := entry.get(key))
        ]

    def _collect_section(self, section: dict) -> tuple:
        """Gather a section's text parts, demo files and image sources.

        Returns:
            Tuple ``(content_parts, demo_files, images)``
        """
        content_parts: List[str] = []
        demo_files: List[str] = []
        images: List[str] = []

        def add_text(value) -> None:
            # Drop missing/blank text so it neither becomes the string
            # "None" nor leaves doubled spaces in the joined content.
            if value is None:
                return
            text = str(value)
            if text.strip():
                content_parts.append(text)

        content = section.get("content")
        if isinstance(content, str):
            add_text(content)
        elif isinstance(content, list):
            for item in content:
                if isinstance(item, str):
                    add_text(item)
                elif isinstance(item, dict):
                    item_type = item.get("type")
                    if item_type == "paragraph":
                        add_text(item.get("text"))
                    elif item_type == "demo":
                        if demo_file := item.get("file"):
                            demo_files.append(demo_file)
                    elif item_type == "image":
                        if image_path := item.get("src"):
                            images.append(image_path)

        # Section-level reference lists come after inline references
        demo_files.extend(self._collect_refs(section.get("demos"), "file"))
        images.extend(self._collect_refs(section.get("images"), "src"))

        return content_parts, demo_files, images

    def _split_content(self, text: str) -> List[str]:
        """Split text into pieces that fit the token budget.

        Text that already fits is returned unchanged as a single piece.
        Otherwise the text is cut into windows of at most
        ``max_chunk_tokens * CHARS_PER_TOKEN`` characters; cuts are moved
        back to whitespace when possible so words stay intact, and each
        window starts ``overlap_tokens`` worth of characters before the
        previous cut.
        """
        if self.max_chunk_tokens <= 0:
            return [text]
        if self.estimate_tokens(text) <= self.max_chunk_tokens:
            return [text]

        max_chars = self.max_chunk_tokens * self.CHARS_PER_TOKEN
        overlap_chars = min(
            max(self.overlap_tokens, 0) * self.CHARS_PER_TOKEN,
            max_chars // 2,
        )

        pieces: List[str] = []
        total = len(text)
        start = 0
        while start < total:
            end = min(start + max_chars, total)

            if end < total and not text[end].isspace():
                # The window ends mid-word: back up to the previous
                # whitespace, but never so far that the next window would
                # fail to advance.
                floor = start + overlap_chars
                cut = end - 1
                while cut > floor and not text[cut].isspace():
                    cut -= 1
                if cut > floor:
                    end = cut

            piece = text[start:end].strip()
            if piece:
                pieces.append(piece)
            if end >= total:
                break

            next_start = end - overlap_chars
            if overlap_chars and not text[next_start - 1].isspace():
                # The overlap would begin mid-word: move forward to the next
                # whitespace inside the overlap region, if there is one.
                probe = next_start
                while probe < end and not text[probe].isspace():
                    probe += 1
                if probe < end:
                    next_start = probe
            start = next_start

        return pieces

    def chunk_documents(self, documents: List[dict]) -> List[Chunk]:
        """
        Chunk multiple documents.

        Args:
            documents: List of document dicts

        Returns:
            Combined list of chunks from all documents, in input order
        """
        all_chunks: List[Chunk] = []
        for doc in documents:
            all_chunks.extend(self.chunk_document(doc))
        return all_chunks


# Notebook feedback: printed after the Chunk and TextChunker definitions
# above have executed, confirming the cell ran to completion.
print("TextChunker module loaded successfully")

: 

## Usage Example

In [None]:
# Sample input shaped like generate_ast.ts output. It uses the old format:
# the component is identified by "name" and nesting is under "subsections".
states_section = {
    "title": "States",
    "content": [
        {"type": "paragraph", "text": "Buttons can have multiple states like disabled, loading, etc."}
    ]
}

usage_section = {
    "title": "Usage",
    "description": "How to use the Button component",
    "content": [
        {"type": "paragraph", "text": "The Button component is a fundamental UI element."},
        {"type": "demo", "file": "button-demo.tsx", "description": "Basic button example"},
        {"type": "image", "src": "button.png"}
    ],
    "subsections": [states_section]
}

example_doc = {
    "name": "Button",
    "description": "A clickable button component",
    "sections": [usage_section]
}

# Run the chunker over the sample document
chunker = TextChunker(max_chunk_tokens=512, overlap_tokens=50)
chunks = chunker.chunk_document(example_doc)

# Report one summary block per chunk, each followed by a blank line
print(f"Created {len(chunks)} chunks:\n")
for number, chunk in enumerate(chunks, start=1):
    summary = [
        f"Chunk {number}:",
        f"  Component: {chunk.component}",
        f"  Section: {chunk.section_path}",
        f"  Tokens: {chunk.token_count}",
        f"  Demos: {chunk.demo_files}",
        f"  Content preview: {chunk.content[:80]}...",
    ]
    print("\n".join(summary))
    print()