# Semantic Chunker

Split parsed MDX documents into semantic chunks based on document structure.

**Features:**
- Semantic chunking by document sections
- Hierarchical structure with parent-child relationships
- Demo code and image references
- Token statistics for each section

## 1. Data Structures & Chunker Class

Defines the `Chunk` data structure and the `TextChunker` class for semantic chunking based on document sections.

In [None]:
from dataclasses import dataclass
from typing import List, Optional


@dataclass
class Chunk:
    """Represents a semantic chunk of content.

    ``TextChunker._process_sections`` creates one instance for every section
    whose gathered text is non-empty.
    """
    # Component name taken from the source document.
    component: str
    # Title of the section this chunk was built from.
    section_title: Optional[str]
    # Breadcrumb of section titles from the root to this section, joined by " > ".
    section_path: Optional[str]
    # Section text: the individual text parts, trimmed and joined by single spaces.
    content: str
    # Demo file references collected from the section.
    demo_files: List[str]
    # Image sources collected from the section.
    images: List[str]
    # Approximate token count of ``content`` (see ``TextChunker.estimate_tokens``).
    token_count: int


class TextChunker:
    """Semantic text chunking based on document sections.

    Walks a (possibly nested) list of section dicts and emits one ``Chunk``
    for every section that contains non-empty text.

    NOTE: ``max_chunk_tokens`` and ``overlap_tokens`` are stored on the
    instance but are not consulted anywhere in this class; sections are never
    split or overlapped, whatever their size.
    """

    def __init__(self, max_chunk_tokens: int = 500, overlap_tokens: int = 50):
        """Initialize chunker with target token size and overlap."""
        self.max_chunk_tokens = max_chunk_tokens
        self.overlap_tokens = overlap_tokens

    @staticmethod
    def estimate_tokens(text: str) -> int:
        """Estimate tokens: approximately 1 token per 4 characters.

        Always returns at least 1, even for an empty string.
        """
        return max(1, len(text) // 4)

    @staticmethod
    def _join_path(parent_path, section_title) -> str:
        """Join the parent breadcrumb and a section title with ``" > "``.

        Missing or blank parts are left out, so no dangling separator is
        produced. Only surrounding whitespace is trimmed: ``str.strip("> ")``
        would treat its argument as a character set and eat a ``>`` that
        belongs to the title itself (e.g. ``Generic<T>``).
        """
        parts = [
            str(part)
            for part in (parent_path, section_title)
            if part is not None and str(part).strip()
        ]
        return " > ".join(parts).strip()

    @staticmethod
    def _collect_refs(entries, key: str) -> List[str]:
        """Return references from a section-level ``demos``/``images`` list.

        Each entry may be a plain string or a dict carrying the reference
        under ``key``; anything else is ignored.
        """
        refs: List[str] = []
        if not isinstance(entries, list):
            return refs
        for entry in entries:
            if isinstance(entry, str):
                refs.append(entry)
            elif isinstance(entry, dict) and (ref := entry.get(key)):
                refs.append(ref)
        return refs

    @classmethod
    def _gather_section(cls, section: dict):
        """Collect ``(texts, demo_files, images)`` for a single section.

        ``texts`` holds the trimmed, non-empty text parts in document order.
        Inline references found inside ``content`` come before the ones listed
        under the section-level ``demos`` / ``images`` keys.
        """
        raw_texts = []
        demo_files: List[str] = []
        images: List[str] = []

        content = section.get("content")
        if isinstance(content, str):
            raw_texts.append(content)
        elif isinstance(content, list):
            for item in content:
                if isinstance(item, str):
                    raw_texts.append(item)
                elif isinstance(item, dict):
                    kind = item.get("type")
                    if kind == "paragraph":
                        text = item.get("text")
                        # A null text must not end up as the literal "None".
                        if text is not None:
                            raw_texts.append(text)
                    elif kind == "demo":
                        if demo_file := item.get("file"):
                            demo_files.append(demo_file)
                    elif kind == "image":
                        if image_path := item.get("src"):
                            images.append(image_path)

        demo_files.extend(cls._collect_refs(section.get("demos"), "file"))
        images.extend(cls._collect_refs(section.get("images"), "src"))

        texts = [text for text in (str(raw).strip() for raw in raw_texts) if text]
        return texts, demo_files, images

    def _process_sections(
        self,
        component: str,
        sections: list,
        chunks: List["Chunk"],
        parent_path: str = ""
    ) -> None:
        """Recursively process sections and extract content.

        Args:
            component: Component name stored on every emitted chunk.
            sections: Section entries; anything that is not a dict is skipped.
            chunks: Output list, extended in place with the created chunks.
            parent_path: Breadcrumb of the enclosing section ("" at the root).
        """
        for section in sections:
            # Only dict entries carry the keys read below; skipping the rest
            # avoids an AttributeError on None or other stray values.
            if not isinstance(section, dict):
                continue

            section_title = section.get("title", "")
            section_path = self._join_path(parent_path, section_title)

            texts, demo_files, images = self._gather_section(section)
            combined_content = " ".join(texts)

            # Sections without any text produce no chunk, but their children
            # are still visited below.
            if combined_content:
                chunks.append(Chunk(
                    component=component,
                    section_title=section_title,
                    section_path=section_path,
                    content=combined_content,
                    demo_files=demo_files,
                    images=images,
                    token_count=self.estimate_tokens(combined_content)
                ))

            # "subsections" is only consulted when "children" is missing or empty.
            if children := section.get("children"):
                self._process_sections(component, children, chunks, section_path)
            elif subsections := section.get("subsections"):
                self._process_sections(component, subsections, chunks, section_path)

    def chunk_documents(self, documents: List[dict]) -> List["Chunk"]:
        """Chunk multiple documents and return all chunks.

        The component name is read from ``component``, then ``name``, and
        falls back to ``"unknown"``. A missing or null ``sections`` entry is
        treated as an empty list.
        """
        all_chunks: List["Chunk"] = []
        for doc in documents:
            component = doc.get("component") or doc.get("name") or "unknown"
            sections = doc.get("sections") or []
            self._process_sections(component, sections, all_chunks)
        return all_chunks


# Confirm that the cell defining Chunk and TextChunker ran to completion.
print("TextChunker class loaded successfully")

✓ TextChunker class loaded successfully


## 2. Load Parsed Documents

Loads all processed JSON files from `data/processed/` into memory.

In [None]:
import json
from pathlib import Path

# Directory layout, resolved two levels above the current working directory.
PROJECT_ROOT = Path.cwd().parent.parent
PROCESSED_DATA_DIR = PROJECT_ROOT / 'data/processed'
CHUNKS_DIR = PROJECT_ROOT / 'data/chunks'

# Collect the processed JSON files in sorted order so runs are reproducible.
json_files = sorted(PROCESSED_DATA_DIR.glob('*.json'))
print(f"Found JSON files: {len(json_files)}\n")

# Parse each file into a Python object, keeping the file order.
docs = []
for json_file in json_files:
    raw_text = json_file.read_text(encoding='utf-8')
    docs.append(json.loads(raw_text))

print(f"{len(docs)} documents loaded")
print(f"Directory: {PROCESSED_DATA_DIR}")

Found JSON files: 71

✓ 71 documents loaded
  Directory: /home/sinan/GitHub/reservix/ai-assistant/etl/data/processed


## 3. Create Chunks with Parent-Child Relationships

Processes all documents and creates semantic chunks with:
- Unique chunk IDs
- Parent ID for hierarchical relationships
- Hierarchy levels (0=Root, 1=Subsection, etc.)
- Demo code for each section, resolved from the document's top-level `demos` (token counts are not computed in this cell)

In [None]:
# Build one flat chunk dict per non-empty section of every loaded document.
# No TextChunker instance is involved: the flat section format already
# carries heading, level, path and parent, so it is handled inline here.
all_chunks = []
chunk_id_counter = 1  # Chunk ids are unique across all documents and start at 1.

for doc in docs:
    component = doc.get("component", "unknown")
    sections = doc.get("sections") or []

    # Map demo file name -> demo source code from the document's top-level demos.
    demo_map = {}
    for demo in doc.get("demos") or []:
        if isinstance(demo, dict) and (demo_file := demo.get("file")) and (code := demo.get("code")):
            demo_map[demo_file] = code

    # Heading -> chunk id, reset per document, used to resolve parent_id.
    # A later section with the same heading overwrites the earlier entry.
    heading_to_id = {}

    for section in sections:
        if not isinstance(section, dict):
            continue

        heading = section.get("heading", "")
        level = section.get("level", 0)
        # "or" also covers keys that are present but null in the JSON.
        content = section.get("content") or ""
        path = section.get("path") or []
        parent = section.get("parent")
        demo_files = section.get("demos") or []

        # Resolve parent_id from the parent heading name.
        parent_id = heading_to_id.get(parent) if parent else None

        if not content.strip():
            # An empty section gets no chunk. Record its heading anyway,
            # pointing at its own parent's chunk: children then attach to the
            # nearest ancestor that has a chunk instead of picking up a stale
            # id left behind by an earlier section with the same heading.
            heading_to_id[heading] = parent_id
            continue

        # Breadcrumb from the path array, falling back to the bare heading.
        section_path = " > ".join(path) if path else heading

        # Keep only the demos whose code is known for this document.
        demo_code = {name: demo_map[name] for name in demo_files if name in demo_map}

        all_chunks.append({
            'id': chunk_id_counter,
            'component': component,
            'section_path': section_path,
            'heading': heading,
            'content': content,
            'demo_code': demo_code,
            'parent_id': parent_id,
            'level': level
        })

        # Later sections naming this heading as parent resolve to this chunk.
        heading_to_id[heading] = chunk_id_counter
        chunk_id_counter += 1

print(f"{len(all_chunks)} chunks created\n")

✓ 606 chunks created

Hierarchy levels:
  Level 0:     70 chunks
  Level 1:    282 chunks
  Level 2:    254 chunks


## 4. Save Chunks to JSON

Saves all chunks with metadata to `data/chunks/chunks.json`.

In [None]:
# Write every chunk to data/chunks/chunks.json, creating the folder if needed.
output_file = CHUNKS_DIR / 'chunks.json'
CHUNKS_DIR.mkdir(parents=True, exist_ok=True)

with output_file.open('w', encoding='utf-8') as handle:
    # ensure_ascii=False keeps non-ASCII text readable in the output file.
    json.dump(all_chunks, handle, indent=2, ensure_ascii=False)

# Report where the file went and how large it is.
size_in_bytes = output_file.stat().st_size
file_size_mb = size_in_bytes / (1024 * 1024)
print(f"Chunks saved to: {output_file}")
print(f"File size: {file_size_mb:.2f} MB")

Chunks saved to: /home/sinan/GitHub/reservix/ai-assistant/etl/data/chunks/chunks.json
File size: 0.45 MB
Total chunks: 606
