# Primitive Chunker

Split raw MDX documents into fixed-size text chunks.

**Features:**
- Fixed-size chunks of about 500 tokens (approximated as 4 characters per token, i.e. 2,000 characters)
- Fast processing for prototyping
- Output format ready for embedding and database storage
- Baseline for comparison with semantic chunking


## Section 1: Chunk Data Structure and Chunker Class

Defines the `Chunk` data structure and the `PrimitiveChunker` class for fixed-size token-based chunking. The chunker splits documents into uniform chunks with unique IDs and demo code references.

In [None]:
from dataclasses import dataclass
from typing import List, Optional, Dict


@dataclass
class Chunk:
    """One slice of document text together with its storage metadata.

    Instances are plain records: the fields are set once by the chunker and
    later copied field-by-field into the JSON export.
    """
    # Identifier handed out by the chunker's running counter.
    id: int
    # Name of the component the source document describes.
    component: str
    # Location of the chunk inside the document.
    section_path: str
    # The chunk's text.
    content: str
    # Demo source code keyed by demo file name.
    demo_code: Dict[str, str]
    # ID of a parent chunk, or None when the chunk has no parent.
    parent_id: Optional[int]
    # Nesting depth; presumably 0 means top level -- confirm against consumers.
    level: int


class PrimitiveChunker:
    """Fixed-size chunker that slices document text by estimated token count.

    Token counts are approximated as one token per four characters, so each
    chunk holds at most ``max_chunk_tokens * 4`` characters. Chunk IDs come
    from a counter stored on the instance that keeps increasing across calls,
    so IDs are unique among all chunks produced by one chunker.
    """

    # Characters assumed per token by the size heuristic.
    _CHARS_PER_TOKEN = 4

    def __init__(self, max_chunk_tokens: int = 500):
        """Initialize chunker with target token size.

        Args:
            max_chunk_tokens: Upper bound on the estimated tokens per chunk.

        Raises:
            ValueError: If ``max_chunk_tokens`` is less than 1. A zero size
                would otherwise fail later inside ``range()`` and a negative
                size would silently produce no chunks.
        """
        if max_chunk_tokens < 1:
            raise ValueError(
                f"max_chunk_tokens must be >= 1, got {max_chunk_tokens}"
            )
        self.max_chunk_tokens = max_chunk_tokens
        self.chunk_id_counter = 0

    @staticmethod
    def estimate_tokens(text: str) -> int:
        """Estimate tokens: approximately 1 token per 4 characters.

        The result is never below 1, even for an empty string.
        """
        return max(1, len(text) // PrimitiveChunker._CHARS_PER_TOKEN)

    def chunk_document(self, doc: dict) -> List[Chunk]:
        """Split document into fixed-size chunks.

        Only the ``content`` of the first entry in ``doc['sections']`` is
        chunked. A document with no sections or no content yields an empty
        list.

        Args:
            doc: Document dict; reads the ``component``, ``sections`` and
                ``demos`` keys, all of which are optional.

        Returns:
            The chunks in document order. Each chunk carries its own copy of
            the demo-code mapping.
        """
        component = doc.get('component', 'unknown')

        # A missing, None or empty 'sections' list all mean "no content".
        sections = doc.get('sections') or [{}]
        all_content = sections[0].get('content') or ''

        # Build demo_map from top-level demos, skipping malformed entries.
        demo_map = {}
        for demo in doc.get('demos') or []:
            if not isinstance(demo, dict):
                continue
            demo_file = demo.get('file')
            code = demo.get('code')
            if demo_file and code:
                demo_map[demo_file] = code

        chunk_size = self.max_chunk_tokens * self._CHARS_PER_TOKEN

        chunks = []
        for start in range(0, len(all_content), chunk_size):
            chunks.append(Chunk(
                id=self.chunk_id_counter,
                component=component,
                section_path=component,
                content=all_content[start:start + chunk_size],
                # Copy so that mutating one chunk's demo_code cannot leak
                # into its sibling chunks.
                demo_code=dict(demo_map),
                parent_id=None,
                level=0
            ))
            self.chunk_id_counter += 1

        return chunks

    def chunk_documents(self, docs: List[dict]) -> List[Chunk]:
        """Process multiple documents.

        Returns:
            The chunks of every document, concatenated in input order.
        """
        all_chunks = []
        for doc in docs:
            all_chunks.extend(self.chunk_document(doc))
        return all_chunks

## Section 2: Load Raw MDX Documents

Loads all raw MDX documentation files from `data/raw/` and converts them into a standardized document format for chunking. Each document includes the component name (the file stem), the full file content as a single section, and a `demos` list for associated demo code, which is left empty at this stage.

In [5]:
import json
from pathlib import Path

# Project root is taken to be two directories above the current working
# directory, so the notebook must be started from its own folder.
PROJECT_ROOT = Path.cwd().parent.parent

RAW_DATA_DIR = PROJECT_ROOT / 'data/raw'
CHUNKS_DIR = PROJECT_ROOT / 'data/chunks'

# Load raw MDX files (sorted so document and chunk order is reproducible)
mdx_files = sorted(RAW_DATA_DIR.glob('**/*.mdx'))
print(f"Found {len(mdx_files)} raw MDX files\n")

docs = []
for mdx_file in mdx_files:
    # Decode explicitly as UTF-8 instead of the platform default encoding,
    # matching the UTF-8 encoding used when the chunks are written out.
    # NOTE(review): assumes the MDX sources are UTF-8 -- confirm.
    content = mdx_file.read_text(encoding='utf-8')
    # Wrap the whole file as a single section; no demos are attached here.
    doc = {
        'component': mdx_file.stem,
        'metadata': {},
        'demos': [],
        'sections': [{
            'title': mdx_file.stem,
            'content': content,
            'children': []
        }]
    }
    docs.append(doc)



Found 71 raw MDX files



## Section 3: Generate Chunks and Save to JSON

Processes all documents using the `PrimitiveChunker` to split content into fixed-size chunks. Each chunk receives a unique ID, a component reference, and the document's demo code mapping (empty here, because the loaded documents carry no demos). Results are saved to `chunks_primitive.json` for embedding and database import.

In [6]:
# Split every loaded document into fixed-size chunks
chunker = PrimitiveChunker(max_chunk_tokens=500)
all_chunks = chunker.chunk_documents(docs)

print(f"Created {len(all_chunks)} chunks\n")

# Flatten each Chunk into a plain dict so the json module can serialize it;
# the key order below is the order written to the file.
chunks_data = [
    {
        field: getattr(chunk, field)
        for field in (
            'id',
            'component',
            'section_path',
            'content',
            'demo_code',
            'parent_id',
            'level',
        )
    }
    for chunk in all_chunks
]

# Make sure the target directory exists before writing
output_file = CHUNKS_DIR / 'chunks_primitive.json'
output_file.parent.mkdir(parents=True, exist_ok=True)

# ensure_ascii=False keeps non-ASCII characters readable in the UTF-8 file
with output_file.open('w', encoding='utf-8') as f:
    json.dump(chunks_data, f, indent=2, ensure_ascii=False)

# Report where the file went and how large it is
file_size_mb = output_file.stat().st_size / 2 ** 20
print(f"Chunks saved to: {output_file}")
print(f"File size: {file_size_mb:.2f} MB")
print(f"Total chunks: {len(all_chunks):,}")


Created 186 chunks

Chunks saved to: /home/sinan/GitHub/reservix/ai-assistant/etl/data/chunks/chunks_primitive.json
File size: 0.33 MB
Total chunks: 186
