# Primitive Chunker

Split raw MDX documents into fixed-size text chunks.

**Features:**
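- Fixed-size chunks of about 500 tokens each (tokens are estimated at 4 characters apiece, so each chunk is a 2,000-character slice)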
- Fast processing for prototyping
- Output format ready for embedding and database storage
- Baseline for comparison with semantic chunking


In [4]:
## Section 1: Chunk Data Structure and Chunker Class

from dataclasses import dataclass
from typing import List, Optional


@dataclass
class Chunk:
    """One slice of a document's text together with its metadata.

    Fields are declared in the order the generated ``__init__`` accepts them
    positionally, so the declaration order is part of the interface.
    """
    # Name of the component the text belongs to.
    component: str
    # Section title / path of the slice; PrimitiveChunker always stores None.
    section_title: Optional[str]
    section_path: Optional[str]
    # The chunk text itself.
    content: str
    # Demo file references taken from the source document.
    demo_files: List[str]
    # Image references; PrimitiveChunker always stores an empty list.
    images: List[str]
    # Estimated (not tokenizer-exact) number of tokens in ``content``.
    token_count: int


class PrimitiveChunker:
    """Fixed-size chunker driven by an estimated token budget.

    No tokenizer is involved: a token is approximated as ``CHARS_PER_TOKEN``
    characters, so each chunk is a plain character slice of at most
    ``max_chunk_tokens * CHARS_PER_TOKEN`` characters. Slices are taken at
    fixed offsets and therefore ignore word, sentence and markup boundaries.
    """

    # Single source of truth for the characters-per-token heuristic; used both
    # to size the slicing window and to estimate a chunk's token count.
    CHARS_PER_TOKEN = 4

    def __init__(self, max_chunk_tokens: int = 500):
        """Initialize chunker with target token size.

        Args:
            max_chunk_tokens: Upper bound on the estimated tokens per chunk.

        Raises:
            ValueError: If ``max_chunk_tokens`` is smaller than 1. A zero size
                would otherwise fail later inside ``range()`` and a negative
                size would silently produce no chunks at all.
        """
        if max_chunk_tokens < 1:
            raise ValueError(
                f"max_chunk_tokens must be >= 1, got {max_chunk_tokens!r}"
            )
        self.max_chunk_tokens = max_chunk_tokens

    @staticmethod
    def estimate_tokens(text: str) -> int:
        """Estimate tokens: approximately 1 token per 4 characters.

        The result is floored and never below 1, so even an empty string
        reports one token.
        """
        return max(1, len(text) // PrimitiveChunker.CHARS_PER_TOKEN)

    @staticmethod
    def _collect_content(sections) -> str:
        """Join the non-empty string ``content`` of every top-level section.

        Sections are separated by a blank line. A document with exactly one
        section yields that section's content unchanged.

        NOTE(review): nested ``children`` are not traversed because their
        schema is not visible here (they might repeat the parent's text) -
        confirm against the document producer before adding recursion.
        """
        parts = []
        for section in sections:
            if not isinstance(section, dict):
                continue
            content = section.get('content')
            if isinstance(content, str) and content:
                parts.append(content)
        return '\n\n'.join(parts)

    def chunk_document(self, doc: dict) -> List[Chunk]:
        """Split document into fixed-size chunks.

        Args:
            doc: Mapping with optional keys ``component`` (defaults to
                ``'unknown'``), ``demos`` (list of demo file names) and
                ``sections`` (list of dicts carrying a ``content`` string).

        Returns:
            Chunks in document order. The list is empty when the document has
            no sections or no text.
        """
        component = doc.get('component', 'unknown')
        demo_files = doc.get('demos') or []
        # `or []` also covers an explicit None / empty list, which previously
        # raised IndexError on the `[0]` lookup.
        text = self._collect_content(doc.get('sections') or [])
        window = self.max_chunk_tokens * self.CHARS_PER_TOKEN

        chunks = []
        for start in range(0, len(text), window):
            piece = text[start:start + window]
            chunks.append(Chunk(
                component=component,
                section_title=None,
                section_path=None,
                content=piece,
                # Copy so chunks never share (or mutate) the document's list.
                demo_files=list(demo_files),
                images=[],
                token_count=self.estimate_tokens(piece),
            ))
        return chunks

    def chunk_documents(self, docs: List[dict]) -> List[Chunk]:
        """Process multiple documents.

        Returns one flat list with each document's chunks, in input order.
        """
        return [chunk for doc in docs for chunk in self.chunk_document(doc)]

In [5]:
import json
from pathlib import Path

# Directories are resolved from the kernel's working directory, taking the
# project root to be two levels above it. Running the notebook from a
# different directory changes every path below.
PROJECT_ROOT = Path.cwd().parent.parent

RAW_DATA_DIR = PROJECT_ROOT / 'data/raw'
CHUNKS_DIR = PROJECT_ROOT / 'data/chunks'

# Load raw MDX files (recursive glob; sorted so the document order is stable)
mdx_files = sorted(RAW_DATA_DIR.glob('**/*.mdx'))
print(f"Found {len(mdx_files)} raw MDX files\n")

# Wrap each file in a document dict with a single section holding the entire
# file text.
# NOTE(review): `stem` drops the directory, so same-named files in different
# folders end up with the same component name - confirm that is acceptable.
docs = []
for mdx_file in mdx_files:
    # Decode explicitly as UTF-8 instead of the platform default encoding so
    # non-ASCII text loads the same way on every machine.
    # NOTE(review): assumes the raw files are UTF-8 - confirm.
    content = mdx_file.read_text(encoding='utf-8')
    doc = {
        'component': mdx_file.stem,
        'metadata': {},
        'demos': [],
        'sections': [{
            'title': mdx_file.stem,
            'content': content,
            'children': []
        }]
    }
    docs.append(doc)



Found 71 raw MDX files



In [7]:
# Generate chunks
chunker = PrimitiveChunker(max_chunk_tokens=500)
all_chunks = chunker.chunk_documents(docs)

print(f"Created {len(all_chunks)} chunks\n")
print("Token distribution:")
token_counts = [c.token_count for c in all_chunks]
if token_counts:
    print(f"  Min: {min(token_counts)}")
    print(f"  Max: {max(token_counts)}")
    print(f"  Avg: {sum(token_counts) / len(token_counts):.0f}")
    print(f"  Total: {sum(token_counts):,} tokens")
else:
    # min()/max() raise ValueError and the average divides by zero on an
    # empty list (e.g. no MDX files were found), which would also skip the
    # save step below.
    print("  (no chunks produced)")

# Convert to JSON-serializable format (one plain dict per chunk, keys in the
# same order as the Chunk fields)
chunks_data = [
    {
        'component': chunk.component,
        'section_title': chunk.section_title,
        'section_path': chunk.section_path,
        'content': chunk.content,
        'demo_files': chunk.demo_files,
        'images': chunk.images,
        'token_count': chunk.token_count
    }
    for chunk in all_chunks
]

CHUNKS_DIR.mkdir(parents=True, exist_ok=True)
output_file = CHUNKS_DIR / 'chunks_primitive.json'

# json.dump escapes non-ASCII characters by default, so the bytes written are
# unchanged; the explicit encoding just removes the platform dependency.
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(chunks_data, f, indent=2)

file_size_mb = output_file.stat().st_size / (1024 * 1024)
print(f"\nFilesize: {file_size_mb:.2f} MB")


Created 186 chunks

Token distribution:
  Min: 4
  Max: 500
  Avg: 403
  Total: 74,998 tokens

Filesize: 0.33 MB
