# Chunk Markdown Documents

This notebook splits Markdown files into smaller chunks for RAG processing and saves them to `data/chunks`.

In [2]:
from pathlib import Path
import json
from langchain_text_splitters import RecursiveCharacterTextSplitter

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Define directories
processed_dir = Path("../data/processed/health_education")
chunks_dir = Path("../data/chunks")
chunks_dir.mkdir(parents=True, exist_ok=True)

In [4]:
# Configure text splitter
# Chunk size: 1000 characters (good balance for RAG)
# Chunk overlap: 200 characters (maintains context between chunks)
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
    separators=["\n\n## ", "\n\n### ", "\n\n", "\n", " ", ""]
)

In [5]:
def extract_metadata_from_path(file_path, processed_dir):
    """
    Extracts metadata from the file path structure.
    
    Args:
        file_path: Path to the markdown file
        processed_dir: Base processed directory
    
    Returns:
        dict: Metadata dictionary
    """
    relative_path = file_path.relative_to(processed_dir)
    parts = relative_path.parts
    
    metadata = {
        "source_file": str(file_path),
        "filename": file_path.name,
        "file_stem": file_path.stem
    }
    
    # Extract type (condition/treatment)
    if len(parts) > 0:
        metadata["type"] = parts[0]
    
    # Extract condition
    if len(parts) > 1:
        metadata["condition"] = parts[1]
    
    return metadata

In [6]:
def chunk_markdown_file(md_path, processed_dir, text_splitter):
    """
    Chunks a Markdown file and returns a list of chunk dictionaries.
    
    Args:
        md_path: Path to the Markdown file
        processed_dir: Base processed directory
        text_splitter: Text splitter instance
    
    Returns:
        list: List of chunk dictionaries
    """
    try:
        print(f"Processing: {md_path}")
        
        # Read markdown content
        with open(md_path, 'r', encoding='utf-8') as f:
            content = f.read()
        
        # Extract metadata
        metadata = extract_metadata_from_path(md_path, processed_dir)
        
        # Split into chunks
        chunks = text_splitter.split_text(content)
        
        # Create chunk dictionaries with metadata
        chunk_list = []
        for i, chunk_text in enumerate(chunks):
            chunk_dict = {
                "chunk_id": f"{metadata['file_stem']}_chunk_{i:04d}",
                "chunk_index": i,
                "text": chunk_text,
                "metadata": metadata.copy()
            }
            chunk_list.append(chunk_dict)
        
        print(f"  ✓ Created {len(chunk_list)} chunks")
        print(f"  Total characters: {sum(len(c['text']) for c in chunk_list):,}\n")
        
        return chunk_list
    except Exception as e:
        print(f"  ✗ Error processing {md_path}: {e}\n")
        import traceback
        traceback.print_exc()
        return []

In [7]:
# Find all Markdown files in the processed directory
md_files = list(processed_dir.rglob("*.md"))
print(f"Found {len(md_files)} Markdown files to chunk\n")

Found 4 Markdown files to chunk



In [8]:
# Process all Markdown files and create chunks
all_chunks = []
results = []

for md_path in md_files:
    chunks = chunk_markdown_file(md_path, processed_dir, text_splitter)
    
    if chunks:
        all_chunks.extend(chunks)
        results.append({
            "file": str(md_path),
            "chunks_count": len(chunks),
            "success": True
        })
    else:
        results.append({
            "file": str(md_path),
            "chunks_count": 0,
            "success": False
        })

Processing: ../data/processed/health_education/conditions/hypertension/hypertension_medlineplus_overview.md
  ✓ Created 41 chunks
  Total characters: 26,393

Processing: ../data/processed/health_education/conditions/diabetes/diabetes_medlineplus_overview.md
  ✓ Created 46 chunks
  Total characters: 28,944

Processing: ../data/processed/health_education/treatments/hypertension/hypertension_medlineplus_treatment.md
  ✓ Created 21 chunks
  Total characters: 13,058

Processing: ../data/processed/health_education/treatments/diabetes/diabetes_medlineplus_treatment.md
  ✓ Created 23 chunks
  Total characters: 14,037



In [9]:
# Save all chunks to a single JSON file
chunks_file = chunks_dir / "all_chunks.json"

with open(chunks_file, 'w', encoding='utf-8') as f:
    json.dump(all_chunks, f, indent=2, ensure_ascii=False)

print(f"✓ Saved {len(all_chunks)} chunks to: {chunks_file}")
print(f"  File size: {chunks_file.stat().st_size:,} bytes")

✓ Saved 131 chunks to: ../data/chunks/all_chunks.json
  File size: 138,453 bytes


In [10]:
# Also save chunks grouped by file for easier access
chunks_by_file = {}
for chunk in all_chunks:
    source_file = chunk['metadata']['source_file']
    if source_file not in chunks_by_file:
        chunks_by_file[source_file] = []
    chunks_by_file[source_file].append(chunk)

chunks_by_file_path = chunks_dir / "chunks_by_file.json"
with open(chunks_by_file_path, 'w', encoding='utf-8') as f:
    json.dump(chunks_by_file, f, indent=2, ensure_ascii=False)

print(f"\n✓ Saved chunks grouped by file to: {chunks_by_file_path}")
print(f"  Files processed: {len(chunks_by_file)}")


✓ Saved chunks grouped by file to: ../data/chunks/chunks_by_file.json
  Files processed: 4


In [11]:
# Summary of chunking process
print("\n" + "=" * 60)
print("CHUNKING SUMMARY")
print("=" * 60)

successful = sum(1 for r in results if r["success"])
total = len(results)
total_chunks = sum(r["chunks_count"] for r in results)

print(f"\nTotal files: {total}")
print(f"Successful: {successful}")
print(f"Failed: {total - successful}")
print(f"Total chunks created: {total_chunks}")

print("\n" + "-" * 60)
print("Files processed:")
print("-" * 60)

for result in results:
    status = "✓" if result["success"] else "✗"
    filename = Path(result["file"]).name
    print(f"{status} {filename}: {result['chunks_count']} chunks")


CHUNKING SUMMARY

Total files: 4
Successful: 4
Failed: 0
Total chunks created: 131

------------------------------------------------------------
Files processed:
------------------------------------------------------------
✓ hypertension_medlineplus_overview.md: 41 chunks
✓ diabetes_medlineplus_overview.md: 46 chunks
✓ hypertension_medlineplus_treatment.md: 21 chunks
✓ diabetes_medlineplus_treatment.md: 23 chunks


In [13]:
# Display sample chunk for inspection
if all_chunks:
    print("\n" + "=" * 60)
    print("SAMPLE CHUNK")
    print("=" * 60)
    sample = all_chunks[0]
    print(f"\nChunk ID: {sample['chunk_id']}")
    print(f"Source: {sample['metadata']['filename']}")
    print(f"Type: {sample['metadata'].get('type', 'N/A')}")
    print(f"Condition: {sample['metadata'].get('condition', 'N/A')}")
    print(f"\nText preview (first 200 chars):")
    print("-" * 60)
    print(sample['text'][:200] + "..." if len(sample['text']) > 200 else sample['text'])
    print(f"\nFull chunk length: {len(sample['text'])} characters")


SAMPLE CHUNK

Chunk ID: hypertension_medlineplus_overview_chunk_0000
Source: hypertension_medlineplus_overview.md
Type: conditions
Condition: hypertension

Text preview (first 200 chars):
------------------------------------------------------------
# High Blood Pressure

Also called: Benign essential hypertension, Essential hypertension, HBP, HTN, Hypertension

On this page

### Basics

- [Summary](#summary)
- [Start Here](#cat_51)
- [Symptoms](...

Full chunk length: 815 characters
