# Guided OCR with Format Correction (Phase 2)

This notebook implements Phase 2 of our enhanced OCR process: applying the document structure and correcting specialized notation.

## Steps:
1. Load the structure data from Phase 1
2. Process each page with its corresponding image and OCR text
3. Apply heading structure based on structure data
4. Correct specialized notation (LaTeX, equations, Greek symbols, etc.)
5. Assemble the corrected pages into a final document

## Setup

First, let's import the necessary libraries and set up our environment.

In [None]:
import os
import json
import base64
from pathlib import Path
from typing import List, Dict, Optional, Any, Tuple
import anthropic
import re
from pdf2image import convert_from_path
from pydantic import BaseModel, Field
from IPython.display import display, Image, Markdown
from datetime import datetime
import mistralai  # Optional - if using Mistral for OCR
import pickle  # For caching intermediate results

# Credentials are read from the environment so they never live in the notebook.
# MISTRAL_OCR is only consulted when OCR has to be (re)run by this notebook.
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY")
MISTRAL_API_KEY = os.getenv("MISTRAL_OCR")

if ANTHROPIC_API_KEY in (None, ""):
    print("Warning: No Anthropic API key found. Set the ANTHROPIC_API_KEY environment variable.")
    # For a quick local run the key can be assigned directly instead:
    # ANTHROPIC_API_KEY = "your_api_key_here"

## Data Models

Let's define models for the document structure and processing.

In [None]:
class HeadingElement(BaseModel):
    """One heading of the document: its text, nesting level and page."""

    # Required: the heading's text.
    text: str = Field(
        ...,
        description="The heading text",
    )
    # Required: nesting depth of the heading.
    level: int = Field(
        ...,
        description="Heading level (1 for main headings, 2 for subheadings, etc.)",
    )
    # Required: page the heading appears on.
    page: int = Field(
        ...,
        description="Page number containing the heading (1-indexed)",
    )
    # Optional: where the heading sits on the page, when known.
    position: Optional[Dict[str, int]] = Field(
        default=None,
        description="Position coordinates on page (x, y, width, height)",
    )

class DocumentStructure(BaseModel):
    """Document outline: every heading plus page count and optional title."""

    # Defaults to an empty list when no headings were recorded.
    headings: List[HeadingElement] = Field(
        default_factory=list,
        description="All headings in the document",
    )
    # Required: number of pages in the source document.
    total_pages: int = Field(
        ...,
        description="Total number of pages in the document",
    )
    # Optional: the document's title.
    document_title: Optional[str] = Field(
        default=None,
        description="Title of the document",
    )

class PageContent(BaseModel):
    """A single page: its raw OCR text, its headings and the corrected text."""

    # Required: which page this record describes.
    page_number: int = Field(
        ...,
        description="Page number (1-indexed)",
    )
    # Required: the OCR output before any correction.
    raw_text: str = Field(
        ...,
        description="Raw OCR text from the page",
    )
    # Defaults to an empty list when the page has no headings.
    headings: List[HeadingElement] = Field(
        default_factory=list,
        description="Headings on this page",
    )
    # Optional: filled in once the page has been corrected.
    corrected_text: Optional[str] = Field(
        default=None,
        description="Corrected text with proper structure and formatting",
    )

class ProcessedDocument(BaseModel):
    """End result of processing: per-page content, assembled text and metadata."""

    # Optional: the document's title.
    title: Optional[str] = Field(
        default=None,
        description="Document title",
    )
    # Defaults to an empty list; one entry per processed page.
    pages: List[PageContent] = Field(
        default_factory=list,
        description="Content of each page",
    )
    # Optional: all corrected pages joined into one text.
    full_text: Optional[str] = Field(
        default=None,
        description="Complete assembled document text",
    )
    # Free-form extra information; defaults to an empty dict.
    metadata: Dict[str, Any] = Field(
        default_factory=dict,
        description="Additional metadata about the document",
    )

## PDF and Image Processing Functions

Functions to handle PDF conversion and image processing.

In [None]:
def convert_pdf_to_images(pdf_path, dpi=200, cache=True):
    """
    Convert PDF to a list of PIL Image objects, one per page.

    With ``cache`` enabled the rendered pages are pickled to
    ``cache/<pdf stem>_images_<dpi>.pkl`` (relative to the current working
    directory) and returned from there on later calls. The cache key is only
    the file stem and the DPI: two PDFs that share a stem share a cache entry,
    and a modified PDF keeps returning the old pages until the cache file is
    removed.

    Args:
        pdf_path: Path to the PDF file
        dpi: Resolution for the images (higher = better quality but larger size)
        cache: Whether to read from / write to the on-disk cache

    Returns:
        List of PIL Image objects
    """
    cache_file = Path("cache") / f"{Path(pdf_path).stem}_images_{dpi}.pkl"

    # Serve from cache when allowed and available
    if cache and cache_file.exists():
        print(f"Loading images from cache: {cache_file}")
        # SECURITY: pickle.load executes arbitrary code from the file. Only
        # load cache files that this notebook wrote itself.
        with open(cache_file, "rb") as f:
            return pickle.load(f)

    print(f"Converting PDF to images: {pdf_path}")
    images = convert_from_path(pdf_path, dpi=dpi)
    print(f"Converted {len(images)} pages")

    if cache:
        # Create the directory only when something is actually written, so
        # cache=False leaves the working directory untouched.
        cache_file.parent.mkdir(parents=True, exist_ok=True)
        print(f"Saving images to cache: {cache_file}")
        with open(cache_file, "wb") as f:
            pickle.dump(images, f)

    return images

def encode_image_to_base64(image):
    """
    Encode a PIL Image as a base64 JPEG string for API transmission.

    Images whose mode JPEG cannot store (for example RGBA with an alpha
    channel, or palette mode "P") are converted to RGB first; without that
    step ``image.save(..., format="JPEG")`` raises ``OSError``.

    Args:
        image: PIL Image object

    Returns:
        Base64 encoded string (UTF-8 text) of the JPEG bytes
    """
    import io

    # Modes the JPEG writer accepts as-is; anything else is flattened to RGB.
    jpeg_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")
    if getattr(image, "mode", "RGB") not in jpeg_modes:
        image = image.convert("RGB")

    buffered = io.BytesIO()
    image.save(buffered, format="JPEG")
    return base64.b64encode(buffered.getvalue()).decode("utf-8")

# Field names shared by both page parsers, in the order they appear in results.
_OCR_COORD_KEYS = ("top_left_x", "top_left_y", "bottom_right_x", "bottom_right_y")
_OCR_DIMENSION_KEYS = ("dpi", "height", "width")

# "index=X markdown='content'" followed by " images=" or end of string.
# `.*?` (not `.+?`) so a blank page with markdown='' is kept instead of being
# dropped, which would shift every later page against its image.
_OCR_PAGE_RE = re.compile(
    r'index=(\d+) markdown=[\'"](.*?)[\'"](?= images=|\Z)', re.DOTALL
)


def _ocr_int_field(name, text):
    """Return the integer that follows ``name=`` in *text*, or None if absent."""
    found = re.search(rf'{name}=(\d+)', text)
    return int(found.group(1)) if found else None


def _parse_ocr_page_string(page_data):
    """Parse one stringified page ("index=.. markdown='..' images=[..] ..."); None if it does not match."""
    match = _OCR_PAGE_RE.search(page_data)
    if not match:
        return None

    # NOTE(review): the markdown is taken verbatim from the stringified page,
    # so any escape sequences in that string (e.g. a literal backslash-n)
    # stay as they are - confirm whether downstream code expects them decoded.
    markdown_content = match.group(2)

    images = []
    img_match = re.search(r'images=\[(.*?)\]', page_data)
    if img_match and img_match.group(1):
        img_objects = re.finditer(
            r'OCRImageObject\(id=[\'"](.*?)[\'"](.*?)\)', img_match.group(1)
        )
        for img_obj in img_objects:
            coords_str = img_obj.group(2)
            images.append({
                "id": img_obj.group(1),
                "coordinates": {
                    key: _ocr_int_field(key, coords_str) for key in _OCR_COORD_KEYS
                },
            })

    dimensions = {}
    dim_match = re.search(r'dimensions=OCRPageDimensions\(([^)]+)\)', page_data)
    if dim_match:
        dim_str = dim_match.group(1)
        dimensions = {
            key: _ocr_int_field(key, dim_str) for key in _OCR_DIMENSION_KEYS
        }

    return {
        "index": int(match.group(1)),
        "markdown": markdown_content,
        "images": images,
        "dimensions": dimensions,
    }


def _parse_ocr_page_mapping(page_data):
    """Normalize a page given as a JSON object; None if index/markdown are missing."""
    # Assumes the object uses the keys "index", "markdown", "images" and
    # "dimensions", with image coordinates as flat keys - TODO confirm against
    # the file that produced it.
    index = page_data.get("index")
    markdown_content = page_data.get("markdown")
    if not isinstance(index, int) or not isinstance(markdown_content, str):
        return None

    images = [
        {
            "id": img.get("id"),
            "coordinates": {key: img.get(key) for key in _OCR_COORD_KEYS},
        }
        for img in page_data.get("images") or []
        if isinstance(img, dict)
    ]

    dims = page_data.get("dimensions")
    dimensions = (
        {key: dims.get(key) for key in _OCR_DIMENSION_KEYS}
        if isinstance(dims, dict)
        else {}
    )

    return {
        "index": index,
        "markdown": markdown_content,
        "images": images,
        "dimensions": dimensions,
    }


def load_mistral_ocr_json(ocr_json_path):
    """
    Load OCR results from Mistral's JSON format.

    The file is expected to hold an object with a "pages" list. Each entry may
    be either a stringified page of the form
    "index=X markdown='content' images=[...] dimensions=OCRPageDimensions(...)"
    or a JSON object carrying the same fields. Entries that cannot be parsed
    are skipped; a file without a usable "pages" list yields an empty result.

    Args:
        ocr_json_path: Path to the OCR JSON file

    Returns:
        List of dictionaries, sorted by page index, each with the keys
        "index", "markdown", "images" and "dimensions"
    """
    with open(ocr_json_path, "r", encoding="utf-8") as f:
        ocr_data = json.load(f)

    # Only a JSON object can carry a "pages" list; a bare string or list
    # would otherwise fail on the lookup below.
    raw_pages = ocr_data.get("pages") if isinstance(ocr_data, dict) else None

    pages = []
    for page_data in raw_pages or []:
        if isinstance(page_data, str):
            page = _parse_ocr_page_string(page_data)
        elif isinstance(page_data, dict):
            page = _parse_ocr_page_mapping(page_data)
        else:
            page = None
        if page is not None:
            pages.append(page)

    # Sort pages by index
    pages.sort(key=lambda x: x["index"])
    return pages

In [ ]:
def get_ocr_text(pdf_path, api_key=None, use_cached=True):
    """
    Get OCR text from a PDF, either from cache or from Mistral API.

    Lookup order when ``use_cached`` is true:
      1. ``<pdf>.ocr.json`` next to the PDF (per-page OCR data)
      2. ``<pdf>.md`` next to the PDF (plain text, split into simulated pages)
    Otherwise the Mistral OCR API is called and both files are written.

    Args:
        pdf_path: Path to the PDF file
        api_key: Mistral API key (if not using cached results)
        use_cached: Whether to use cached results if available

    Returns:
        Tuple of (list of page dictionaries, full combined text)

    Raises:
        ValueError: If no cached data is usable and no API key was given
    """
    # Check for cached OCR JSON file
    ocr_json_path = Path(pdf_path).with_suffix(".ocr.json")
    if use_cached and ocr_json_path.exists():
        print(f"Using cached OCR data from: {ocr_json_path}")
        pages = load_mistral_ocr_json(ocr_json_path)

        # Combine all pages into a single text
        full_text = "\n\n".join(page["markdown"] for page in pages)
        return pages, full_text

    # Check for cached markdown file as fallback
    markdown_path = Path(pdf_path).with_suffix(".md")
    if use_cached and markdown_path.exists():
        print(f"Using cached OCR text from: {markdown_path}")
        with open(markdown_path, "r", encoding="utf-8") as f:
            text = f.read()
        # No page information in plain text, so simulate a page structure
        return split_text_into_pages(text), text

    # If not using cache or no cache available, use Mistral API
    if not api_key:
        raise ValueError("No Mistral API key provided and no cached text available")

    print(f"Performing OCR on: {pdf_path}")

    # Use Mistral API to perform OCR
    # This is a simplified version - see the full implementation in Phase 1
    client = mistralai.Mistral(api_key=api_key)

    # Upload file; the context manager closes the PDF handle once the upload
    # call returns instead of leaking it.
    with open(pdf_path, "rb") as pdf_file:
        uploaded_pdf = client.files.upload(
            file={
                "file_name": Path(pdf_path).name,
                "content": pdf_file,
            },
            purpose="ocr"
        )

    # Get signed URL
    signed_url = client.files.get_signed_url(file_id=uploaded_pdf.id)

    # Process OCR
    ocr_response = client.ocr.process(
        model="mistral-ocr-latest",
        document={
            "type": "document_url",
            "document_url": signed_url.url,
        }
    )

    # Save the pages for future use as {"pages": ["index=.. markdown='..' ..", ...]},
    # the shape load_mistral_ocr_json reads. Dumping the response object itself
    # with default=str would write one big string that the loader cannot parse.
    # Assumes ocr_response.pages is a list of page objects whose str() has the
    # "index=.. markdown='..' images=[..] dimensions=.." form - confirm against
    # the installed mistralai version.
    with open(ocr_json_path, "w", encoding="utf-8") as f:
        json.dump({"pages": [str(page) for page in ocr_response.pages]}, f, indent=2)

    # Load the saved JSON (ensures consistency with cached loads)
    pages = load_mistral_ocr_json(ocr_json_path)
    full_text = "\n\n".join(page["markdown"] for page in pages)

    # Also save the markdown for compatibility
    with open(markdown_path, "w", encoding="utf-8") as f:
        f.write(full_text)

    return pages, full_text

def split_text_into_pages(full_text, chars_per_page=2000):
    """
    Split text into simulated pages.
    Used as a fallback when we only have the plain text without page information.

    Strategies, tried in order until one yields more than one piece:
      1. Explicit page markers such as "--- Page 3 ---" on their own line
      2. Top-level markdown headings ("# Title" starting with a capital letter)
      3. Fixed-size chunks of ``chars_per_page`` characters

    Args:
        full_text: Complete OCR text from the document
        chars_per_page: Chunk size for the fixed-size fallback (default 2000)

    Returns:
        List of page dictionaries with the keys "index" (0-based), "markdown",
        "images" (always empty) and "dimensions" (always empty). Empty input
        yields an empty list.

    Raises:
        ValueError: If ``chars_per_page`` is less than 1
    """
    if chars_per_page < 1:
        raise ValueError("chars_per_page must be at least 1")

    # Try to detect page breaks based on common patterns
    page_texts = re.split(r'\n\s*[-]{3,}\s*[Pp]age\s+\d+\s*[-]{3,}\s*\n', full_text)

    if len(page_texts) <= 1:
        # Split in front of each major heading. The heading itself sits in a
        # lookahead so it is kept intact; consuming it in the split pattern
        # would swallow the heading's first letter.
        page_texts = re.split(r'\n\s*(?=#\s+[A-Z])', full_text)

        # If still not good, just split into roughly equal parts
        if len(page_texts) <= 1:
            page_texts = [
                full_text[start:start + chars_per_page]
                for start in range(0, len(full_text), chars_per_page)
            ]

    # Convert to page dictionaries
    return [
        {
            "index": i,
            "markdown": text.strip(),
            "images": [],
            "dimensions": {},
        }
        for i, text in enumerate(page_texts)
    ]

## Structure Loading and Processing

Functions to load and process the structure data from Phase 1.

In [None]:
def load_document_structure(structure_path):
    """
    Read the Phase 1 structure JSON file and build a DocumentStructure from it.

    Args:
        structure_path: Path to the structure JSON file

    Returns:
        DocumentStructure object populated from the file's top-level fields
    """
    with open(structure_path, encoding="utf-8") as handle:
        fields = json.loads(handle.read())

    # The JSON object's keys are passed straight through as model fields.
    return DocumentStructure(**fields)

def get_page_headings(structure, page_number):
    """
    Collect the headings that belong to one page, in their original order.

    Args:
        structure: DocumentStructure object
        page_number: Page number (1-indexed)

    Returns:
        List of HeadingElement objects whose ``page`` equals ``page_number``
    """
    on_page = []
    for heading in structure.headings:
        if heading.page == page_number:
            on_page.append(heading)
    return on_page

## Correction Functions with Claude API

Functions to process pages with Claude, applying structure and fixing notation.

In [ ]:
# Inline ($...$ on one line) or display ($$...$$) math spans.
_MATH_SPAN_RE = re.compile(r'(\$\$.*?\$\$|\$[^$\n]+\$)', re.DOTALL)
# A citation-style superscript such as ^{1,2} or ^{3-5}.
_BARE_CITATION_RE = re.compile(r'\^\{[0-9,\s-]+\}')
# The same superscript already wrapped as its own math span: $^{1, 2}$.
_WRAPPED_CITATION_RE = re.compile(r'\$\^\{([0-9,\s-]+)\}\$')


def _format_heading_info(headings):
    """Render the page's headings as a bullet list with their markdown marker."""
    lines = [f"- '{h.text}': level {h.level} ({'#' * h.level})" for h in headings]
    return "\n".join(lines) or "No headings detected on this page."


def _normalize_citation_latex(text):
    """Wrap bare ^{N} citations in $...$ and strip whitespace inside $^{...}$."""
    # Only touch text OUTSIDE existing math spans: wrapping the exponent of
    # "$10^{6}$" would split the expression into "$10$^{6}$$". re.split with a
    # capturing group puts plain text at even indices and math at odd ones.
    # An unpaired "$" on a line (e.g. a price) can mis-pair spans; in that
    # case the affected stretch is simply left unchanged.
    pieces = _MATH_SPAN_RE.split(text)
    for i in range(0, len(pieces), 2):
        pieces[i] = _BARE_CITATION_RE.sub(lambda m: f"${m.group(0)}$", pieces[i])
    text = "".join(pieces)

    # Remove every space inside citation superscripts: "$^{ 1, 2 }$" -> "$^{1,2}$"
    return _WRAPPED_CITATION_RE.sub(
        lambda m: "$^{" + re.sub(r'\s+', '', m.group(1)) + "}$", text
    )


def correct_page_content(page_number, image, raw_text, headings, api_key, document_title=None, context=None):
    """
    Process a single page with Claude to apply structure and correct formatting.

    The page image and its raw OCR text are sent together in one request; the
    model's reply is then passed through a small regex clean-up that wraps
    bare citation superscripts in ``$...$`` and removes spaces inside them.

    Args:
        page_number: Page number (1-indexed)
        image: PIL Image object of the page
        raw_text: Raw OCR text from the page
        headings: List of HeadingElement objects for this page
        api_key: Anthropic API key
        document_title: Optional document title for context
        context: Optional surrounding context (text from adjacent pages)

    Returns:
        Corrected text with proper structure and formatting
    """
    client = anthropic.Anthropic(api_key=api_key)

    # Prepare the heading information string
    heading_info = _format_heading_info(headings)

    # Encode the image for API transmission
    image_base64 = encode_image_to_base64(image)

    title_clause = f' titled "{document_title}"' if document_title else ''

    # Prepare the instruction text (doubled braces are literal braces)
    instruction = f"""
    I'm sending you page {page_number} of a scientific document{title_clause}. 
    
    The following headings have been identified on this page:
    {heading_info}
    
    Here's the raw OCR text extracted from this page:
    ```
    {raw_text}
    ```
    
    Please:
    
    1. Apply the correct markdown heading levels to all headings identified above.
    
    2. Fix LaTeX formatting consistently throughout the document:
       - Make sure all citations have proper LaTeX syntax: e.g., "$^{{1,2}}$" not "^1,2"
       - Ensure all mathematical expressions are properly wrapped in $ symbols
       - Fix superscripts and subscripts with proper LaTeX syntax
       - Make sure there are no spaces inside the LaTeX delimiters for citations
       - Ensure all citation numbers are properly formatted with curly braces: "$^{{1}}$" not "$^1$"
    
    3. Fix specific LaTeX issues:
       - Fix expressions like "${{ }}^{{21}}$" to "$^{{21}}$"
       - Fix histone notations like "(H3-H4)2" to "$(H3-H4)_2$"
       - Fix spacing in math expressions by removing excess spaces
       - Fix nucleosome size notation like "~147 bp" to "~147 \\text{{bp}}"
       - Ensure all units are in \\text{{}} format: "\\text{{kb}}", "\\text{{bp}}", etc.
    
    4. Fix formatting of gene names, protein names, and other scientific notation:
       - Italicize gene names when appropriate
       - Make sure protein complexes are formatted correctly (e.g., "Mcm2-7")
    
    5. Preserve paragraph structure and ensure text flows properly without any strange formatting artifacts.
    
    Return ONLY the corrected markdown text without any explanations or additional commentary.
    """

    # Add context if provided
    if context:
        instruction += f"""
        
        For context, here's text from adjacent pages (do not include this in your response):
        ```
        {context}
        ```
        """

    # Prepare message content
    content = [
        {"type": "text", "text": instruction},
        {
            "type": "image",
            "source": {
                "type": "base64",
                "media_type": "image/jpeg",
                "data": image_base64
            }
        }
    ]

    # Call the API
    response = client.messages.create(
        model="claude-3-opus-20240229",  # Using the highest quality model for better text processing
        max_tokens=4000,
        temperature=0,  # Use 0 for consistent, deterministic output
        system="You are an expert at scientific document formatting, especially LaTeX notation in markdown. Your task is to ensure all LaTeX formatting is consistent and correct throughout the document. This includes ensuring all citations are properly formatted with $ delimiters and curly braces, all mathematical expressions are properly wrapped in $ symbols, and all superscripts and subscripts use proper LaTeX syntax. Pay special attention to citation numbers, ensuring they're formatted as $^{N}$ consistently. Only return the corrected text without explanations.",
        messages=[
            {
                "role": "user",
                "content": content
            }
        ]
    )

    # Post-process the response to ensure consistent LaTeX formatting
    corrected_text = response.content[0].text.strip()
    return _normalize_citation_latex(corrected_text)

## Document Assembly and Processing

Functions to process the entire document and assemble the corrected pages.

In [ ]:
def _adjacent_page_context(ocr_pages, i):
    """Return up to 500 chars of the previous/next page's text, or None if page i has no neighbours."""
    parts = []
    if i > 0:
        parts.append(f"Previous page: {ocr_pages[i-1]['markdown'][:500]}...")
    if i < len(ocr_pages) - 1:
        parts.append(f"Next page: {ocr_pages[i+1]['markdown'][:500]}...")
    return "\n\n".join(parts) or None


def _describe_page_images(page_images):
    """Return a text listing of the page's image IDs and positions, or "" when there are none."""
    if not page_images:
        return ""
    lines = ["\n\nThis page contains the following images:\n"]
    for img in page_images:
        lines.append(f"- Image ID: {img['id']}\n")
        if "coordinates" in img:
            coords = img["coordinates"]
            lines.append(f"  Located at position: ({coords.get('top_left_x')}, {coords.get('top_left_y')}) to ({coords.get('bottom_right_x')}, {coords.get('bottom_right_y')})\n")
    return "".join(lines)


def process_document_with_structure(pdf_path, structure_path, output_path=None, max_pages=None):
    """
    Process a PDF document using the structure from Phase 1, correcting formatting and notation.

    Pages are paired with OCR text by position (first image with first OCR
    page, and so on). Each corrected page is also written to
    ``cache/<pdf stem>_page_<n>.md``; the assembled markdown goes to
    ``output_path`` and the full document data to the same path with a
    ``.json`` suffix. Reads the module-level ANTHROPIC_API_KEY and
    MISTRAL_API_KEY.

    Args:
        pdf_path: Path to the PDF file
        structure_path: Path to the structure JSON file from Phase 1
        output_path: Path to save the output markdown (default: same as pdf with _structured.md)
        max_pages: Maximum number of pages to process (None = all)

    Returns:
        ProcessedDocument object
    """
    # Set default output path if not provided
    if output_path is None:
        output_path = Path(pdf_path).with_stem(f"{Path(pdf_path).stem}_structured").with_suffix(".md")

    # Load the document structure
    structure = load_document_structure(structure_path)

    # Convert PDF to images
    images = convert_pdf_to_images(pdf_path)

    # Limit the number of pages if specified
    if max_pages is not None:
        images = images[:max_pages]
        print(f"Limited to first {max_pages} pages")

    # Get OCR text using direct JSON loading (the combined text is not needed here)
    ocr_pages, _ = get_ocr_text(pdf_path, api_key=MISTRAL_API_KEY)

    # Apply the same limit to the OCR pages, so a deliberate max_pages does
    # not trigger the mismatch warning below.
    if max_pages is not None:
        ocr_pages = ocr_pages[:max_pages]

    # Make sure we have the same number of OCR pages and images
    if len(ocr_pages) != len(images):
        print(f"Warning: Number of OCR pages ({len(ocr_pages)}) doesn't match number of images ({len(images)})")
        # Adjust to the smaller number
        min_pages = min(len(ocr_pages), len(images))
        ocr_pages = ocr_pages[:min_pages]
        images = images[:min_pages]
        print(f"Proceeding with {min_pages} pages")

    # Directory for per-page intermediate results; created once, not per page
    cache_dir = Path("cache")
    cache_dir.mkdir(exist_ok=True)
    pdf_stem = Path(pdf_path).stem

    # Process each page
    processed_pages = []
    corrected_texts = []

    for i, (image, page_data) in enumerate(zip(images, ocr_pages)):
        page_number = i + 1  # 1-indexed
        print(f"Processing page {page_number}/{len(images)}...")

        # Get headings for this page
        page_headings = get_page_headings(structure, page_number)

        # Image references are appended to the OCR text sent for correction
        image_info = _describe_page_images(page_data.get("images", []))

        # Correct the page content
        corrected_text = correct_page_content(
            page_number,
            image,
            page_data["markdown"] + image_info,
            page_headings,
            ANTHROPIC_API_KEY,
            document_title=structure.document_title,
            context=_adjacent_page_context(ocr_pages, i)
        )

        # Create PageContent object (headings are copied field by field)
        page_content = PageContent(
            page_number=page_number,
            raw_text=page_data["markdown"],
            headings=[HeadingElement(
                text=h.text,
                level=h.level,
                page=h.page,
                position=h.position
            ) for h in page_headings],
            corrected_text=corrected_text
        )

        processed_pages.append(page_content)
        corrected_texts.append(corrected_text)

        # Save intermediate results so finished pages survive a later failure
        with open(cache_dir / f"{pdf_stem}_page_{page_number}.md", "w", encoding="utf-8") as f:
            f.write(corrected_text)

    # Assemble the final document
    assembled_text = "\n\n".join(corrected_texts)

    # Create ProcessedDocument object
    document = ProcessedDocument(
        title=structure.document_title,
        pages=processed_pages,
        full_text=assembled_text,
        metadata={
            "source_file": str(pdf_path),
            "structure_file": str(structure_path),
            "processed_date": datetime.now().isoformat(),
            "total_pages": len(processed_pages)
        }
    )

    # Save the assembled document
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(assembled_text)

    print(f"\nAssembled document saved to: {output_path}")

    # Save the full processed document data
    json_output_path = Path(output_path).with_suffix(".json")
    with open(json_output_path, "w", encoding="utf-8") as f:
        f.write(document.model_dump_json(indent=2))

    print(f"Full document data saved to: {json_output_path}")

    return document

## Process a Document

Now let's use these functions to process a sample document.

In [None]:
# Inputs: the PDF to process and the structure file Phase 1 produced for it
pdf_path = "R35_MIRA.pdf"  # Replace with your PDF file
structure_path = f"{Path(pdf_path).stem}_structure.json"  # From Phase 1

# Check both prerequisites up front and report each missing one
has_api_key = bool(ANTHROPIC_API_KEY)
has_structure_file = Path(structure_path).exists()

if not has_api_key:
    print("Cannot process document: No API key provided.")
if not has_structure_file:
    print(f"Cannot process document: Structure file not found at {structure_path}")

if has_api_key and has_structure_file:
    try:
        # Only the first 3 pages are processed here for testing;
        # drop max_pages to run the entire document.
        document = process_document_with_structure(
            pdf_path=pdf_path,
            structure_path=structure_path,
            max_pages=3  # Comment this out to process all pages
        )

        # Short report on what was produced
        print(f"\nProcessed Document Summary:")
        print(f"- Document title: {document.title}")
        print(f"- Total pages processed: {len(document.pages)}")
        print(f"- Output size: {len(document.full_text)} characters")

        # Render the beginning of the assembled text
        preview_length = min(500, len(document.full_text))
        print(f"\nPreview of the first {preview_length} characters:")
        display(Markdown(document.full_text[:preview_length] + "..."))

    except Exception as e:
        # Notebook boundary: report the failure with its traceback, keep the kernel usable
        print(f"Error processing document: {e}")
        import traceback
        traceback.print_exc()

## Compare Original and Corrected Text

Let's visualize the improvements by comparing original and corrected text.

In [ ]:
def _comparison_html(raw_text, corrected_text):
    """Build the two-column HTML snippet, escaping both texts for safe display."""
    from html import escape

    # OCR text and LaTeX routinely contain "<", ">" and "&"; unescaped they
    # would be parsed as markup and garble (or inject into) the output.
    return f"""
    <div style="display: flex;">
        <div style="flex: 1; padding: 10px; border: 1px solid #ccc;">
            <h3>Original Text</h3>
            <pre>{escape(raw_text)}</pre>
        </div>
        <div style="flex: 1; padding: 10px; border: 1px solid #ccc;">
            <h3>Corrected Text</h3>
            <pre>{escape(corrected_text)}</pre>
        </div>
    </div>
    """


def compare_original_and_corrected(page_number, document):
    """
    Display a side-by-side comparison of original and corrected text for a page.

    Prints a message and returns without displaying anything when the page is
    not present. A missing (None/null) raw or corrected text is shown as
    empty text.

    Args:
        page_number: Page number to compare (1-indexed)
        document: ProcessedDocument object or dictionary loaded from JSON
    """
    # Find the page - handle both object and dictionary formats
    if hasattr(document, 'pages'):
        # Case when document is an object
        page = next((p for p in document.pages if p.page_number == page_number), None)
    else:
        # Case when document is a dictionary (loaded from JSON)
        pages = document.get('pages', [])
        page = next((p for p in pages if p.get('page_number') == page_number), None)

    if page is None:
        print(f"Page {page_number} not found in the processed document.")
        return

    # Get text content - handle both object and dictionary formats
    if hasattr(page, 'raw_text'):
        raw_text = page.raw_text
        corrected_text = page.corrected_text
    else:
        raw_text = page.get('raw_text', '')
        corrected_text = page.get('corrected_text', '')

    # corrected_text is optional on the model and may be null in the JSON
    raw_text = raw_text or ""
    corrected_text = corrected_text or ""

    # Display comparison
    print(f"Page {page_number} - Original vs. Corrected Text:\n")

    from IPython.display import HTML
    display(HTML(_comparison_html(raw_text, corrected_text)))

    # Also show the rendered markdown for corrected text
    print("\nRendered Corrected Text:")
    display(Markdown(corrected_text))

# Look for the JSON written by the processing step and, when it is there,
# show the comparison for page 1.
json_output_path = Path(pdf_path).with_stem(f"{Path(pdf_path).stem}_structured").with_suffix(".json")
if not json_output_path.exists():
    print(f"Processed document not found at {json_output_path}")
else:
    with open(json_output_path, "r", encoding="utf-8") as f:
        doc_data = json.loads(f.read())

    # Nothing to compare unless at least one page was saved
    has_pages = 'pages' in doc_data and len(doc_data['pages']) > 0
    if has_pages:
        compare_original_and_corrected(1, doc_data)

## Conclusion

In this Phase 2 notebook, we've built a system that:

1. Uses the structure information from Phase 1
2. Processes each page with both its image and OCR text
3. Applies the correct heading structure
4. Corrects specialized notation (LaTeX, equations, Greek symbols)
5. Assembles the pages into a well-structured final document

The approach uses Claude's multimodal capabilities to understand both the visual appearance and text content of each page, resulting in much better structure and formatting than traditional OCR alone.