# Step 2: Page Classification with Boundary Detection

This notebook demonstrates how to classify document pages and detect page-level document boundaries using the `multimodalPageBoundaryClassification` method.

**Key Features:**
- Multimodal classification using both text and images
- Document boundary detection (start/continue)
- Section creation based on both document type changes AND boundary markers

**Inputs:**
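- Document object with OCR results from Step 1 (read from `data/ocr_output.json`, falling back to `.data/step1_ocr/document.json`, relative to the working directory)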
- Configuration with boundary detection enabled

**Outputs:**
- Document with classifications and boundary markers
- Sections respecting document boundaries

## 1. Load Libraries and Configure Logging

In [None]:
import os
import json
import yaml
import time
import logging
from pathlib import Path

# Import IDP libraries
from idp_common.classification.service import ClassificationService
from idp_common.models import Document, Status

# Logging setup: the root logger stays at WARNING to keep output quiet,
# while the two idp_common loggers below are raised to INFO.
logging.basicConfig(level=logging.WARNING)
for _logger_name in ('idp_common.classification', 'idp_common.bedrock.client'):
    logging.getLogger(_logger_name).setLevel(logging.INFO)

print("Libraries imported successfully")

## 2. Load OCR Output and Configuration

In [None]:
# Load the OCR output produced by Step 1.
# Two locations are tried, in order, relative to the current working directory.
examples_dir = Path.cwd()
ocr_candidates = [
    examples_dir / 'data' / 'ocr_output.json',
    examples_dir / '.data' / 'step1_ocr' / 'document.json',
]
ocr_data_path = next((path for path in ocr_candidates if path.exists()), None)
if ocr_data_path is None:
    # Report every location that was checked, not just the last one.
    searched = ', '.join(str(path) for path in ocr_candidates)
    raise FileNotFoundError(f"OCR output not found; looked in: {searched}")

# Explicit UTF-8 so the read does not depend on the platform's locale encoding.
with open(ocr_data_path, encoding='utf-8') as f:
    doc_data = json.load(f)

# Convert to a Document object.
# A JSON string payload goes through from_json; a dict carrying an 'id' key
# goes through from_dict; anything else is re-serialized and given to from_json.
if isinstance(doc_data, str):
    document = Document.from_json(doc_data)
elif 'id' in doc_data:
    document = Document.from_dict(doc_data)
else:
    document = Document.from_json(json.dumps(doc_data))

print(f"Loaded document: {document.id}")
print(f"Number of pages: {document.num_pages}")

# Load classification configuration with boundary detection.
# NOTE: this path is relative to the working directory, like examples_dir above.
config_path = Path('../../config_library/pattern-2/lending-package-sample/config_multimodal_page_boundary.yaml')
with open(config_path, encoding='utf-8') as f:
    # safe_load returns None for an empty file; fall back to an empty mapping
    # so the lookups below do not fail with AttributeError.
    config = yaml.safe_load(f) or {}

# Display configuration details
classification_config = config.get('classification') or {}
print("\nClassification Configuration:")
print(f"Method: {classification_config.get('classificationMethod')}")
print(f"Model: {classification_config.get('model', config.get('model_id'))}")
print(f"Temperature: {classification_config.get('temperature')}")

# Display available document classes (description truncated to 100 characters)
classes = config.get('classes') or []
print(f"\nAvailable Document Classes: {len(classes)}")
for doc_class in classes:
    # Tolerate class entries that omit a name or description.
    class_name = doc_class.get('name', '<unnamed>')
    class_description = doc_class.get('description') or ''
    print(f"- {class_name}: {class_description[:100]}...")

## 3. Initialize Classification Service

In [None]:
# Ensure AWS_REGION is present in the environment, defaulting to us-west-2;
# setdefault leaves an existing value untouched and returns the effective one.
aws_region = os.environ.setdefault('AWS_REGION', 'us-west-2')

# Build the classification service from the loaded configuration
service = ClassificationService(
    region=aws_region,
    config=config,
    backend='bedrock',  # Using Bedrock for multimodal classification
)

print("Classification service initialized")
print(f"Using method: {service.classification_method}")

## 4. Classify Document with Boundary Detection

In [None]:
# Classify the document and time the call.
print("\nClassifying document with boundary detection...")

# perf_counter is a monotonic clock intended for measuring intervals;
# time.time() can jump if the system clock is adjusted mid-run.
start_time = time.perf_counter()

# Keep the try body to the one call that can fail, so an error while
# printing the results is not misreported as a classification failure.
try:
    classified_document = service.classify_document(document)
except Exception as e:
    print(f"❌ Classification failed: {e}")
    raise

classification_time = time.perf_counter() - start_time

print(f"✅ Classification completed in {classification_time:.2f} seconds")
print(f"Document status: {classified_document.status.value}")

## 5. Display Results with Boundary Information

In [None]:
# Print one entry per detected section: id, type, page ids and page count
detected_sections = classified_document.sections
if not detected_sections:
    print("\n⚠️ No sections detected")
else:
    print(f"\n📑 Detected {len(detected_sections)} sections:")
    for sec in detected_sections:
        print(f"\nSection {sec.section_id}:")
        print(f"  Type: {sec.classification}")
        print(f"  Pages: {sec.page_ids}")
        print(f"  Number of pages: {len(sec.page_ids)}")


def _numeric_first(page_key):
    """Sort key: all-digit page ids in numeric order, every other id last."""
    return int(page_key) if page_key.isdigit() else float('inf')


# Print each page's classification together with its boundary marker
print("\n📄 Page-level classifications with boundaries:")
for page_id in sorted(classified_document.pages, key=_numeric_first):
    page = classified_document.pages[page_id]

    # The boundary marker is read from the page metadata; "unknown" when the
    # page has no (or empty) metadata or the key is missing.
    page_meta = getattr(page, 'metadata', None)
    boundary = page_meta.get('document_boundary', 'unknown') if page_meta else "unknown"

    # Pages without a classification attribute are not listed
    if not hasattr(page, 'classification'):
        continue
    print(f"Page {page_id}: {page.classification} [boundary: {boundary}]")

## 6. Analyze Boundary Detection Results

In [None]:
# Analyze boundary detection: count 'start' markers, list transitions between
# pages, and report whether markers and sections were both produced.
print("\n🔍 Boundary Detection Analysis:")

# Map every page id to its boundary marker from page metadata; "unknown" when
# the page has no (or empty) metadata or the key is missing.
boundary_info = {}
for page_id, page in classified_document.pages.items():
    page_meta = getattr(page, 'metadata', None)
    boundary_info[page_id] = (
        page_meta.get('document_boundary', 'unknown') if page_meta else 'unknown'
    )

boundary_starts = sum(1 for marker in boundary_info.values() if marker == 'start')
num_sections = len(classified_document.sections) if classified_document.sections else 0

print(f"Number of 'start' boundaries detected: {boundary_starts}")
print(f"Number of sections created: {num_sections}")

# Display boundary transitions, walking pages in numeric order
# (page ids that are not all digits sort last).
print("\n📊 Boundary Transitions:")
prev_type = None
for page_id in sorted(classified_document.pages.keys(), key=lambda x: int(x) if x.isdigit() else float('inf')):
    page = classified_document.pages[page_id]
    if not hasattr(page, 'classification'):
        continue

    curr_type = page.classification
    boundary = boundary_info.get(page_id, 'unknown')

    # A type change takes precedence; a 'start' marker on a page of the
    # same type as its predecessor is reported separately.
    if prev_type and prev_type != curr_type:
        print(f"  Type change at page {page_id}: {prev_type} → {curr_type} [boundary: {boundary}]")
    elif boundary == 'start':
        print(f"  Boundary marker at page {page_id}: {curr_type} [boundary: start]")

    prev_type = curr_type

# Report the outcome. The three cases are kept distinct so the message never
# claims that no markers were found when markers exist but sections do not.
if boundary_starts > 0 and num_sections > 0:
    print("\n✅ Boundary detection is working - sections align with boundary markers")
elif boundary_starts > 0:
    print("\n⚠️ Boundary markers detected but no sections were created")
else:
    print("\n⚠️ No boundary markers detected - sections based on document type changes only")

## 7. Save Results

In [None]:
# Persist the classified document and a JSON summary under
# data/classification_boundary_output (created if missing).
output_dir = examples_dir / 'data' / 'classification_boundary_output'
output_dir.mkdir(parents=True, exist_ok=True)

# Save classified document.
# Explicit UTF-8: the serialized document may contain non-ASCII text, which
# can fail to encode under a non-UTF-8 platform default.
output_path = output_dir / 'classified_document.json'
with open(output_path, 'w', encoding='utf-8') as f:
    f.write(classified_document.to_json())

print(f"\n💾 Saved classified document to: {output_path}")

# Build the classification summary from the results of the earlier cells
# (service, classification_time, boundary_starts, boundary_info).
result_sections = classified_document.sections or []
summary = {
    'document_id': classified_document.id,
    'classification_method': service.classification_method,
    'processing_time_seconds': classification_time,
    'num_sections': len(result_sections),
    'sections': [
        {
            'section_id': section.section_id,
            'classification': section.classification,
            'page_ids': section.page_ids,
            'num_pages': len(section.page_ids),
        }
        for section in result_sections
    ],
    'boundary_detection': {
        # NOTE(review): hard-coded; presumably mirrors the loaded config — confirm
        'enabled': True,
        'num_start_boundaries': boundary_starts,
        'boundary_info': boundary_info,
    },
    'page_classifications': {
        page_id: {
            'classification': getattr(page, 'classification', 'unknown'),
            'boundary': boundary_info.get(page_id, 'unknown'),
        }
        for page_id, page in classified_document.pages.items()
    },
}

summary_path = output_dir / 'classification_summary.json'
with open(summary_path, 'w', encoding='utf-8') as f:
    json.dump(summary, f, indent=2)

print(f"💾 Saved classification summary to: {summary_path}")

## 8. Summary

In [None]:
# Final recap of the run: document id, method, timing, section and boundary
# counts, and where the outputs were written.
banner = "=" * 60
if classified_document.sections:
    sections_identified = len(classified_document.sections)
else:
    sections_identified = 0

recap_lines = [
    "\n" + banner,
    "✅ BOUNDARY CLASSIFICATION COMPLETE",
    banner,
    f"Document ID: {classified_document.id}",
    f"Classification Method: {service.classification_method}",
    f"Processing Time: {classification_time:.2f} seconds",
    f"Sections Identified: {sections_identified}",
    f"Boundary Markers Found: {boundary_starts}",
    f"\n📁 Output saved to: {output_dir}",
    "\n📌 Next step: Run extraction on the classified sections",
]
for recap_line in recap_lines:
    print(recap_line)