# Step 1: OCR Processing

This notebook performs OCR processing on the document using the backend selected in the OCR configuration (AWS Textract unless a different `backend` is configured).

**Inputs:**
- Document object from Step 0 (`.data/step0_setup/document.json`)
- OCR configuration (`config/ocr.yaml`)

**Outputs:**
- Document with OCR results (text and images extracted from PDF)
- Page-level text and image URIs

## 1. Load Previous Step Data

In [None]:
import os
import json
import time
import logging
from pathlib import Path

# Import IDP libraries
from idp_common.models import Document, Status
from idp_common import ocr

# Logging setup: the root logger stays at WARNING; per-logger levels are
# applied from the table below (only the OCR service logger is raised to INFO).
logging.basicConfig(level=logging.WARNING)
for logger_name, logger_level in (
    ('idp_common.ocr.service', logging.INFO),
    ('textractor', logging.WARNING),
):
    logging.getLogger(logger_name).setLevel(logger_level)

print("Libraries imported successfully")

In [None]:
# Load the artifacts written by the previous step (document + environment info)
# together with the OCR configuration from the local config/ directory.
import yaml

# Directory the previous step saved its outputs to
setup_data_dir = Path(".data/step0_setup")

# Rebuild the Document object from its JSON serialization
document_path = setup_data_dir / "document.json"
with open(document_path, 'r') as f:
    document = Document.from_json(f.read())

# Load configuration directly from config files; the top-level keys of every
# file are merged into a single CONFIG dictionary
config_dir = Path("config")
CONFIG = {}

# Configuration files to merge, in order (later files override earlier keys)
config_files = [
    "ocr.yaml",
]

for config_file in config_files:
    config_path = config_dir / config_file
    if not config_path.exists():
        print(f"Warning: {config_file} not found")
        continue

    # Read as UTF-8 explicitly instead of relying on the platform default
    # encoding, which is not UTF-8 on every system
    with open(config_path, 'r', encoding='utf-8') as f:
        file_config = yaml.safe_load(f)

    # safe_load returns None for an empty or comment-only file; treat that as
    # "no settings" rather than letting CONFIG.update(None) raise
    if file_config is None:
        file_config = {}
    if not isinstance(file_config, dict):
        raise TypeError(
            f"{config_path} must contain a YAML mapping at the top level, "
            f"got {type(file_config).__name__}"
        )

    CONFIG.update(file_config)
    print(f"Loaded {config_file}")

# Load environment info
env_path = setup_data_dir / "environment.json"
with open(env_path, 'r') as f:
    env_info = json.load(f)

# Set environment variables for the rest of this kernel session
os.environ['AWS_REGION'] = env_info['region']
os.environ['METRIC_NAMESPACE'] = 'IDP-Modular-Pipeline'

print(f"Loaded document: {document.id}")
print(f"Document status: {document.status.value}")
print(f"Region: {env_info['region']}")
print(f"Loaded configuration sections: {list(CONFIG.keys())}")

## 2. Configure OCR Service

In [None]:
# Create the OCR service from the region recorded in the environment info and
# the merged configuration dictionary
ocr_service = ocr.OcrService(
    region=env_info['region'],
    config=CONFIG  # Pass entire config dictionary
)

print("OCR service initialized")

# Display configuration for debugging.
# A bare `ocr:` key in the YAML loads as None, so fall back to an empty dict in
# that case as well as when the key is missing entirely.
ocr_config = CONFIG.get('ocr') or {}
print("\nOCR Configuration:")
# default=str keeps this debug dump from raising on YAML-native values
# (e.g. dates) that the JSON encoder cannot serialize
print(json.dumps(ocr_config, indent=2, default=str))

# Show backend being used ('textract' is reported when none is configured)
backend = ocr_config.get('backend', 'textract')
print(f"\nUsing backend: {backend}")

## 3. Process Document with OCR

In [None]:
# Run OCR on the document and report how long the call took
print("Processing document with OCR...")
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; differences of time.time() can be skewed by wall-clock adjustments
start_time = time.perf_counter()

# The service returns the updated document; rebind the name so later cells see it
document = ocr_service.process_document(document)

ocr_time = time.perf_counter() - start_time
print(f"OCR processing completed in {ocr_time:.2f} seconds")
print(f"Document status: {document.status.value}")
print(f"Number of pages processed: {document.num_pages}")

## 4. Display OCR Results

In [None]:
# List the URIs recorded on each page of the document
print("Processed pages:")
for page_id, page in document.pages.items():
    # One print per page; the trailing "" yields the blank separator line
    print(
        f"Page {page_id}:",
        f"  Image URI: {page.image_uri}",
        f"  Raw Text URI: {page.raw_text_uri}",
        f"  Parsed Text URI: {page.parsed_text_uri}",
        "",
        sep="\n",
    )

# Pretty-print the metering data attached to the document
print("OCR Metering:")
print(json.dumps(document.metering, indent=2))

## 5. Save Results for Next Step

In [None]:
# Output directory for this step (created on first run, reused afterwards)
data_dir = Path(".data/step1_ocr")
data_dir.mkdir(parents=True, exist_ok=True)

# Destination files for the three artifacts written by this step
document_path = data_dir / "document.json"
config_path = data_dir / "config.json"
env_path = data_dir / "environment.json"

# Updated document object, serialized with its own to_json()
with document_path.open('w') as f:
    f.write(document.to_json())

# Configuration and environment info are passed through unchanged as JSON
for target_path, payload in ((config_path, CONFIG), (env_path, env_info)):
    with target_path.open('w') as f:
        json.dump(payload, f, indent=2)

print(
    f"Saved document to: {document_path}",
    f"Saved configuration to: {config_path}",
    f"Saved environment info to: {env_path}",
    sep="\n",
)


## 6. Summary

In [None]:
# Recap of this step, built from values produced by the earlier cells
summary_lines = (
    "=== Step 1: OCR Processing Complete ===",
    f"✅ Document processed: {document.id}",
    f"✅ Pages extracted: {document.num_pages}",
    f"✅ Processing time: {ocr_time:.2f} seconds",
    f"✅ Backend used: {backend}",
    f"✅ Data saved to: .data/step1_ocr/",
    "\n📌 Next step: Run step2_classification.ipynb",
)
print(*summary_lines, sep="\n")