# Handwritten OCR Playground

Interactive notebook for testing and debugging the handwritten notes OCR pipeline.

## 1. Setup & Imports

In [None]:
import sys
import os
from pathlib import Path
from PIL import Image
import matplotlib.pyplot as plt
from dotenv import load_dotenv

# The project root is taken to be the parent of the current working directory
project_root = Path.cwd().parent

# Pull settings (e.g. HF_TOKEN) from the project-level .env file
load_dotenv(project_root / ".env")

# Put the project root first on sys.path so project modules can be imported
sys.path.insert(0, str(project_root))

print("Imports loaded successfully!")

# Report only whether a real token is configured; the token itself is never printed
hf_token_configured = os.getenv('HF_TOKEN') not in (None, '', 'your_token_here')
print(f"HF_TOKEN set: {'Yes' if hf_token_configured else 'No - add your token to .env'}")

## 2. Test with a Single Image

Update the `image_path` to point to your handwritten note image.

In [None]:
# Image to run through the OCR models
image_path = Path.cwd().parent / "data" / "input" / "IMG_4737.jpeg"  # Update this path

# Confirm the file is there; otherwise list the candidate images in the input folder
if not image_path.exists():
    print(f"✗ Image not found: {image_path}")
    print("\nAvailable images in data/input:")
    input_dir = Path.cwd().parent / "data" / "input"
    image_suffixes = ('.png', '.jpg', '.jpeg', '.bmp', '.tiff')
    for candidate in input_dir.glob("*"):
        # Compare case-insensitively so e.g. ".JPG" is listed too
        if candidate.suffix.lower() in image_suffixes:
            print(f"  - {candidate.name}")
else:
    print(f"✓ Image found: {image_path.name}")

In [None]:
# Preview the input image; nothing is shown when the file is missing
if image_path.exists():
    img = Image.open(image_path)

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.imshow(img)
    ax.set_title(f"Input Image: {image_path.name}")
    ax.axis('off')
    plt.show()

    # Basic properties as reported by PIL
    print(f"Image size: {img.size}")
    print(f"Image mode: {img.mode}")

## 🆕 Try GOT-OCR2_0 - Full Page OCR Model

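GOT-OCR2_0 from StepFun AI is designed for full-page document OCR, so it should handle a whole notebook page much better than TrOCR, which is built to recognize one line of text at a time. The cells below use the Hugging Face Transformers port, `stepfun-ai/GOT-OCR-2.0-hf`.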

In [None]:
# Load the GOT-OCR-2.0-hf model (HuggingFace Transformers version)
# This version properly supports MPS/CPU unlike the original
from transformers import AutoProcessor, AutoModelForImageTextToText
import torch

# Pick the compute device: MPS (Apple Silicon) first, then CUDA (NVIDIA), then CPU
device = (
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)

print(f"Using device: {device}")
print("Loading GOT-OCR-2.0-hf model...")

# bfloat16 on an accelerator, full float32 precision on CPU
got_dtype = torch.float32 if device == "cpu" else torch.bfloat16

model_got = AutoModelForImageTextToText.from_pretrained(
    "stepfun-ai/GOT-OCR-2.0-hf",
    torch_dtype=got_dtype,
    device_map=device,
)
processor_got = AutoProcessor.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf")

print("GOT-OCR-2.0-hf model loaded!")

In [None]:
# Transcribe the full page with GOT-OCR-2.0-hf
print(f"Processing image: {image_path}")
print("-" * 50)

# The processor is given the image path as a string and returns model-ready tensors
inputs_got = processor_got(str(image_path), return_tensors="pt").to(device)

# Greedy decoding, halting at the end-of-turn marker or after 4096 new tokens
generated_ids_got = model_got.generate(
    **inputs_got,
    do_sample=False,
    tokenizer=processor_got.tokenizer,
    stop_strings="<|im_end|>",
    max_new_tokens=4096,
)

# Drop the prompt tokens so only the newly generated text is decoded
prompt_length_got = inputs_got["input_ids"].shape[1]
result_got = processor_got.decode(
    generated_ids_got[0, prompt_length_got:],
    skip_special_tokens=True,
)

banner = "=" * 50
print("\n" + banner)
print("GOT-OCR-2.0 FULL PAGE RESULT:")
print(banner)
print(result_got)
print(banner)

## 🔬 Try olmOCR-2 - Allen AI's Document OCR

olmOCR-2 from Allen AI is another powerful document OCR model, fine-tuned from Qwen2.5-VL. It's designed for high-quality document understanding and text extraction.

In [None]:
# Load olmOCR-2 model
from transformers import AutoProcessor, AutoModelForImageTextToText
import torch

print("Loading olmOCR-2 model...")

OLMOCR_MODEL = "allenai/olmOCR-2-7B-1025-FP8"

# Options for loading the weights; `device` comes from the GOT-OCR loading cell
olm_load_kwargs = {
    "torch_dtype": torch.bfloat16,
    "device_map": device,
}

# The processor and the model are both pulled from the same repository
processor_olm = AutoProcessor.from_pretrained(OLMOCR_MODEL)
model_olm = AutoModelForImageTextToText.from_pretrained(OLMOCR_MODEL, **olm_load_kwargs)

print(f"✓ olmOCR-2 (FP8) loaded on {device}!")

In [None]:
# Transcribe the page with olmOCR-2
print(f"Processing image: {image_path}")
print("-" * 50)

# A single user turn: the local image followed by the extraction instruction
image_part = {"type": "image", "url": str(image_path)}
instruction_part = {
    "type": "text",
    "text": "Extract and return all the text from this handwritten document.",
}
messages = [{"role": "user", "content": [image_part, instruction_part]}]

# Let the model's own processor render the chat template and tokenize it
inputs_olm = processor_olm.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(device)

# Inference only, so gradient tracking is switched off
with torch.no_grad():
    generated_olm = model_olm.generate(**inputs_olm, max_new_tokens=2048)

# Skip the prompt tokens and decode only what the model produced
prompt_length_olm = inputs_olm["input_ids"].shape[-1]
result_olm = processor_olm.decode(
    generated_olm[0][prompt_length_olm:],
    skip_special_tokens=True,
)

rule = "=" * 50
print("\n" + rule)
print("olmOCR-2 RESULT:")
print(rule)
print(result_olm)
print(rule)

## 📄 Save OCR Results

Save the transcribed text to organized .txt and .md files in the output directory.

In [None]:
from datetime import datetime

def save_ocr_result(
    text: str,
    source_image: Path,
    output_dir: Path,
    formats: "list[str] | None" = None,
    model_name: str = "GOT-OCR-2.0-hf",
):
    """
    Save OCR result to organized text and/or markdown files.

    Output files are named after the source image's stem (``<stem>.txt`` /
    ``<stem>.md``) and are overwritten if they already exist.

    Args:
        text: The transcribed text
        source_image: Path (or path string) of the source image; only its
            name and stem are used, the file itself is not read
        output_dir: Directory to save output files (created if missing)
        formats: List of formats to save ("txt", "md", or both).
            Defaults to both when omitted or None.
        model_name: Name of the OCR model recorded in the markdown header.
            Defaults to "GOT-OCR-2.0-hf".

    Returns:
        Dictionary mapping each saved format ("txt" / "md") to its file path
    """
    # Resolve the default here rather than in the signature so that no mutable
    # list object is shared between calls.
    if formats is None:
        formats = ["txt", "md"]

    source_image = Path(source_image)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Create base filename from source image
    base_name = source_image.stem

    saved_files = {}

    if "txt" in formats:
        # Save as plain text
        txt_path = output_dir / f"{base_name}.txt"
        with open(txt_path, "w", encoding="utf-8") as f:
            f.write(text)
        saved_files["txt"] = txt_path
        print(f"✓ Saved: {txt_path}")

    if "md" in formats:
        # Save as markdown with metadata
        md_path = output_dir / f"{base_name}.md"
        processed_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        md_lines = [
            f"# OCR Transcription: {source_image.name}",
            "",
            # The two trailing spaces are markdown hard line breaks
            f"**Source:** `{source_image.name}`  ",
            f"**Processed:** {processed_at}  ",
            f"**Model:** {model_name}",
            "",
            "---",
            "",
            "## Transcribed Text",
            "",
            text,
            "",
            "---",
            "",
            "*Generated by Handwritten OCR Pipeline*",
        ]
        md_content = "\n".join(md_lines) + "\n"
        with open(md_path, "w", encoding="utf-8") as f:
            f.write(md_content)
        saved_files["md"] = md_path
        print(f"✓ Saved: {md_path}")

    return saved_files

# OCR results are written to data/output under the parent of the working directory
output_dir = Path.cwd().parent.joinpath("data", "output")
print(f"Output directory: {output_dir}")

In [None]:
# Write the GOT-OCR transcription to disk in both plain-text and markdown form
saved = save_ocr_result(result_got, image_path, output_dir, formats=["txt", "md"])

print(f"\n📁 Files saved to: {output_dir}")