# Hybrid Invoice Extraction: Open-Source + Claude API

**Strategy:**
1. Try open-source Qwen2-VL first (free)
2. Use Claude API only for failures (minimal cost)
3. Merge the results from both sources, aiming for 100% accuracy

**Expected cost:** ~$0.20 (vs. ~$0.50 if every invoice were sent to the Claude API)

## 0. Setting your Name and Email

In [None]:
# WRITE YOUR NAME_SURNAME HERE, AS WELL AS THE EMAIL YOU USED TO LOG IN TO CELONIS
MY_NAME = 'SCHUMANN'
MY_EMAIL = 'schumann.marvin@outlook.com'

# Your Claude API key
# SECURITY: the key is read from the environment instead of being hard-coded,
# because anything written into the notebook is leaked to everyone who can read
# the file or its history. Export CLAUDE_API_KEY (or ANTHROPIC_API_KEY) before
# starting the kernel. A key that was ever committed in this cell must be
# treated as compromised and revoked.
import os

CLAUDE_API_KEY = os.environ.get("CLAUDE_API_KEY") or os.environ.get("ANTHROPIC_API_KEY", "")
if not CLAUDE_API_KEY:
    print("WARNING: no Claude API key found - set CLAUDE_API_KEY or ANTHROPIC_API_KEY before running Phase 2.")

## 1. Install Dependencies

In [None]:
# NOTE: lines starting with "!" are IPython shell escapes, so this cell only
# runs inside a notebook kernel, not as a plain Python script.
# pycelonis is installed with Celonis' package index added as an extra index.
!pip install --extra-index-url=https://pypi.celonis.cloud/ pycelonis -q
!pip install nbformat -q

# Install poppler for PDF processing (required on Colab)
# The apt-get commands run only when the google.colab module is already loaded
# in this interpreter.
import sys
if 'google.colab' in sys.modules:
    !apt-get update -qq
    !apt-get install -y -qq poppler-utils
    print("‚úÖ Poppler installed for PDF support!")

# Install packages
# The confirmation below is printed unconditionally; the exit status of the
# pip commands is not checked.
!pip install pillow pdf2image pandas tqdm torch transformers accelerate bitsandbytes qwen-vl-utils anthropic -q
print("‚úÖ All dependencies installed!")

## 2. Setup and Configuration

In [None]:
import os
import re
import json
import base64
from pathlib import Path
from typing import Dict, List
from io import BytesIO
import pandas as pd
from PIL import Image
from tqdm import tqdm
from pdf2image import convert_from_path
import torch

# `sys` is imported here explicitly so this cell does not depend on the
# dependency-install cell having been executed first.
import sys

# Resolve the invoice directory.
# An INVOICE_DIR environment variable, when set, takes precedence so the
# notebook can run on machines other than the original author's. Without it,
# the previous auto-detection for Colab or local is used unchanged.
_invoice_dir_override = os.environ.get("INVOICE_DIR")

if _invoice_dir_override:
    INVOICE_DIR = Path(_invoice_dir_override).expanduser()
elif 'google.colab' in sys.modules:
    if Path("/content/orbit_challenge/Invoices").exists():
        INVOICE_DIR = Path("/content/orbit_challenge/Invoices")
    else:
        INVOICE_DIR = Path("/content/Invoices")
else:
    INVOICE_DIR = Path("/Users/marvinschumann/orbit_challenge/Invoices")

# Field names every extraction result must contain; also the column order of
# the final DataFrame.
REQUIRED_FIELDS = [
    "vendor_name",
    "vendor_address",
    "payment_terms",
    "invoice_value",
    "company_code",
    "po_reference",
    "invoice_id",
]

print(f"üìÇ Invoice directory: {INVOICE_DIR}")
print(f"‚úÖ Setup complete!")

## 3. Helper Functions

In [None]:
def _normalize_page_image(img, max_size):
    """Return an RGB copy of *img*, shrunk to fit *max_size* when it is larger."""
    img = img.convert("RGB")
    if img.width > max_size[0] or img.height > max_size[1]:
        # thumbnail() resizes in place and keeps the aspect ratio
        img.thumbnail(max_size, Image.Resampling.LANCZOS)
    return img


def load_invoice_pages(invoice_dir: Path, max_size=(1600, 1600)) -> List[Dict]:
    """Load every invoice in *invoice_dir* as a list of page images.

    PDF files are rendered at 200 dpi, one entry per page. PNG/JPG/JPEG files
    yield a single entry. Files with any other suffix are skipped. A file that
    fails to load is reported on stdout and skipped; pages already collected
    from it are kept.

    Args:
        invoice_dir: Directory containing the invoice files (not recursed).
        max_size: Maximum (width, height); larger images are downscaled.

    Returns:
        A list of dicts with keys ``invoice_id`` (file name without suffix),
        ``page_index`` (1-based), ``image`` (RGB PIL image) and ``file_path``.
    """
    pages = []
    files = sorted(p for p in invoice_dir.iterdir() if p.is_file())

    for file_path in tqdm(files, desc="üìÇ Loading invoices"):
        suffix = file_path.suffix.lower()
        invoice_id = file_path.stem

        try:
            if suffix == ".pdf":
                rendered = convert_from_path(str(file_path), dpi=200, fmt="png")
                for idx, img in enumerate(rendered, start=1):
                    pages.append({
                        "invoice_id": invoice_id,
                        "page_index": idx,
                        "image": _normalize_page_image(img, max_size),
                        "file_path": str(file_path),
                    })
            elif suffix in {".png", ".jpg", ".jpeg"}:
                # Context manager closes the underlying file handle as soon as
                # the pixel data has been copied into the RGB image.
                with Image.open(file_path) as source:
                    img = _normalize_page_image(source, max_size)
                pages.append({
                    "invoice_id": invoice_id,
                    "page_index": 1,
                    "image": img,
                    "file_path": str(file_path),
                })
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole batch
            print(f"‚ùå Error loading {file_path.name}: {e}")

    return pages

def sanitize_po_reference(po_value: str) -> str:
    """Return the digits of *po_value*, left-padded with zeros to 11 characters.

    Non-string values (e.g. a PO number parsed from JSON as an int) are
    converted with ``str()`` first instead of raising ``TypeError``. ``None``
    or a value without any digit yields ``"00000000000"``. Values with more
    than 11 digits are returned unchanged in length (no truncation).
    """
    if po_value is None:
        return "00000000000"
    digits = re.sub(r"\D", "", str(po_value))
    return digits.zfill(11) if digits else "00000000000"

def is_extraction_complete(data: Dict) -> bool:
    """Return True when every critical field of *data* holds a usable value.

    The critical fields are ``vendor_name``, ``invoice_value`` and
    ``po_reference``. A field counts as missing when it is absent, ``None``,
    blank after stripping, or equal to the all-zero PO placeholder
    ``"00000000000"``. Non-string values (model JSON may contain numbers or
    null) are converted with ``str()`` rather than raising ``AttributeError``.
    """
    critical_fields = ("vendor_name", "invoice_value", "po_reference")
    for field in critical_fields:
        raw = data.get(field)
        value = "" if raw is None else str(raw).strip()
        if not value or value == "00000000000":
            return False
    return True

# Confirmation message shown once the helper definitions of this cell have run
print("‚úÖ Helper functions loaded!")

## 4. Phase 1: Try Open-Source First (Qwen2-VL)

In [None]:
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info

print("="*70)
print("üöÄ PHASE 1: OPEN-SOURCE EXTRACTION (FREE)")
print("="*70)

# Load model
MODEL_ID = "Qwen/Qwen2-VL-7B-Instruct"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# 4-bit quantization is requested only when a GPU is available. It is passed
# through `quantization_config` because handing `load_in_4bit` straight to
# `from_pretrained` is deprecated in transformers.
from transformers import BitsAndBytesConfig

QUANTIZATION_CONFIG = BitsAndBytesConfig(load_in_4bit=True) if DEVICE == "cuda" else None

print(f"\nüì• Loading {MODEL_ID}...")
model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype="auto",
    device_map="auto",
    quantization_config=QUANTIZATION_CONFIG,
)
processor = AutoProcessor.from_pretrained(MODEL_ID)

if torch.cuda.is_available():
    # Release cached allocator blocks left over from loading, then report the
    # total memory of GPU 0 (total capacity, not the amount currently free)
    torch.cuda.empty_cache()
    print(f"üíæ GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

print(f"‚úÖ Model loaded on {DEVICE}")

# Extraction prompt
# Sent as the text part of each Qwen2-VL request, next to the page image.
# The JSON keys listed in it are the same names as the REQUIRED_FIELDS entries.
QWEN_PROMPT = """Extract invoice data and return ONLY valid JSON:

{
  "vendor_name": "company name",
  "vendor_address": "full address",
  "payment_terms": "payment terms",
  "invoice_value": "total amount with tax",
  "company_code": "company/customer code",
  "po_reference": "PO number (digits only)",
  "invoice_id": "invoice number"
}

Return ONLY the JSON, no markdown, no explanation."""

def _parse_qwen_json(response: str) -> Dict:
    """Strip markdown fences from *response* and parse its outermost {...} span as a JSON object."""
    if response.startswith("```"):
        response = re.sub(r"```(?:json)?\n?", "", response).strip("`")

    # Greedy match from the first "{" to the last "}" drops any text the model
    # added before or after the object
    json_match = re.search(r'\{.*\}', response, re.DOTALL)
    if json_match:
        response = json_match.group(0)

    data = json.loads(response)
    if not isinstance(data, dict):
        raise ValueError(f"expected a JSON object, got {type(data).__name__}")
    return data


def extract_with_qwen(image: Image.Image, invoice_id: str) -> Dict:
    """Extract the invoice fields from one page image with Qwen2-VL.

    Args:
        image: Page image passed to the model together with QWEN_PROMPT.
        invoice_id: Identifier used only to label the error message.

    Returns:
        The parsed JSON object with every REQUIRED_FIELDS key present (missing
        ones filled with ""). On any failure (generation error, GPU
        out-of-memory, unparsable output) the error is reported on stdout and
        a dict mapping every required field to "" is returned; the function
        never raises for those cases.
    """
    try:
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        messages = [{
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": QWEN_PROMPT},
            ],
        }]

        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
        inputs = inputs.to(DEVICE)

        with torch.no_grad():
            generated_ids = model.generate(**inputs, max_new_tokens=512, temperature=0.1)

        # Drop the prompt tokens so only the newly generated answer is decoded
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]

        response = processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )[0].strip()

        data = _parse_qwen_json(response)

        # Ensure all fields exist
        for field in REQUIRED_FIELDS:
            data.setdefault(field, "")

        return data

    except Exception as exc:
        # CUDA out-of-memory errors are ordinary Exception subclasses, so one
        # clause covers them. The failure is reported instead of being
        # swallowed silently; the caller falls back to the Claude API.
        print(f"  Qwen2-VL extraction error for {invoice_id}: {str(exc)[:100]}")
        return {field: "" for field in REQUIRED_FIELDS}

    finally:
        # Free cached GPU memory after every attempt, successful or not
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

# Read every invoice page from the invoice directory
invoice_pages = load_invoice_pages(INVOICE_DIR)
print(f"\n‚úÖ Loaded {len(invoice_pages)} page(s)\n")

# Run Qwen2-VL page by page: complete extractions are kept, every other page
# is queued for the Claude API fallback
qwen_results, failed_invoices = [], []

for page in invoice_pages:
    inv_id = page['invoice_id']
    print(f"üìÑ {inv_id} ({page['image'].width}x{page['image'].height})")

    result = extract_with_qwen(page['image'], inv_id)
    # The id derived by the loader replaces whatever the model returned
    result.update(invoice_id=inv_id, file_path=page['file_path'])

    if not is_extraction_complete(result):
        print(f"  ‚ö†Ô∏è  Failed - will retry with Claude API")
        failed_invoices.append(page)
        continue

    print(f"  ‚úÖ Success with Qwen2-VL")
    qwen_results.append(result)

# Phase 1 summary
print(f"\n{'='*70}")
print(f"‚úÖ Open-source: {len(qwen_results)}/{len(invoice_pages)} successful")
print(f"‚ö†Ô∏è  Need Claude API: {len(failed_invoices)} invoices")
print(f"{'='*70}")

## 5. Phase 2: Use Claude API for Failures

In [None]:
import anthropic

print("="*70)
print("üöÄ PHASE 2: CLAUDE API FOR FAILED INVOICES")
print("="*70)

# One entry per page in failed_invoices, in the same order
claude_results = []

if not failed_invoices:
    print("\n‚úÖ No failures - all invoices extracted with open-source!")
else:
    print(f"\nüìã Processing {len(failed_invoices)} failed invoice(s) with Claude API...\n")

    client = anthropic.Anthropic(api_key=CLAUDE_API_KEY)

    CLAUDE_PROMPT = """Extract invoice information and return ONLY a valid JSON object:

{
  "vendor_name": "company providing goods/services",
  "vendor_address": "complete vendor address",
  "payment_terms": "payment terms and conditions",
  "invoice_value": "TOTAL amount INCLUDING VAT/tax with currency symbol",
  "company_code": "company code or customer code",
  "po_reference": "purchase order number (extract numeric digits)",
  "invoice_id": "invoice number"
}

CRITICAL: Return ONLY the JSON object, no markdown, no explanation."""

    for page in failed_invoices:
        inv_id = page['invoice_id']
        print(f"üìÑ {inv_id}")

        try:
            # Convert image to base64
            buffered = BytesIO()
            page['image'].save(buffered, format="PNG")
            img_base64 = base64.b64encode(buffered.getvalue()).decode()

            # Call Claude API
            message = client.messages.create(
                model="claude-sonnet-4-20250514",
                max_tokens=1024,
                temperature=0.0,
                messages=[{
                    "role": "user",
                    "content": [
                        {
                            "type": "image",
                            "source": {
                                "type": "base64",
                                "media_type": "image/png",
                                "data": img_base64,
                            },
                        },
                        {"type": "text", "text": CLAUDE_PROMPT}
                    ],
                }],
            )

            response = message.content[0].text.strip()

            # Clean and parse: drop markdown fences, then keep the outermost {...}
            if response.startswith("```"):
                response = re.sub(r"```(?:json)?\n?", "", response).strip("`")

            json_match = re.search(r'\{.*\}', response, re.DOTALL)
            if json_match:
                response = json_match.group(0)

            data = json.loads(response)

            # Ensure all fields
            for field in REQUIRED_FIELDS:
                if field not in data:
                    data[field] = ""

            # The id derived by the loader replaces whatever the model returned
            data['invoice_id'] = inv_id
            claude_results.append(data)

            print(f"  ‚úÖ Success with Claude API")

        except Exception as e:
            print(f"  ‚ùå Error: {str(e)[:100]}")
            # Keep the invoice id on the placeholder so the failed invoice stays
            # identifiable in the merged results instead of becoming a blank row
            failed_record = {field: "" for field in REQUIRED_FIELDS}
            failed_record['invoice_id'] = inv_id
            claude_results.append(failed_record)

    # Rough estimate: a flat $0.10 per page sent to the API
    estimated_cost = len(failed_invoices) * 0.10
    print(f"\nüí∞ Estimated Claude API cost: ${estimated_cost:.2f}")

print(f"\n{'='*70}")
print(f"‚úÖ Claude API: {sum(1 for r in claude_results if is_extraction_complete(r))}/{len(failed_invoices)} successful")
print(f"{'='*70}")

## 6. Merge Results and Create DataFrame

In [None]:
print("="*70)
print("üìä MERGING RESULTS")
print("="*70)

# Combine all results
all_results = qwen_results + claude_results

# Create DataFrame: one row per result, restricted to REQUIRED_FIELDS
records = []
for result in all_results:
    record = {}
    for field in REQUIRED_FIELDS:
        value = result.get(field)
        # A JSON null must become an empty cell; str(None) would store the
        # literal text "None" and make the field look filled
        record[field] = "" if value is None else str(value).strip()

    # Sanitize PO reference
    record["po_reference"] = sanitize_po_reference(record["po_reference"])

    records.append(record)

df = pd.DataFrame(records, columns=REQUIRED_FIELDS)

print(f"\n‚úÖ Created DataFrame with {len(df)} invoices\n")
print("="*70)
print("üìã FINAL RESULTS")
print("="*70)
print(df.to_string(index=False))

# Detailed per invoice
print("\n" + "="*70)
print("üìä DETAILED RESULTS")
print("="*70)

# Built once so the source lookup below does not rescan qwen_results per row
qwen_invoice_ids = {r['invoice_id'] for r in qwen_results}

for _row_idx, row in df.iterrows():
    inv_id = row['invoice_id']
    # Determine source: anything not produced by Qwen2-VL came from the fallback
    source = "Qwen2-VL" if inv_id in qwen_invoice_ids else "Claude API"

    print(f"\nüìÑ {inv_id} (source: {source}):")
    for field in REQUIRED_FIELDS:
        value = row[field]
        # Empty cells and the all-zero PO placeholder count as not extracted
        status = "‚úÖ" if value and value != "00000000000" else "‚ùå"
        print(f"  {status} {field}: {value if value else '(empty)'}")

# Summary
# Empty strings and the all-zero PO placeholder both count as unfilled cells
empty_count = (df == "").sum().sum() + (df == "00000000000").sum().sum()
total_fields = len(df) * len(REQUIRED_FIELDS)
# Guard against division by zero when no invoice was loaded at all
completeness = ((total_fields - empty_count) / total_fields) * 100 if total_fields else 0.0

print("\n" + "="*70)
print("üìà FINAL SUMMARY")
print("="*70)
print(f"\n‚úÖ Total invoices extracted: {len(df)}")
print(f"   - Open-source (Qwen2-VL): {len(qwen_results)} invoices ($0.00)")
print(f"   - Claude API: {len(claude_results)} invoices (${len(claude_results) * 0.10:.2f})")
print(f"\nüìä Completeness: {completeness:.1f}%")
print(f"   ({total_fields - empty_count}/{total_fields} fields filled)")

# Only claim success when there was at least one field to fill
if total_fields and empty_count == 0:
    print("\nüéâ Perfect! All fields extracted successfully!")

print("\n" + "="*70)
print("‚úÖ READY FOR PUSH.IPYNB")
print("="*70)

## 7. Push to Celonis

In [None]:
# Execute the companion notebook push.ipynb through the IPython %run magic.
# NOTE(review): presumably it uploads the `df` built in the merge step to
# Celonis - confirm against push.ipynb.
%run push.ipynb