In [1]:
import torch
import xmltodict
import json
import time
from pprint import pprint
from transformers import AutoProcessor, AutoModelForVision2Seq
from PIL import Image
from pathlib import Path
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load Model
MODEL_NAME = "naver-clova-ix/donut-base-finetuned-cord-v2"

print("Loading model... (this may take a while)")
processor = AutoProcessor.from_pretrained(MODEL_NAME)
model = AutoModelForVision2Seq.from_pretrained(MODEL_NAME)

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
print(f"Model loaded successfully on {device.upper()}\n")

Loading model... (this may take a while)


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Model loaded successfully on CPU



In [3]:
# Helper Functions
def extract_receipt(image_path):
    """Run inference on a receipt image and return structured data."""
    image = Image.open(image_path).convert("RGB")

    decoder_input_ids = processor.tokenizer(
        "<s_cord-v2>", add_special_tokens=False
    ).input_ids
    decoder_input_ids = torch.tensor(decoder_input_ids).unsqueeze(0).to(device)

    pixel_values = processor(image, return_tensors="pt").pixel_values.to(device)

    start_time = time.time()
    generation_output = model.generate(
        pixel_values,
        decoder_input_ids=decoder_input_ids,
        max_length=model.decoder.config.max_position_embeddings,
        pad_token_id=processor.tokenizer.pad_token_id,
        eos_token_id=processor.tokenizer.eos_token_id,
        use_cache=True,
        num_beams=1,
        bad_words_ids=[[processor.tokenizer.unk_token_id]],
        return_dict_in_generate=True,
    )
    end_time = time.time()
    latency = round(end_time - start_time, 2)

    decoded_sequence = processor.batch_decode(generation_output.sequences)[0]
    decoded_sequence = decoded_sequence.replace(processor.tokenizer.eos_token, "")
    decoded_sequence = decoded_sequence.replace(processor.tokenizer.pad_token, "")
    decoded_sequence += "</s_cord-v2>"

    # Parse XML output safely
    try:
        xml_parsed = xmltodict.parse(decoded_sequence)
    except Exception as e:
        xml_parsed = {"error": str(e), "raw_output": decoded_sequence}

    # Convert to simplified JSON structure
    structured_data = simplify_receipt_data(xml_parsed)

    return structured_data, latency


def simplify_receipt_data(xml_dict):
    """Convert Donut's complex XML output into simplified JSON."""
    try:
        data = xml_dict["cord-v2"]["receipt"]
        items = data.get("items", {}).get("item", [])

        # Normalize item list
        if isinstance(items, dict):
            items = [items]

        item_list = []
        for item in items:
            item_list.append({
                "name": item.get("text", ""),
                "qty": item.get("count", "1"),
                "price": item.get("price", ""),
                "total": item.get("total_price", ""),
                "category": categorize_item(item.get("text", ""))
            })

        result = {
            "store": data.get("store_name", ""),
            "date": data.get("date", ""),
            "items": item_list,
            "subtotal": data.get("subtotal_price", ""),
            "tax": data.get("tax_price", ""),
            "total": data.get("total_price", "")
        }
        return result

    except Exception as e:
        return {"error": str(e), "raw_data": xml_dict}


def categorize_item(name):
    """Simple keyword-based categorization (for AI insight step)."""
    name_lower = name.lower()
    if any(k in name_lower for k in ["tea", "coffee", "drink", "milk", "water"]):
        return "Beverage"
    elif any(k in name_lower for k in ["noodle", "rice", "bread", "snack", "choco"]):
        return "Food"
    elif any(k in name_lower for k in ["soap", "shampoo", "toothpaste"]):
        return "Toiletries"
    elif any(k in name_lower for k in ["tax", "service"]):
        return "Service"
    else:
        return "Others"

In [4]:
# Run inference for 2 receipts

image_dir = Path("../data")
output_dir = Path("../data/outputs")
output_dir.mkdir(exist_ok=True)

results = []

for img_file in ["receipt1.jpg", "receipt2.jpg"]:
    print(f"Processing {img_file} ...")
    result, latency = extract_receipt(image_dir / img_file)
    results.append({"file": img_file, "data": result, "latency": latency})

    # Save JSON output
    with open(output_dir / f"{img_file.replace('.jpg', '_donut_output.json')}", "w") as f:
        json.dump(result, f, indent=2)

    print(f"Done in {latency}s. Saved JSON.\n")

Processing receipt1.jpg ...


FileNotFoundError: [Errno 2] No such file or directory: '../data/receipt1.jpg'