In [1]:
from pathlib import Path
from pprint import pprint

import torch
import xmltodict
from PIL import Image
from transformers import AutoProcessor, AutoModelForVision2Seq

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load Model
# Fetch the Donut processor and vision-to-sequence weights for the checkpoint
# named below, then place the model on the GPU when one is available.

MODEL_NAME = "naver-clova-ix/donut-base-finetuned-cord-v2"

if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

print("Loading Donut model...")
processor = AutoProcessor.from_pretrained(MODEL_NAME)
# .to() returns the module itself, so chaining keeps `model` bound to the same object.
model = AutoModelForVision2Seq.from_pretrained(MODEL_NAME).to(device)
print(f"Model loaded on {device.upper()}")

Loading Donut model...


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Model loaded on CPU


In [3]:
# Helper Function
def _as_list(value):
    """Normalize a parsed XML value: None -> [], list -> unchanged, anything else -> [value]."""
    if value is None:
        return []
    if isinstance(value, list):
        return value
    return [value]


def _section_value(data, section, field):
    """Return data[section][field], or "" when the section is missing or is not a mapping."""
    part = data.get(section)
    if isinstance(part, dict):
        return part.get(field, "")
    return ""


def _extract_items(data):
    """Build the list of {"name", "qty", "price"} dicts from the parsed ``s_menu`` section."""
    items = []
    try:
        menu = data["s_menu"]
        # Ensure every field is a list: xmltodict yields a bare value for a
        # single occurrence and a list for repeated elements.
        names = _as_list(menu["s_nm"])
        qtys = _as_list(menu["s_cnt"])
        prices = _as_list(menu["s_price"])
    except (KeyError, TypeError) as e:
        # Missing menu/field, or the menu is not a mapping: report and return no items.
        print("Parsing items failed:", e)
        return items

    # NOTE: zip() stops at the shortest list, so fields are paired purely by position.
    for n, q, p in zip(names, qtys, prices):
        try:
            items.append({
                "name": n.strip(),
                "qty": int(q),
                "price": p.strip().replace(",", "")
            })
        except (AttributeError, TypeError, ValueError) as e:
            # Skip only the malformed row (e.g. empty element or non-integer
            # quantity) instead of discarding every row that follows it.
            print(f"Skipping unparsable item {n!r}:", e)
    return items


def read_receipt_with_donut(image_path: str):
    """Run Donut model inference on a receipt image and return the parsed fields.

    Uses the module-level ``processor`` and ``model``.

    Args:
        image_path: Path to the receipt image file.

    Returns:
        A dict with keys:
          * "store": always None (not extracted here).
          * "items": list of dicts with "name" (str), "qty" (int) and
            "price" (str with commas removed).
          * "subtotal" / "total": the raw ``s_subtotal_price`` /
            ``s_total_price`` values, or "" when the section is absent.

    Raises:
        Errors from opening the image or from ``xmltodict.parse`` (when the
        decoded sequence is not well-formed XML) propagate to the caller.
    """
    # The context manager closes the underlying file; convert() returns a
    # separate image object that stays usable afterwards.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB")

    # Inputs must live on the same device as the model weights, otherwise
    # generate() fails when the model has been moved to the GPU.
    target_device = model.device

    prompt_ids = processor.tokenizer("<s_cord-v2>", add_special_tokens=False).input_ids
    decoder_input_ids = torch.tensor(prompt_ids).unsqueeze(0).to(target_device)

    pixel_values = processor(image, return_tensors="pt").pixel_values.to(target_device)

    # Run inference
    generation_output = model.generate(
        pixel_values,
        decoder_input_ids=decoder_input_ids,
        max_length=model.decoder.config.max_position_embeddings,
        pad_token_id=processor.tokenizer.pad_token_id,
        eos_token_id=processor.tokenizer.eos_token_id,
        use_cache=True,
        num_beams=1,
        bad_words_ids=[[processor.tokenizer.unk_token_id]],
        return_dict_in_generate=True,
    )

    decoded_sequence = processor.batch_decode(generation_output.sequences)[0]
    decoded_sequence = decoded_sequence.replace(processor.tokenizer.eos_token, "")
    decoded_sequence = decoded_sequence.replace(processor.tokenizer.pad_token, "")
    # Append the closing root tag so the sequence can be parsed as XML.
    decoded_sequence += "</s_cord-v2>"

    # Parse Donut XML output; an empty root element parses to None, so fall back to {}.
    my_dict = xmltodict.parse(decoded_sequence)
    data = my_dict.get("s_cord-v2") or {}
    if not isinstance(data, dict):
        data = {}

    result = {
        "store": None,
        "items": _extract_items(data),
        "subtotal": _section_value(data, "s_sub_total", "s_subtotal_price"),
        "total": _section_value(data, "s_total", "s_total_price")
    }
    return result

In [4]:
# Run Test on One Receipt
# Build the sample path from the current user's home directory instead of a
# hardcoded /Users/<name>/... prefix, so the cell is not tied to one account.
DATA_DIR = Path.home() / "Documents" / "SmartSplitBill AI" / "modules" / "data"
image_path = str(DATA_DIR / "receipt1.jpg")
result = read_receipt_with_donut(image_path)

print("Parsed Receipt Result:")
pprint(result)

Parsed Receipt Result:
{'items': [{'name': 'Matcha Float', 'price': '23000', 'qty': 1},
           {'name': 'Red Velvet Ice', 'price': '20000', 'qty': 1},
           {'name': 'Coklat Float', 'price': '23000', 'qty': 1},
           {'name': 'Korean Strawberry', 'price': '25000', 'qty': 1},
           {'name': 'Americano Ice', 'price': '15000', 'qty': 1}],
 'store': None,
 'subtotal': '106,000',
 'total': '106,000'}
