<a href="https://colab.research.google.com/github/ocr-workspace/Ocr-Model-testing-on-Retail-Structured-Invoice-/blob/main/Structured(table)_Invoice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Dependency install

In [None]:
!apt-get install tesseract-ocr -y
!pip install pytesseract
!pip install datasets
!pip install evaluate
!pip install opencv-python
!pip install pillow
!pip install pandas

Loading dataset named Cord from hugging face

In [None]:
from datasets import load_dataset

ds_invoice = load_dataset("naver-clova-ix/cord-v2")


**Clustering Module** for table reconstruction

In [None]:
import numpy as np
from sklearn.cluster import DBSCAN

def cluster_rows(words, image_height):
    """
    words: list of dicts with keys:
           text, x_center, y_center, bbox
    image_height: height of invoice image

    Returns:
        list of row clusters (each cluster = list of words)
    """

    if len(words) == 0:
        return []

    # Extract Y centers
    y_coords = np.array([[w["y_center"]] for w in words])

    # Dynamic eps based on image height
    eps = image_height * 0.015  # 1.5% of height (tunable)

    clustering = DBSCAN(eps=eps, min_samples=1).fit(y_coords)

    labels = clustering.labels_

    row_clusters = {}

    for label, word in zip(labels, words):
        if label not in row_clusters:
            row_clusters[label] = []
        row_clusters[label].append(word)

    # Sort rows top-to-bottom
    sorted_rows = sorted(
        row_clusters.values(),
        key=lambda cluster: np.mean([w["y_center"] for w in cluster])
    )

    return sorted_rows

**Word Standardization Layer**

In [None]:
def standardize_ocr_output(ocr_result):

    words = []

    for item in ocr_result:
        x_min, y_min, x_max, y_max = item["bbox"]

        words.append({
            "text": item["text"],
            "x_center": (x_min + x_max) / 2,
            "y_center": (y_min + y_max) / 2,
            "bbox": [x_min, y_min, x_max, y_max]
        })

    return words

**Tesseract Wrapper**

In [None]:
import pytesseract
import cv2
import numpy as np

def tesseract_ocr(image):

    # Convert to RGB if needed
    if isinstance(image, np.ndarray):
        img = image
    else:
        img = np.array(image)

    data = pytesseract.image_to_data(
        img,
        output_type=pytesseract.Output.DICT
    )

    words = []

    n_boxes = len(data['text'])

    for i in range(n_boxes):

        text = data['text'][i].strip()

        if text == "":
            continue

        x = data['left'][i]
        y = data['top'][i]
        w = data['width'][i]
        h = data['height'][i]

        words.append({
            "text": text,
            "bbox": [x, y, x+w, y+h]
        })

    return words

**EasyOCR Wrapper**

In [None]:
!pip install easyocr

In [None]:
import easyocr

reader = easyocr.Reader(['en'], gpu=True)

def easyocr_ocr(image):

    if not isinstance(image, np.ndarray):
        image = np.array(image)

    results = reader.readtext(image)

    words = []

    for (bbox, text, conf) in results:

        x_coords = [pt[0] for pt in bbox]
        y_coords = [pt[1] for pt in bbox]

        x_min = int(min(x_coords))
        y_min = int(min(y_coords))
        x_max = int(max(x_coords))
        y_max = int(max(y_coords))

        words.append({
            "text": text,
            "bbox": [x_min, y_min, x_max, y_max]
        })

    return words

**PaddleOCR Wrapper**

In [None]:
!pip install paddlepaddle-gpu


In [None]:
# Force-uninstall the broken numpy
!pip uninstall numpy -y

# Install the packages but strictly lock numpy at 1.26.4 simultaneously
!pip install numpy==1.26.4 paddleocr==2.8.1 evaluate doctr

In [None]:
!pip list | grep -E "numpy|paddle|evaluate"

evaluate                                 0.4.6
numpy                                    1.26.4
paddleocr                                2.8.1
paddlepaddle-gpu                         2.6.2


In [None]:
# Uninstall langchain to bypass the PaddleX import bug
# This will NOT touch your numpy or paddlepaddle installations!
!pip uninstall langchain langchain-community langchain-core -y

In [None]:
import numpy as np

# 1. The Monkey Patch: Recreate the missing np.sctypes dictionary for imgaug
if not hasattr(np, 'sctypes'):
    np.sctypes = {
        'int': [np.int8, np.int16, np.int32, np.int64],
        'uint': [np.uint8, np.uint16, np.uint32, np.uint64],
        'float': [np.float16, np.float32, np.float64],
        'complex': [np.complex64, np.complex128],
        'others': [bool, object, bytes, str, np.void]
    }

# 2. Now import PaddleOCR (it will load successfully because the crash is bypassed!)
from paddleocr import PaddleOCR

# 3. Initialize your model
paddle_model = PaddleOCR(
    use_angle_cls=True,
    lang='en',
    use_gpu=True
)

print("PaddleOCR successfully loaded!")

def paddle_ocr(image):

    if not isinstance(image, np.ndarray):
        image = np.array(image)

    result = paddle_model.ocr(image, cls=True)

    words = []

    for line in result:
        for word_info in line:

            box = word_info[0]
            text = word_info[1][0]

            x_coords = [pt[0] for pt in box]
            y_coords = [pt[1] for pt in box]

            x_min = int(min(x_coords))
            y_min = int(min(y_coords))
            x_max = int(max(x_coords))
            y_max = int(max(y_coords))

            words.append({
                "text": text,
                "bbox": [x_min, y_min, x_max, y_max]
            })

    return words

**DocTR Wrapper**

In [None]:
!pip install python-doctr[torch]

In [None]:
from doctr.io import DocumentFile
from doctr.models import ocr_predictor

doctr_model = ocr_predictor(pretrained=True)

def doctr_ocr(image):

    if not isinstance(image, np.ndarray):
        image = np.array(image)

    result = doctr_model([image])

    words = []

    for page in result.pages:
        for block in page.blocks:
            for line in block.lines:
                for word in line.words:

                    x_min, y_min = word.geometry[0]
                    x_max, y_max = word.geometry[1]

                    h, w = image.shape[:2]

                    words.append({
                        "text": word.value,
                        "bbox": [
                            int(x_min * w),
                            int(y_min * h),
                            int(x_max * w),
                            int(y_max * h)
                        ]
                    })

    return words

**Ground Truth Text Builder**

In [None]:
import json

def build_gt_text(sample):

    gt_outer = json.loads(sample["ground_truth"])
    parsed = gt_outer["gt_parse"]

    lines = []

    menu = parsed.get("menu", [])

    # Case 1: menu is list (normal case)
    if isinstance(menu, list):
        for item in menu:
            if isinstance(item, dict):
                cnt = item.get("cnt", "")
                name = item.get("nm", "")
                price = item.get("price", "")
                lines.append(f"{cnt} {name} {price}")

    # Case 2: menu is single dict (edge case)
    elif isinstance(menu, dict):
        cnt = menu.get("cnt", "")
        name = menu.get("nm", "")
        price = menu.get("price", "")
        lines.append(f"{cnt} {name} {price}")

    # Subtotal
    sub = parsed.get("sub_total", {})
    if isinstance(sub, dict):
        for value in sub.values():
            lines.append(str(value))

    # Total
    total = parsed.get("total", {})
    if isinstance(total, dict) and "total_price" in total:
        lines.append(str(total["total_price"]))

    return " ".join(lines)

**Universal Evalaution Runner**

In [None]:
!pip install jiwer

In [None]:
import evaluate
import time
import pandas as pd

cer_metric = evaluate.load("cer")
wer_metric = evaluate.load("wer")

def run_ocr_benchmark(ds, ocr_function, model_name, samples=50):

    results = []

    for i in range(samples):

        sample = ds["train"][i]
        image = sample["image"]
        gt_text = build_gt_text(sample)

        start_time = time.time()

        words = ocr_function(image)

        if isinstance(words, str):
           ocr_text = words
        else:
           ocr_text = " ".join([w["text"] for w in words])

        end_time = time.time()

        cer = cer_metric.compute(
            predictions=[ocr_text],
            references=[gt_text]
        )

        wer = wer_metric.compute(
            predictions=[ocr_text],
            references=[gt_text]
        )

        results.append({
            "CER": cer,
            "WER": wer,
            "Time": end_time - start_time
        })

    df = pd.DataFrame(results)

    print(f"\n===== {model_name} Results =====")
    print("Mean CER:", df["CER"].mean())
    print("Mean WER:", df["WER"].mean())
    print("Avg Time:", df["Time"].mean())

    return df

**Tesseract**

In [None]:
for i in range(5):
    print(type(json.loads(ds_invoice["train"][i]["ground_truth"])["gt_parse"]["menu"]))

In [None]:
df_tess = run_ocr_benchmark(ds_invoice, tesseract_ocr, "Tesseract")


===== Tesseract Results =====
Mean CER: 1.2425900779068786
Mean WER: 1.663429996573475
Avg Time: 0.9068947362899781


**EasyOCR**

In [None]:
df_easy = run_ocr_benchmark(ds_invoice, easyocr_ocr, "EasyOCR")


===== EasyOCR Results =====
Mean CER: 1.1637943945368556
Mean WER: 1.7482964184519856
Avg Time: 0.5298595905303956


In [None]:
df_paddle = run_ocr_benchmark(ds_invoice, paddle_ocr, "PaddleOCR")

[2026/02/24 12:37:59] ppocr DEBUG: dt_boxes num : 80, elapsed : 0.8221826553344727
[2026/02/24 12:37:59] ppocr DEBUG: cls num  : 80, elapsed : 0.2342057228088379
[2026/02/24 12:38:00] ppocr DEBUG: rec_res num  : 80, elapsed : 0.31259799003601074
[2026/02/24 12:38:00] ppocr DEBUG: dt_boxes num : 29, elapsed : 0.04682731628417969
[2026/02/24 12:38:00] ppocr DEBUG: cls num  : 29, elapsed : 0.08996272087097168
[2026/02/24 12:38:00] ppocr DEBUG: rec_res num  : 29, elapsed : 0.1291031837463379
[2026/02/24 12:38:00] ppocr DEBUG: dt_boxes num : 24, elapsed : 0.0569760799407959
[2026/02/24 12:38:00] ppocr DEBUG: cls num  : 24, elapsed : 0.06496500968933105
[2026/02/24 12:38:00] ppocr DEBUG: rec_res num  : 24, elapsed : 0.05875110626220703
[2026/02/24 12:38:00] ppocr DEBUG: dt_boxes num : 20, elapsed : 0.06055402755737305
[2026/02/24 12:38:00] ppocr DEBUG: cls num  : 20, elapsed : 0.03250241279602051
[2026/02/24 12:38:00] ppocr DEBUG: rec_res num  : 20, elapsed : 0.05735635757446289
[2026/02/24 

**PaddleOcr**

**Doctr**

In [None]:
df_doctr = run_ocr_benchmark(ds_invoice, doctr_ocr, "DocTR")


===== DocTR Results =====
Mean CER: 1.193762271922286
Mean WER: 1.407587275966102
Avg Time: 2.0541904258728025


**Paddle Ocr(detection) with TrOcr(recognition)**

In [None]:
!pip install transformers timm sentencepiece -q

In [None]:
import torch
from transformers import TrOCRProcessor, VisionEncoderDecoderModel

device = "cuda" if torch.cuda.is_available() else "cpu"

processor = TrOCRProcessor.from_pretrained(
    "microsoft/trocr-base-printed"
)

model_trocr = VisionEncoderDecoderModel.from_pretrained(
    "microsoft/trocr-base-printed"
).to(device)

model_trocr.eval()

In [None]:
from paddleocr import PaddleOCR

ocr_det = PaddleOCR(
    use_angle_cls=True,
    lang="en",
    rec=False,         # ‚ùó Detection only
    use_gpu=True
)

**Hybrid OCR Function**

In [None]:
import numpy as np
import cv2

def hybrid_ocr(image):

    if not isinstance(image, np.ndarray):
        image = np.array(image)

    result = ocr_det.ocr(image, cls=True)

    if result is None:
        return ""

    boxes = []
    for line in result:
        for word_info in line:
            boxes.append(word_info[0])

    # Sort top-to-bottom, left-to-right
    boxes = sorted(boxes, key=lambda x: (x[0][1], x[0][0]))

    recognized_words = []

    for box in boxes:

        pts = np.array(box).astype(int)

        x_min = max(0, min(pts[:,0]))
        y_min = max(0, min(pts[:,1]))
        x_max = min(image.shape[1], max(pts[:,0]))
        y_max = min(image.shape[0], max(pts[:,1]))

        crop = image[y_min:y_max, x_min:x_max]

        if crop.size == 0:
            continue

        # Resize for better transformer recognition
        crop = cv2.resize(crop, None, fx=2, fy=2)

        pixel_values = processor(
            images=crop,
            return_tensors="pt"
        ).pixel_values.to(device)

        with torch.no_grad():
            generated_ids = model_trocr.generate(pixel_values)

        text = processor.batch_decode(
            generated_ids,
            skip_special_tokens=True
        )[0]

        recognized_words.append(text)

    return " ".join(recognized_words)

In [None]:
df_hybrid = run_ocr_benchmark(ds_invoice, hybrid_ocr, "Hybrid")

[2026/02/24 12:54:54] ppocr DEBUG: dt_boxes num : 80, elapsed : 0.06696462631225586
[2026/02/24 12:54:54] ppocr DEBUG: cls num  : 80, elapsed : 0.15387940406799316
[2026/02/24 12:54:54] ppocr DEBUG: rec_res num  : 80, elapsed : 0.2423720359802246
[2026/02/24 12:55:02] ppocr DEBUG: dt_boxes num : 29, elapsed : 0.04482865333557129
[2026/02/24 12:55:02] ppocr DEBUG: cls num  : 29, elapsed : 0.04314899444580078
[2026/02/24 12:55:02] ppocr DEBUG: rec_res num  : 29, elapsed : 0.07915234565734863
[2026/02/24 12:55:05] ppocr DEBUG: dt_boxes num : 24, elapsed : 0.04288029670715332
[2026/02/24 12:55:05] ppocr DEBUG: cls num  : 24, elapsed : 0.06281566619873047
[2026/02/24 12:55:05] ppocr DEBUG: rec_res num  : 24, elapsed : 0.05519843101501465
[2026/02/24 12:55:08] ppocr DEBUG: dt_boxes num : 20, elapsed : 0.04532909393310547
[2026/02/24 12:55:08] ppocr DEBUG: cls num  : 20, elapsed : 0.034276723861694336
[2026/02/24 12:55:08] ppocr DEBUG: rec_res num  : 20, elapsed : 0.0469670295715332
[2026/02/

**Donut**

In [None]:
!pip install transformers sentencepiece timm -q

Loading model

In [None]:
import torch
from transformers import DonutProcessor, VisionEncoderDecoderModel

device = "cuda" if torch.cuda.is_available() else "cpu"

processor = DonutProcessor.from_pretrained(
    "naver-clova-ix/donut-base-finetuned-cord-v2"
)

model_donut = VisionEncoderDecoderModel.from_pretrained(
    "naver-clova-ix/donut-base-finetuned-cord-v2"
).to(device)

model_donut.eval()

Donut Ocr function that returns structured json

In [None]:
import json

def donut_inference(image):

    if image.mode != "RGB":
        image = image.convert("RGB")

    # Prepare image
    pixel_values = processor(image, return_tensors="pt").pixel_values.to(device)

    # Prompt for CORD task
    task_prompt = "<s_cord-v2>"
    decoder_input_ids = processor.tokenizer(
        task_prompt,
        add_special_tokens=False,
        return_tensors="pt"
    ).input_ids.to(device)

    with torch.no_grad():
        outputs = model_donut.generate(
            pixel_values,
            decoder_input_ids=decoder_input_ids,
            max_length=512,
            early_stopping=True,
            pad_token_id=processor.tokenizer.pad_token_id,
            eos_token_id=processor.tokenizer.eos_token_id,
            use_cache=True,
        )

    seq = processor.batch_decode(outputs, skip_special_tokens=True)[0]

    # Convert to JSON
    try:
        result = processor.token2json(seq)
    except:
        result = {}

    return result

Evalaution function for Donut

In [None]:
import json

def evaluate_donut(ds, samples=50):

    total_correct = 0
    item_count_correct = 0

    for i in range(samples):

        sample = ds["train"][i]
        image = sample["image"]

        gt = json.loads(sample["ground_truth"])
        gt_parse = gt["gt_parse"]

        pred = donut_inference(image)

        # ---- FIXED HERE ----
        pred_parse = pred

        # ---- Total Accuracy ----
        gt_total = gt_parse.get("total", {}).get("total_price", "")
        pred_total = pred_parse.get("total", {}).get("total_price", "")

        total_match = int(gt_total == pred_total)
        total_correct += total_match

        # ---- Item Count Accuracy ----
        gt_items = len(gt_parse.get("menu", []))
        pred_items = len(pred_parse.get("menu", []))

        item_match = int(gt_items == pred_items)
        item_count_correct += item_match

    print("Total Accuracy %:", total_correct/samples*100)
    print("Item Count Accuracy %:", item_count_correct/samples*100)

In [None]:
evaluate_donut(ds_invoice, samples=50)

Total Accuracy %: 100.0
Item Count Accuracy %: 100.0


**Testing robustness of Donut at diffrent noisy image**

Image Perturbation Function

In [None]:
import numpy as np
import cv2
from PIL import Image

def perturb_image(image, mode="noise"):

    img = np.array(image)

    if mode == "noise":
        noise = np.random.normal(0, 25, img.shape).astype(np.uint8)
        img = cv2.add(img, noise)

    elif mode == "blur":
        img = cv2.GaussianBlur(img, (7,7), 0)

    elif mode == "downscale":
        h, w = img.shape[:2]
        img = cv2.resize(img, (w//2, h//2))
        img = cv2.resize(img, (w, h))

    elif mode == "rotate":
        h, w = img.shape[:2]
        M = cv2.getRotationMatrix2D((w//2, h//2), 5, 1)
        img = cv2.warpAffine(img, M, (w, h))

    elif mode == "brightness":
        img = cv2.convertScaleAbs(img, alpha=1.2, beta=40)

    return Image.fromarray(img)

> Robustness Evaluation

In [None]:
def evaluate_donut_robust(ds, mode="noise", samples=50):

    total_correct = 0
    item_count_correct = 0

    for i in range(samples):

        sample = ds["train"][i]
        image = sample["image"]

        # Apply perturbation
        image = perturb_image(image, mode)

        gt = json.loads(sample["ground_truth"])
        gt_parse = gt["gt_parse"]

        pred = donut_inference(image)
        pred_parse = pred

        gt_total = gt_parse.get("total", {}).get("total_price", "")
        pred_total = pred_parse.get("total", {}).get("total_price", "")

        total_correct += int(gt_total == pred_total)

        gt_items = len(gt_parse.get("menu", []))
        pred_items = len(pred_parse.get("menu", []))

        item_count_correct += int(gt_items == pred_items)

    print(f"=== Mode: {mode} ===")
    print("Total Accuracy %:", total_correct/samples*100)
    print("Item Count Accuracy %:", item_count_correct/samples*100)

In [None]:
evaluate_donut_robust(ds_invoice, mode="noise")
evaluate_donut_robust(ds_invoice, mode="blur")
evaluate_donut_robust(ds_invoice, mode="downscale")
evaluate_donut_robust(ds_invoice, mode="rotate")
evaluate_donut_robust(ds_invoice, mode="brightness")

The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


=== Mode: noise ===
Total Accuracy %: 34.0
Item Count Accuracy %: 40.0
=== Mode: blur ===
Total Accuracy %: 88.0
Item Count Accuracy %: 88.0
=== Mode: downscale ===
Total Accuracy %: 86.0
Item Count Accuracy %: 94.0
=== Mode: rotate ===
Total Accuracy %: 100.0
Item Count Accuracy %: 100.0
=== Mode: brightness ===
Total Accuracy %: 98.0
Item Count Accuracy %: 98.0


**Reconstruct text and then we check accuracy on diffrent matrix**

Reconstruction Function#

In [None]:
import re

def reconstruct_invoice_from_text(text):

    lines = text.split("\n")
    menu = []
    total_price = ""

    for line in lines:

        # Try menu row
        match = re.search(r'(\d+)\s*x\s*(.*?)\s+([\d,.-]+)', line)
        if match:
            cnt = match.group(1) + " x"
            name = match.group(2).strip()
            price = match.group(3)
            menu.append({
                "cnt": cnt,
                "nm": name,
                "price": price
            })

        # Try total detection
        if "total" in line.lower():
            price_match = re.search(r'([\d,.-]+)', line)
            if price_match:
                total_price = price_match.group(1)

    return {
        "menu": menu,
        "total": {"total_price": total_price}
    }

* Structured Evaluation for Any OCR

In [None]:
def evaluate_reconstruction(ds, ocr_function, samples=50):

    total_correct = 0
    item_count_correct = 0

    for i in range(samples):

        sample = ds["train"][i]
        image = sample["image"]

        gt = json.loads(sample["ground_truth"])
        gt_parse = gt["gt_parse"]

        raw_output = ocr_function(image)

        # ---- UNIVERSAL NORMALIZATION ----
        if isinstance(raw_output, str):
            ocr_text = raw_output

        elif isinstance(raw_output, list):
            # case: list of dicts
            if len(raw_output) > 0 and isinstance(raw_output[0], dict):
                ocr_text = " ".join([w.get("text", "") for w in raw_output])
            else:
                ocr_text = " ".join([str(w) for w in raw_output])

        else:
            ocr_text = str(raw_output)

        pred_parse = reconstruct_invoice_from_text(ocr_text)

        # ---- Total Accuracy ----
        gt_total = gt_parse.get("total", {}).get("total_price", "")
        pred_total = pred_parse.get("total", {}).get("total_price", "")

        total_correct += int(gt_total == pred_total)

        # ---- Item Count Accuracy ----
        gt_items = len(gt_parse.get("menu", []))
        pred_items = len(pred_parse.get("menu", []))

        item_count_correct += int(gt_items == pred_items)

    print("Total Accuracy %:", total_correct/samples*100)
    print("Item Count Accuracy %:", item_count_correct/samples*100)

In [None]:
import torch
import gc

torch.cuda.empty_cache()
gc.collect()

8

In [None]:
import json

In [None]:
evaluate_reconstruction(ds_invoice, paddle_ocr)


[2026/02/25 04:29:54] ppocr DEBUG: dt_boxes num : 80, elapsed : 1.3662643432617188
[2026/02/25 04:29:55] ppocr DEBUG: cls num  : 80, elapsed : 0.22235393524169922
[2026/02/25 04:29:55] ppocr DEBUG: rec_res num  : 80, elapsed : 0.32889890670776367
[2026/02/25 04:29:55] ppocr DEBUG: dt_boxes num : 29, elapsed : 0.04947257041931152
[2026/02/25 04:29:55] ppocr DEBUG: cls num  : 29, elapsed : 0.09712028503417969
[2026/02/25 04:29:55] ppocr DEBUG: rec_res num  : 29, elapsed : 0.13600683212280273
[2026/02/25 04:29:55] ppocr DEBUG: dt_boxes num : 24, elapsed : 0.05992603302001953
[2026/02/25 04:29:56] ppocr DEBUG: cls num  : 24, elapsed : 0.06924867630004883
[2026/02/25 04:29:56] ppocr DEBUG: rec_res num  : 24, elapsed : 0.0584104061126709
[2026/02/25 04:29:56] ppocr DEBUG: dt_boxes num : 20, elapsed : 0.06280922889709473
[2026/02/25 04:29:56] ppocr DEBUG: cls num  : 20, elapsed : 0.034729957580566406
[2026/02/25 04:29:56] ppocr DEBUG: rec_res num  : 20, elapsed : 0.057021379470825195
[2026/02

In [None]:
evaluate_reconstruction(ds_invoice, tesseract_ocr)

Total Accuracy %: 4.0
Item Count Accuracy %: 0.0


In [None]:
evaluate_reconstruction(ds_invoice, easyocr_ocr)

Total Accuracy %: 2.0
Item Count Accuracy %: 0.0


In [None]:
evaluate_reconstruction(ds_invoice, doctr_ocr)

Total Accuracy %: 2.0
Item Count Accuracy %: 0.0


In [None]:


evaluate_reconstruction(ds_invoice, hybrid_ocr)

[2026/02/25 05:00:49] ppocr DEBUG: dt_boxes num : 80, elapsed : 0.06450271606445312
[2026/02/25 05:00:49] ppocr DEBUG: cls num  : 80, elapsed : 0.17209267616271973
[2026/02/25 05:00:49] ppocr DEBUG: rec_res num  : 80, elapsed : 0.25741004943847656
[2026/02/25 05:00:58] ppocr DEBUG: dt_boxes num : 29, elapsed : 0.04198718070983887
[2026/02/25 05:00:58] ppocr DEBUG: cls num  : 29, elapsed : 0.03947925567626953
[2026/02/25 05:00:58] ppocr DEBUG: rec_res num  : 29, elapsed : 0.0767512321472168
[2026/02/25 05:01:01] ppocr DEBUG: dt_boxes num : 24, elapsed : 0.03599214553833008
[2026/02/25 05:01:01] ppocr DEBUG: cls num  : 24, elapsed : 0.05639934539794922
[2026/02/25 05:01:01] ppocr DEBUG: rec_res num  : 24, elapsed : 0.054166316986083984
[2026/02/25 05:01:04] ppocr DEBUG: dt_boxes num : 20, elapsed : 0.040987491607666016
[2026/02/25 05:01:04] ppocr DEBUG: cls num  : 20, elapsed : 0.032521724700927734
[2026/02/25 05:01:04] ppocr DEBUG: rec_res num  : 20, elapsed : 0.046328067779541016
[2026

**Function For getting structured output form paddleOcr**

In [None]:
def paddle_ocr_structured(image):
    # Fix: Convert the PIL Image from the dataset into a NumPy array
    if not isinstance(image, np.ndarray):
        image = np.array(image)

    result = paddle_model.ocr(image, cls=True)

    words = []

    for line in result:
        for word_info in line:
            box = word_info[0]
            text = word_info[1][0]

            x_min = min([pt[0] for pt in box])
            y_min = min([pt[1] for pt in box])

            words.append({
                "text": text,
                "x": x_min,
                "y": y_min
            })

    return words

**Row Clustering **

In [None]:
#clustring using y cordinate proximity
def cluster_rows(words):

    words = sorted(words, key=lambda w: w["y"])

    rows = []
    current_row = []

    if not words:
        return rows

    # Estimate average vertical spacing
    ys = sorted([w["y"] for w in words])
    avg_gap = sum(abs(ys[i] - ys[i-1]) for i in range(1, len(ys))) / max(1, len(ys)-1)

    y_threshold = avg_gap * 1.2  # adaptive

    for word in words:

        if not current_row:
            current_row.append(word)
            continue

        if abs(word["y"] - current_row[-1]["y"]) < y_threshold:
            current_row.append(word)
        else:
            rows.append(current_row)
            current_row = [word]

    if current_row:
        rows.append(current_row)

    return rows

**Reconstruction Function From  Spatial **

In [None]:
def reconstruct_from_spatial(words):

    if not words:
        return {"menu": [], "total": {"total_price": ""}}

    rows = cluster_rows(words)

    menu = []
    total_price = ""

    # Collect all X positions
    x_positions = sorted([w["x"] for w in words])
    max_x = max(x_positions)

    # Estimate price column dynamically (rightmost 20%)
    price_threshold = max_x * 0.6   # more relaxed

    for row in rows:

        row = sorted(row, key=lambda w: w["x"])

        qty = ""
        price = ""
        name_tokens = []

        for w in row:
            text = w["text"]

            # Detect quantity (digit near left)
            if text.isdigit() and w["x"] < max_x * 0.2:
                qty = text

            # Detect price (number containing comma or digit cluster near right)
            elif any(c.isdigit() for c in text) and w["x"] > price_threshold:
                price = text

            else:
                name_tokens.append(text)

        line_text = " ".join([w["text"] for w in row]).lower()

        # Detect total row
        if "total" in line_text and price:
            total_price = price
            continue

        # Detect menu row
        if price and name_tokens:
            if not qty:
                qty = "1"
            menu.append({
                "cnt": qty + " x",
                "nm": " ".join(name_tokens),
                "price": price
            })

    return {
        "menu": menu,
        "total": {"total_price": total_price}
    }

**Evalaution function for spatial construction**

In [None]:
def evaluate_spatial_paddle(ds, samples=50):

    total_correct = 0
    item_count_correct = 0

    for i in range(samples):

        sample = ds["train"][i]
        image = sample["image"]

        gt = json.loads(sample["ground_truth"])
        gt_parse = gt["gt_parse"]

        words = paddle_ocr_structured(image)
        pred_parse = reconstruct_from_spatial(words)

        gt_total = gt_parse.get("total", {}).get("total_price", "")
        pred_total = pred_parse.get("total", {}).get("total_price", "")

        total_correct += int(gt_total == pred_total)

        gt_items = len(gt_parse.get("menu", []))
        pred_items = len(pred_parse.get("menu", []))

        item_count_correct += int(gt_items == pred_items)

    print("Total Accuracy %:", total_correct/samples*100)
    print("Item Count Accuracy %:", item_count_correct/samples*100)

In [None]:
evaluate_spatial_paddle(ds_invoice, samples=50)

[2026/02/25 06:10:08] ppocr DEBUG: dt_boxes num : 80, elapsed : 0.05966067314147949
[2026/02/25 06:10:09] ppocr DEBUG: cls num  : 80, elapsed : 0.11289453506469727
[2026/02/25 06:10:09] ppocr DEBUG: rec_res num  : 80, elapsed : 0.23305630683898926
[2026/02/25 06:10:09] ppocr DEBUG: dt_boxes num : 29, elapsed : 0.03991222381591797
[2026/02/25 06:10:09] ppocr DEBUG: cls num  : 29, elapsed : 0.04390525817871094
[2026/02/25 06:10:09] ppocr DEBUG: rec_res num  : 29, elapsed : 0.07392764091491699
[2026/02/25 06:10:09] ppocr DEBUG: dt_boxes num : 24, elapsed : 0.03527402877807617
[2026/02/25 06:10:09] ppocr DEBUG: cls num  : 24, elapsed : 0.03273439407348633
[2026/02/25 06:10:09] ppocr DEBUG: rec_res num  : 24, elapsed : 0.05725502967834473
[2026/02/25 06:10:09] ppocr DEBUG: dt_boxes num : 20, elapsed : 0.0383145809173584
[2026/02/25 06:10:09] ppocr DEBUG: cls num  : 20, elapsed : 0.03424429893493652
[2026/02/25 06:10:09] ppocr DEBUG: rec_res num  : 20, elapsed : 0.049867868423461914
[2026/02