<a href="https://colab.research.google.com/github/ocr-workspace/Ocr-Model-testing-on-Retail-Structured-Invoice-/blob/main/Structured_Invoice_Vast.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Dependency install

In [None]:
import os
# This hides the second GPU, preventing the DataParallel crash!
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [None]:
!pip install transformers==4.49.0

In [None]:
!apt-get install tesseract-ocr -y
!pip install pytesseract
!pip install datasets
!pip install evaluate
!pip install opencv-python
!pip install pillow
!pip install pandas

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 37 not upgraded.
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.13
Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


Loading dataset named Cord from hugging face

**Clustering Module** for table reconstruction

In [None]:
from datasets import load_dataset

ds_invoice = load_dataset("naver-clova-ix/cord-v2")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
import numpy as np
from sklearn.cluster import DBSCAN

def cluster_rows(words, image_height):
    """
    words: list of dicts with keys:
           text, x_center, y_center, bbox
    image_height: height of invoice image

    Returns:
        list of row clusters (each cluster = list of words)
    """

    if len(words) == 0:
        return []

    # Extract Y centers
    y_coords = np.array([[w["y_center"]] for w in words])

    # Dynamic eps based on image height
    eps = image_height * 0.015  # 1.5% of height (tunable)

    clustering = DBSCAN(eps=eps, min_samples=1).fit(y_coords)

    labels = clustering.labels_

    row_clusters = {}

    for label, word in zip(labels, words):
        if label not in row_clusters:
            row_clusters[label] = []
        row_clusters[label].append(word)

    # Sort rows top-to-bottom
    sorted_rows = sorted(
        row_clusters.values(),
        key=lambda cluster: np.mean([w["y_center"] for w in cluster])
    )

    return sorted_rows

**Word Standardization Layer**

In [None]:
def standardize_ocr_output(ocr_result):

    words = []

    for item in ocr_result:
        x_min, y_min, x_max, y_max = item["bbox"]

        words.append({
            "text": item["text"],
            "x_center": (x_min + x_max) / 2,
            "y_center": (y_min + y_max) / 2,
            "bbox": [x_min, y_min, x_max, y_max]
        })

    return words

**Tesseract Wrapper**

In [None]:
import pytesseract
import cv2
import numpy as np

def tesseract_ocr(image):

    # Convert to RGB if needed
    if isinstance(image, np.ndarray):
        img = image
    else:
        img = np.array(image)

    data = pytesseract.image_to_data(
        img,
        output_type=pytesseract.Output.DICT
    )

    words = []

    n_boxes = len(data['text'])

    for i in range(n_boxes):

        text = data['text'][i].strip()

        if text == "":
            continue

        x = data['left'][i]
        y = data['top'][i]
        w = data['width'][i]
        h = data['height'][i]

        words.append({
            "text": text,
            "bbox": [x, y, x+w, y+h]
        })

    return words

**EasyOCR Wrapper**

In [None]:
!pip install easyocr

In [None]:
import easyocr

reader = easyocr.Reader(['en'], gpu=True)

def easyocr_ocr(image):

    if not isinstance(image, np.ndarray):
        image = np.array(image)

    results = reader.readtext(image)

    words = []

    for (bbox, text, conf) in results:

        x_coords = [pt[0] for pt in bbox]
        y_coords = [pt[1] for pt in bbox]

        x_min = int(min(x_coords))
        y_min = int(min(y_coords))
        x_max = int(max(x_coords))
        y_max = int(max(y_coords))

        words.append({
            "text": text,
            "bbox": [x_min, y_min, x_max, y_max]
        })

    return words

**PaddleOCR Wrapper**

In [None]:
!pip install paddlepaddle-gpu


In [None]:
# Force-uninstall the broken numpy
!pip uninstall numpy -y

# Install the packages but strictly lock numpy at 1.26.4 simultaneously
!pip install numpy==1.26.4 paddleocr==2.8.1 evaluate doctr

In [None]:
!pip list | grep -E "numpy|paddle|evaluate"

evaluate                                 0.4.6
numpy                                    1.26.4
paddleocr                                2.8.1
paddlepaddle-gpu                         2.6.2


In [None]:
# Uninstall langchain to bypass the PaddleX import bug
# This will NOT touch your numpy or paddlepaddle installations!
!pip uninstall langchain langchain-community langchain-core -y

In [None]:
import numpy as np

# 1. The Monkey Patch: Recreate the missing np.sctypes dictionary for imgaug
if not hasattr(np, 'sctypes'):
    np.sctypes = {
        'int': [np.int8, np.int16, np.int32, np.int64],
        'uint': [np.uint8, np.uint16, np.uint32, np.uint64],
        'float': [np.float16, np.float32, np.float64],
        'complex': [np.complex64, np.complex128],
        'others': [bool, object, bytes, str, np.void]
    }

# 2. Now import PaddleOCR (it will load successfully because the crash is bypassed!)
from paddleocr import PaddleOCR

# 3. Initialize your model
paddle_model = PaddleOCR(
    use_angle_cls=True,
    lang='en',
    use_gpu=True
)

print("PaddleOCR successfully loaded!")

def paddle_ocr(image):

    if not isinstance(image, np.ndarray):
        image = np.array(image)

    result = paddle_model.ocr(image, cls=True)

    words = []

    for line in result:
        for word_info in line:

            box = word_info[0]
            text = word_info[1][0]

            x_coords = [pt[0] for pt in box]
            y_coords = [pt[1] for pt in box]

            x_min = int(min(x_coords))
            y_min = int(min(y_coords))
            x_max = int(max(x_coords))
            y_max = int(max(y_coords))

            words.append({
                "text": text,
                "bbox": [x_min, y_min, x_max, y_max]
            })

    return words

**DocTR Wrapper**

In [None]:
!pip install python-doctr[torch]

In [None]:
from doctr.io import DocumentFile
from doctr.models import ocr_predictor

doctr_model = ocr_predictor(pretrained=True)

def doctr_ocr(image):

    if not isinstance(image, np.ndarray):
        image = np.array(image)

    result = doctr_model([image])

    words = []

    for page in result.pages:
        for block in page.blocks:
            for line in block.lines:
                for word in line.words:

                    x_min, y_min = word.geometry[0]
                    x_max, y_max = word.geometry[1]

                    h, w = image.shape[:2]

                    words.append({
                        "text": word.value,
                        "bbox": [
                            int(x_min * w),
                            int(y_min * h),
                            int(x_max * w),
                            int(y_max * h)
                        ]
                    })

    return words

Downloading https://doctr-static.mindee.com/models?id=v0.8.1/fast_base-688a8b34.pt&src=0 to /root/.cache/doctr/models/fast_base-688a8b34.pt


  0%|          | 0/65814772 [00:00<?, ?it/s]

Downloading https://doctr-static.mindee.com/models?id=v0.12.0/crnn_vgg16_bn-0417f351.pt&src=0 to /root/.cache/doctr/models/crnn_vgg16_bn-0417f351.pt


  0%|          | 0/63303144 [00:00<?, ?it/s]

**Ground Truth Text Builder**

In [None]:
import json

def build_gt_text(sample):

    gt_outer = json.loads(sample["ground_truth"])
    parsed = gt_outer["gt_parse"]

    lines = []

    menu = parsed.get("menu", [])

    # Case 1: menu is list (normal case)
    if isinstance(menu, list):
        for item in menu:
            if isinstance(item, dict):
                cnt = item.get("cnt", "")
                name = item.get("nm", "")
                price = item.get("price", "")
                lines.append(f"{cnt} {name} {price}")

    # Case 2: menu is single dict (edge case)
    elif isinstance(menu, dict):
        cnt = menu.get("cnt", "")
        name = menu.get("nm", "")
        price = menu.get("price", "")
        lines.append(f"{cnt} {name} {price}")

    # Subtotal
    sub = parsed.get("sub_total", {})
    if isinstance(sub, dict):
        for value in sub.values():
            lines.append(str(value))

    # Total
    total = parsed.get("total", {})
    if isinstance(total, dict) and "total_price" in total:
        lines.append(str(total["total_price"]))

    return " ".join(lines)

**Universal Evalaution Runner**

In [None]:
!pip install jiwer

In [None]:
import evaluate
import time
import pandas as pd

cer_metric = evaluate.load("cer")
wer_metric = evaluate.load("wer")

def run_ocr_benchmark(ds, ocr_function, model_name, samples=50):

    results = []

    for i in range(samples):

        sample = ds["train"][i]
        image = sample["image"]
        gt_text = build_gt_text(sample)

        start_time = time.time()

        words = ocr_function(image)

        if isinstance(words, str):
           ocr_text = words
        else:
           ocr_text = " ".join([w["text"] for w in words])

        end_time = time.time()

        cer = cer_metric.compute(
            predictions=[ocr_text],
            references=[gt_text]
        )

        wer = wer_metric.compute(
            predictions=[ocr_text],
            references=[gt_text]
        )

        results.append({
            "CER": cer,
            "WER": wer,
            "Time": end_time - start_time
        })

    df = pd.DataFrame(results)

    print(f"\n===== {model_name} Results =====")
    print("Mean CER:", df["CER"].mean())
    print("Mean WER:", df["WER"].mean())
    print("Avg Time:", df["Time"].mean())

    return df

**Tesseract**

In [None]:
for i in range(5):
    print(type(json.loads(ds_invoice["train"][i]["ground_truth"])["gt_parse"]["menu"]))

In [None]:
df_tess = run_ocr_benchmark(ds_invoice, tesseract_ocr, "Tesseract")


===== Tesseract Results =====
Mean CER: 1.2425900779068786
Mean WER: 1.663429996573475
Avg Time: 0.9068947362899781


**EasyOCR**

In [None]:
df_easy = run_ocr_benchmark(ds_invoice, easyocr_ocr, "EasyOCR")


===== EasyOCR Results =====
Mean CER: 1.1637943945368556
Mean WER: 1.7482964184519856
Avg Time: 0.5298595905303956


In [None]:
df_paddle = run_ocr_benchmark(ds_invoice, paddle_ocr, "PaddleOCR")

[2026/02/24 12:37:59] ppocr DEBUG: dt_boxes num : 80, elapsed : 0.8221826553344727
[2026/02/24 12:37:59] ppocr DEBUG: cls num  : 80, elapsed : 0.2342057228088379
[2026/02/24 12:38:00] ppocr DEBUG: rec_res num  : 80, elapsed : 0.31259799003601074
[2026/02/24 12:38:00] ppocr DEBUG: dt_boxes num : 29, elapsed : 0.04682731628417969
[2026/02/24 12:38:00] ppocr DEBUG: cls num  : 29, elapsed : 0.08996272087097168
[2026/02/24 12:38:00] ppocr DEBUG: rec_res num  : 29, elapsed : 0.1291031837463379
[2026/02/24 12:38:00] ppocr DEBUG: dt_boxes num : 24, elapsed : 0.0569760799407959
[2026/02/24 12:38:00] ppocr DEBUG: cls num  : 24, elapsed : 0.06496500968933105
[2026/02/24 12:38:00] ppocr DEBUG: rec_res num  : 24, elapsed : 0.05875110626220703
[2026/02/24 12:38:00] ppocr DEBUG: dt_boxes num : 20, elapsed : 0.06055402755737305
[2026/02/24 12:38:00] ppocr DEBUG: cls num  : 20, elapsed : 0.03250241279602051
[2026/02/24 12:38:00] ppocr DEBUG: rec_res num  : 20, elapsed : 0.05735635757446289
[2026/02/24 

**PaddleOcr**

**Doctr**

In [None]:
df_doctr = run_ocr_benchmark(ds_invoice, doctr_ocr, "DocTR")


===== DocTR Results =====
Mean CER: 1.193762271922286
Mean WER: 1.407587275966102
Avg Time: 2.0541904258728025


**Paddle Ocr(detection) with TrOcr(recognition)**

In [None]:
!pip install transformers timm sentencepiece -q

In [None]:
import torch
from transformers import TrOCRProcessor, VisionEncoderDecoderModel

device = "cuda" if torch.cuda.is_available() else "cpu"

processor = TrOCRProcessor.from_pretrained(
    "microsoft/trocr-base-printed"
)

model_trocr = VisionEncoderDecoderModel.from_pretrained(
    "microsoft/trocr-base-printed"
).to(device)

model_trocr.eval()

In [None]:
from paddleocr import PaddleOCR

ocr_det = PaddleOCR(
    use_angle_cls=True,
    lang="en",
    rec=False,         # ❗ Detection only
    use_gpu=True
)

**Hybrid OCR Function**

In [None]:
import numpy as np
import cv2

def hybrid_ocr(image):

    if not isinstance(image, np.ndarray):
        image = np.array(image)

    result = ocr_det.ocr(image, cls=True)

    if result is None:
        return ""

    boxes = []
    for line in result:
        for word_info in line:
            boxes.append(word_info[0])

    # Sort top-to-bottom, left-to-right
    boxes = sorted(boxes, key=lambda x: (x[0][1], x[0][0]))

    recognized_words = []

    for box in boxes:

        pts = np.array(box).astype(int)

        x_min = max(0, min(pts[:,0]))
        y_min = max(0, min(pts[:,1]))
        x_max = min(image.shape[1], max(pts[:,0]))
        y_max = min(image.shape[0], max(pts[:,1]))

        crop = image[y_min:y_max, x_min:x_max]

        if crop.size == 0:
            continue

        # Resize for better transformer recognition
        crop = cv2.resize(crop, None, fx=2, fy=2)

        pixel_values = processor(
            images=crop,
            return_tensors="pt"
        ).pixel_values.to(device)

        with torch.no_grad():
            generated_ids = model_trocr.generate(pixel_values)

        text = processor.batch_decode(
            generated_ids,
            skip_special_tokens=True
        )[0]

        recognized_words.append(text)

    return " ".join(recognized_words)

In [None]:
df_hybrid = run_ocr_benchmark(ds_invoice, hybrid_ocr, "Hybrid")

[2026/02/24 12:54:54] ppocr DEBUG: dt_boxes num : 80, elapsed : 0.06696462631225586
[2026/02/24 12:54:54] ppocr DEBUG: cls num  : 80, elapsed : 0.15387940406799316
[2026/02/24 12:54:54] ppocr DEBUG: rec_res num  : 80, elapsed : 0.2423720359802246
[2026/02/24 12:55:02] ppocr DEBUG: dt_boxes num : 29, elapsed : 0.04482865333557129
[2026/02/24 12:55:02] ppocr DEBUG: cls num  : 29, elapsed : 0.04314899444580078
[2026/02/24 12:55:02] ppocr DEBUG: rec_res num  : 29, elapsed : 0.07915234565734863
[2026/02/24 12:55:05] ppocr DEBUG: dt_boxes num : 24, elapsed : 0.04288029670715332
[2026/02/24 12:55:05] ppocr DEBUG: cls num  : 24, elapsed : 0.06281566619873047
[2026/02/24 12:55:05] ppocr DEBUG: rec_res num  : 24, elapsed : 0.05519843101501465
[2026/02/24 12:55:08] ppocr DEBUG: dt_boxes num : 20, elapsed : 0.04532909393310547
[2026/02/24 12:55:08] ppocr DEBUG: cls num  : 20, elapsed : 0.034276723861694336
[2026/02/24 12:55:08] ppocr DEBUG: rec_res num  : 20, elapsed : 0.0469670295715332
[2026/02/

**Donut**

In [None]:
!pip install transformers sentencepiece timm -q

Loading model

In [None]:
import torch
from transformers import DonutProcessor, VisionEncoderDecoderModel

device = "cuda" if torch.cuda.is_available() else "cpu"

processor = DonutProcessor.from_pretrained(
    "naver-clova-ix/donut-base-finetuned-cord-v2"
)

model_donut = VisionEncoderDecoderModel.from_pretrained(
    "naver-clova-ix/donut-base-finetuned-cord-v2"
).to(device)

model_donut.eval()

Donut Ocr function that returns structured json

In [None]:
import json

def donut_inference(image):

    if image.mode != "RGB":
        image = image.convert("RGB")

    # Prepare image
    pixel_values = processor(image, return_tensors="pt").pixel_values.to(device)

    # Prompt for CORD task
    task_prompt = "<s_cord-v2>"
    decoder_input_ids = processor.tokenizer(
        task_prompt,
        add_special_tokens=False,
        return_tensors="pt"
    ).input_ids.to(device)

    with torch.no_grad():
        outputs = model_donut.generate(
            pixel_values,
            decoder_input_ids=decoder_input_ids,
            max_length=512,
            early_stopping=True,
            pad_token_id=processor.tokenizer.pad_token_id,
            eos_token_id=processor.tokenizer.eos_token_id,
            use_cache=True,
        )

    seq = processor.batch_decode(outputs, skip_special_tokens=True)[0]

    # Convert to JSON
    try:
        result = processor.token2json(seq)
    except:
        result = {}

    return result

Evalaution function for Donut

In [None]:
import json

def evaluate_donut(ds, samples=50):

    total_correct = 0
    item_count_correct = 0

    for i in range(samples):

        sample = ds["train"][i]
        image = sample["image"]

        gt = json.loads(sample["ground_truth"])
        gt_parse = gt["gt_parse"]

        pred = donut_inference(image)

        # ---- FIXED HERE ----
        pred_parse = pred

        # ---- Total Accuracy ----
        gt_total = gt_parse.get("total", {}).get("total_price", "")
        pred_total = pred_parse.get("total", {}).get("total_price", "")

        total_match = int(gt_total == pred_total)
        total_correct += total_match

        # ---- Item Count Accuracy ----
        gt_items = len(gt_parse.get("menu", []))
        pred_items = len(pred_parse.get("menu", []))

        item_match = int(gt_items == pred_items)
        item_count_correct += item_match

    print("Total Accuracy %:", total_correct/samples*100)
    print("Item Count Accuracy %:", item_count_correct/samples*100)

In [None]:
evaluate_donut(ds_invoice, samples=50)

Total Accuracy %: 100.0
Item Count Accuracy %: 100.0


**Testing robustness of Donut at diffrent noisy image**

Image Perturbation Function

In [None]:
import numpy as np
import cv2
from PIL import Image

def perturb_image(image, mode="noise"):

    img = np.array(image)

    if mode == "noise":
        noise = np.random.normal(0, 25, img.shape).astype(np.uint8)
        img = cv2.add(img, noise)

    elif mode == "blur":
        img = cv2.GaussianBlur(img, (7,7), 0)

    elif mode == "downscale":
        h, w = img.shape[:2]
        img = cv2.resize(img, (w//2, h//2))
        img = cv2.resize(img, (w, h))

    elif mode == "rotate":
        h, w = img.shape[:2]
        M = cv2.getRotationMatrix2D((w//2, h//2), 5, 1)
        img = cv2.warpAffine(img, M, (w, h))

    elif mode == "brightness":
        img = cv2.convertScaleAbs(img, alpha=1.2, beta=40)

    return Image.fromarray(img)

> Robustness Evaluation

In [None]:
def evaluate_donut_robust(ds, mode="noise", samples=50):

    total_correct = 0
    item_count_correct = 0

    for i in range(samples):

        sample = ds["train"][i]
        image = sample["image"]

        # Apply perturbation
        image = perturb_image(image, mode)

        gt = json.loads(sample["ground_truth"])
        gt_parse = gt["gt_parse"]

        pred = donut_inference(image)
        pred_parse = pred

        gt_total = gt_parse.get("total", {}).get("total_price", "")
        pred_total = pred_parse.get("total", {}).get("total_price", "")

        total_correct += int(gt_total == pred_total)

        gt_items = len(gt_parse.get("menu", []))
        pred_items = len(pred_parse.get("menu", []))

        item_count_correct += int(gt_items == pred_items)

    print(f"=== Mode: {mode} ===")
    print("Total Accuracy %:", total_correct/samples*100)
    print("Item Count Accuracy %:", item_count_correct/samples*100)

In [None]:
evaluate_donut_robust(ds_invoice, mode="noise")
evaluate_donut_robust(ds_invoice, mode="blur")
evaluate_donut_robust(ds_invoice, mode="downscale")
evaluate_donut_robust(ds_invoice, mode="rotate")
evaluate_donut_robust(ds_invoice, mode="brightness")

The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


=== Mode: noise ===
Total Accuracy %: 34.0
Item Count Accuracy %: 40.0
=== Mode: blur ===
Total Accuracy %: 88.0
Item Count Accuracy %: 88.0
=== Mode: downscale ===
Total Accuracy %: 86.0
Item Count Accuracy %: 94.0
=== Mode: rotate ===
Total Accuracy %: 100.0
Item Count Accuracy %: 100.0
=== Mode: brightness ===
Total Accuracy %: 98.0
Item Count Accuracy %: 98.0


**Reconstruct text and then we check accuracy on diffrent matrix**

Reconstruction Function#

In [None]:
import re

def reconstruct_invoice_from_text(text):

    lines = text.split("\n")
    menu = []
    total_price = ""

    for line in lines:

        # Try menu row
        match = re.search(r'(\d+)\s*x\s*(.*?)\s+([\d,.-]+)', line)
        if match:
            cnt = match.group(1) + " x"
            name = match.group(2).strip()
            price = match.group(3)
            menu.append({
                "cnt": cnt,
                "nm": name,
                "price": price
            })

        # Try total detection
        if "total" in line.lower():
            price_match = re.search(r'([\d,.-]+)', line)
            if price_match:
                total_price = price_match.group(1)

    return {
        "menu": menu,
        "total": {"total_price": total_price}
    }

* Structured Evaluation for Any OCR

In [None]:
def evaluate_reconstruction(ds, ocr_function, samples=50):

    total_correct = 0
    item_count_correct = 0

    for i in range(samples):

        sample = ds["train"][i]
        image = sample["image"]

        gt = json.loads(sample["ground_truth"])
        gt_parse = gt["gt_parse"]

        raw_output = ocr_function(image)

        # ---- UNIVERSAL NORMALIZATION ----
        if isinstance(raw_output, str):
            ocr_text = raw_output

        elif isinstance(raw_output, list):
            # case: list of dicts
            if len(raw_output) > 0 and isinstance(raw_output[0], dict):
                ocr_text = " ".join([w.get("text", "") for w in raw_output])
            else:
                ocr_text = " ".join([str(w) for w in raw_output])

        else:
            ocr_text = str(raw_output)

        pred_parse = reconstruct_invoice_from_text(ocr_text)

        # ---- Total Accuracy ----
        gt_total = gt_parse.get("total", {}).get("total_price", "")
        pred_total = pred_parse.get("total", {}).get("total_price", "")

        total_correct += int(gt_total == pred_total)

        # ---- Item Count Accuracy ----
        gt_items = len(gt_parse.get("menu", []))
        pred_items = len(pred_parse.get("menu", []))

        item_count_correct += int(gt_items == pred_items)

    print("Total Accuracy %:", total_correct/samples*100)
    print("Item Count Accuracy %:", item_count_correct/samples*100)

In [None]:
import torch
import gc

torch.cuda.empty_cache()
gc.collect()

8

In [None]:
import json

In [None]:
evaluate_reconstruction(ds_invoice, paddle_ocr)


[2026/02/25 04:29:54] ppocr DEBUG: dt_boxes num : 80, elapsed : 1.3662643432617188
[2026/02/25 04:29:55] ppocr DEBUG: cls num  : 80, elapsed : 0.22235393524169922
[2026/02/25 04:29:55] ppocr DEBUG: rec_res num  : 80, elapsed : 0.32889890670776367
[2026/02/25 04:29:55] ppocr DEBUG: dt_boxes num : 29, elapsed : 0.04947257041931152
[2026/02/25 04:29:55] ppocr DEBUG: cls num  : 29, elapsed : 0.09712028503417969
[2026/02/25 04:29:55] ppocr DEBUG: rec_res num  : 29, elapsed : 0.13600683212280273
[2026/02/25 04:29:55] ppocr DEBUG: dt_boxes num : 24, elapsed : 0.05992603302001953
[2026/02/25 04:29:56] ppocr DEBUG: cls num  : 24, elapsed : 0.06924867630004883
[2026/02/25 04:29:56] ppocr DEBUG: rec_res num  : 24, elapsed : 0.0584104061126709
[2026/02/25 04:29:56] ppocr DEBUG: dt_boxes num : 20, elapsed : 0.06280922889709473
[2026/02/25 04:29:56] ppocr DEBUG: cls num  : 20, elapsed : 0.034729957580566406
[2026/02/25 04:29:56] ppocr DEBUG: rec_res num  : 20, elapsed : 0.057021379470825195
[2026/02

In [None]:
evaluate_reconstruction(ds_invoice, tesseract_ocr)

Total Accuracy %: 4.0
Item Count Accuracy %: 0.0


In [None]:
evaluate_reconstruction(ds_invoice, easyocr_ocr)

Total Accuracy %: 2.0
Item Count Accuracy %: 0.0


In [None]:
evaluate_reconstruction(ds_invoice, doctr_ocr)

Total Accuracy %: 2.0
Item Count Accuracy %: 0.0


In [None]:


evaluate_reconstruction(ds_invoice, hybrid_ocr)

[2026/02/25 05:00:49] ppocr DEBUG: dt_boxes num : 80, elapsed : 0.06450271606445312
[2026/02/25 05:00:49] ppocr DEBUG: cls num  : 80, elapsed : 0.17209267616271973
[2026/02/25 05:00:49] ppocr DEBUG: rec_res num  : 80, elapsed : 0.25741004943847656
[2026/02/25 05:00:58] ppocr DEBUG: dt_boxes num : 29, elapsed : 0.04198718070983887
[2026/02/25 05:00:58] ppocr DEBUG: cls num  : 29, elapsed : 0.03947925567626953
[2026/02/25 05:00:58] ppocr DEBUG: rec_res num  : 29, elapsed : 0.0767512321472168
[2026/02/25 05:01:01] ppocr DEBUG: dt_boxes num : 24, elapsed : 0.03599214553833008
[2026/02/25 05:01:01] ppocr DEBUG: cls num  : 24, elapsed : 0.05639934539794922
[2026/02/25 05:01:01] ppocr DEBUG: rec_res num  : 24, elapsed : 0.054166316986083984
[2026/02/25 05:01:04] ppocr DEBUG: dt_boxes num : 20, elapsed : 0.040987491607666016
[2026/02/25 05:01:04] ppocr DEBUG: cls num  : 20, elapsed : 0.032521724700927734
[2026/02/25 05:01:04] ppocr DEBUG: rec_res num  : 20, elapsed : 0.046328067779541016
[2026

**Function For getting structured output form paddleOcr**

In [None]:
def paddle_ocr_structured(image):
    # Fix: Convert the PIL Image from the dataset into a NumPy array
    if not isinstance(image, np.ndarray):
        image = np.array(image)

    result = paddle_model.ocr(image, cls=True)

    words = []

    for line in result:
        for word_info in line:
            box = word_info[0]
            text = word_info[1][0]

            x_min = min([pt[0] for pt in box])
            y_min = min([pt[1] for pt in box])

            words.append({
                "text": text,
                "x": x_min,
                "y": y_min
            })

    return words

**Row Clustering **

In [None]:
#clustring using y cordinate proximity
def cluster_rows(words):

    words = sorted(words, key=lambda w: w["y"])

    rows = []
    current_row = []

    if not words:
        return rows

    # Estimate average vertical spacing
    ys = sorted([w["y"] for w in words])
    avg_gap = sum(abs(ys[i] - ys[i-1]) for i in range(1, len(ys))) / max(1, len(ys)-1)

    y_threshold = avg_gap * 1.2  # adaptive

    for word in words:

        if not current_row:
            current_row.append(word)
            continue

        if abs(word["y"] - current_row[-1]["y"]) < y_threshold:
            current_row.append(word)
        else:
            rows.append(current_row)
            current_row = [word]

    if current_row:
        rows.append(current_row)

    return rows

**Reconstruction Function From  Spatial **

In [None]:
def reconstruct_from_spatial(words):

    if not words:
        return {"menu": [], "total": {"total_price": ""}}

    rows = cluster_rows(words)

    menu = []
    total_price = ""

    # Collect all X positions
    x_positions = sorted([w["x"] for w in words])
    max_x = max(x_positions)

    # Estimate price column dynamically (rightmost 20%)
    price_threshold = max_x * 0.6   # more relaxed

    for row in rows:

        row = sorted(row, key=lambda w: w["x"])

        qty = ""
        price = ""
        name_tokens = []

        for w in row:
            text = w["text"]

            # Detect quantity (digit near left)
            if text.isdigit() and w["x"] < max_x * 0.2:
                qty = text

            # Detect price (number containing comma or digit cluster near right)
            elif any(c.isdigit() for c in text) and w["x"] > price_threshold:
                price = text

            else:
                name_tokens.append(text)

        line_text = " ".join([w["text"] for w in row]).lower()

        # Detect total row
        if "total" in line_text and price:
            total_price = price
            continue

        # Detect menu row
        if price and name_tokens:
            if not qty:
                qty = "1"
            menu.append({
                "cnt": qty + " x",
                "nm": " ".join(name_tokens),
                "price": price
            })

    return {
        "menu": menu,
        "total": {"total_price": total_price}
    }

**Evalaution function for spatial construction from Qwen LLm**

In [None]:
def evaluate_spatial_paddle(ds, samples=50):

    total_correct = 0
    item_count_correct = 0

    for i in range(samples):

        sample = ds["train"][i]
        image = sample["image"]

        gt = json.loads(sample["ground_truth"])
        gt_parse = gt["gt_parse"]

        words = paddle_ocr_structured(image)
        pred_parse = reconstruct_from_spatial(words)

        gt_total = gt_parse.get("total", {}).get("total_price", "")
        pred_total = pred_parse.get("total", {}).get("total_price", "")

        total_correct += int(gt_total == pred_total)

        gt_items = len(gt_parse.get("menu", []))
        pred_items = len(pred_parse.get("menu", []))

        item_count_correct += int(gt_items == pred_items)

    print("Total Accuracy %:", total_correct/samples*100)
    print("Item Count Accuracy %:", item_count_correct/samples*100)

In [None]:
evaluate_spatial_paddle(ds_invoice, samples=50)

[2026/02/25 06:10:08] ppocr DEBUG: dt_boxes num : 80, elapsed : 0.05966067314147949
[2026/02/25 06:10:09] ppocr DEBUG: cls num  : 80, elapsed : 0.11289453506469727
[2026/02/25 06:10:09] ppocr DEBUG: rec_res num  : 80, elapsed : 0.23305630683898926
[2026/02/25 06:10:09] ppocr DEBUG: dt_boxes num : 29, elapsed : 0.03991222381591797
[2026/02/25 06:10:09] ppocr DEBUG: cls num  : 29, elapsed : 0.04390525817871094
[2026/02/25 06:10:09] ppocr DEBUG: rec_res num  : 29, elapsed : 0.07392764091491699
[2026/02/25 06:10:09] ppocr DEBUG: dt_boxes num : 24, elapsed : 0.03527402877807617
[2026/02/25 06:10:09] ppocr DEBUG: cls num  : 24, elapsed : 0.03273439407348633
[2026/02/25 06:10:09] ppocr DEBUG: rec_res num  : 24, elapsed : 0.05725502967834473
[2026/02/25 06:10:09] ppocr DEBUG: dt_boxes num : 20, elapsed : 0.0383145809173584
[2026/02/25 06:10:09] ppocr DEBUG: cls num  : 20, elapsed : 0.03424429893493652
[2026/02/25 06:10:09] ppocr DEBUG: rec_res num  : 20, elapsed : 0.049867868423461914
[2026/02

** Florence-2 Finetunning **

In [None]:
import os
import json
import torch
from torch.utils.data import Dataset
from transformers import (
    AutoProcessor,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer
)

# ---------------------------------------------------
# Optional: helps debugging CUDA crashes
# Remove after everything works
# ---------------------------------------------------
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

print("1. Initializing Processor...")

processor = AutoProcessor.from_pretrained(
    "microsoft/Florence-2-base",
    trust_remote_code=True
)

# ---------------------------------------------------
# DATASET
# ---------------------------------------------------
class RobustInvoiceDataset(Dataset):
    def __init__(self, hf_dataset, processor, split="train"):
        self.dataset = hf_dataset[split]
        self.processor = processor

        # IMPORTANT:
        # Florence expects task tokens.
        # Do NOT use free-form natural text here.
        self.task_prompt = "<ocr>"

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        sample = self.dataset[idx]

        image = sample["image"].convert("RGB")

        gt_json = json.loads(sample["ground_truth"])

        target_text = json.dumps({
            "menu": gt_json["gt_parse"].get("menu", []),
            "total": gt_json["gt_parse"].get("total", {})
        })

        # ---------------------------------------------------
        # DO NOT force padding or truncation here
        # Let collate_fn handle padding dynamically
        # ---------------------------------------------------
        inputs = self.processor(
            text=self.task_prompt,
            images=image,
            return_tensors="pt"
        )

        labels = processor.tokenizer(
            target_text,
            return_tensors="pt"
        )

        return {
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
            "pixel_values": inputs["pixel_values"].squeeze(0),
            "labels": labels["input_ids"].squeeze(0)
        }


train_dataset = RobustInvoiceDataset(ds_invoice, processor, split="train")
print(f"Factory initialized! Ready to process {len(train_dataset)} invoices.")

# ---------------------------------------------------
# MODEL
# ---------------------------------------------------
print("2. Loading the Florence-2 Brain...")

model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Florence-2-base",
    trust_remote_code=True
)

# DO NOT resize embeddings
# DO NOT modify vocab
# Florence already matches tokenizer internally

# ---------------------------------------------------
# Sanity Checks
# ---------------------------------------------------
print("Tokenizer vocab size:", processor.tokenizer.vocab_size)
print("Model vocab size:", model.config.vocab_size)

# ---------------------------------------------------
# COLLATE FUNCTION (Dynamic Padding)
# ---------------------------------------------------
def custom_collate_fn(batch):

    input_ids = torch.nn.utils.rnn.pad_sequence(
        [item["input_ids"] for item in batch],
        batch_first=True,
        padding_value=processor.tokenizer.pad_token_id
    )

    attention_mask = torch.nn.utils.rnn.pad_sequence(
        [item["attention_mask"] for item in batch],
        batch_first=True,
        padding_value=0
    )

    pixel_values = torch.stack([item["pixel_values"] for item in batch])

    labels = torch.nn.utils.rnn.pad_sequence(
        [item["labels"] for item in batch],
        batch_first=True,
        padding_value=processor.tokenizer.pad_token_id
    )

    # Ignore padding tokens in loss
    labels[labels == processor.tokenizer.pad_token_id] = -100

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "pixel_values": pixel_values,
        "labels": labels
    }


print("3. Configuring the Kaggle Cockpit...")

training_args = TrainingArguments(
    output_dir="/kaggle/working/florence2-invoice-finetuned",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=1e-5,
    num_train_epochs=3,
    fp16=True,
    logging_steps=10,
    save_strategy="epoch",
    remove_unused_columns=False,  # REQUIRED for multimodal models
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=custom_collate_fn,
)

print("\n🚀 IGNITION: Starting the fine-tuning process!")
trainer.train()

print("\nTraining Complete! Saving final weights...")
trainer.save_model("/kaggle/working/best_florence2_model")
print("Model saved successfully. Pipeline finished.")

1. Initializing Processor...
Factory initialized! Ready to process 800 invoices.
2. Loading the Florence-2 Brain...
Tokenizer vocab size: 50265
Model vocab size: 51289
3. Configuring the Kaggle Cockpit...

🚀 IGNITION: Starting the fine-tuning process!


Step,Training Loss
10,3.1049
20,1.4785
30,0.8408
40,0.646
50,0.4351
60,0.4441
70,0.3515
80,0.3665
90,0.2619
100,0.2577



Training Complete! Saving final weights...
Model saved successfully. Pipeline finished.


In [None]:
import json
import torch
import difflib
import re
from tqdm import tqdm
from transformers import AutoProcessor, AutoModelForCausalLM

# ==========================================
# 1. SETUP & HELPER FUNCTIONS
# ==========================================

print("1. Loading the Custom Brain...")
processor = AutoProcessor.from_pretrained("microsoft/Florence-2-base", trust_remote_code=True)
model_path = "/kaggle/working/best_florence2_model"
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# The exact list of coordinate/metadata keys we MUST ignore
IGNORE_KEYS = {
    'version', 'split', 'image_id', 'width', 'height',
    'x1', 'x2', 'x3', 'x4', 'y1', 'y2', 'y3', 'y4',
    'is_key', 'row_id', 'text', 'category', 'group_id', 'sub_group_id'
}

def get_similarity(str1, str2):
    """Returns a similarity score between 0 and 1 for two strings."""
    return difflib.SequenceMatcher(None, str(str1).lower(), str(str2).lower()).ratio()

def flatten_ground_truth(data):
    """
    Recursively extracts all string key-value pairs from nested ground truth,
    while explicitly filtering out OCR bounding boxes and metadata.
    """
    items = []
    if isinstance(data, dict):
        for k, v in data.items():
            if k in IGNORE_KEYS:
                continue # Skip the junk data!

            if isinstance(v, (dict, list)):
                items.extend(flatten_ground_truth(v))
            else:
                items.append((str(k).strip(), str(v).strip()))
    elif isinstance(data, list):
        for item in data:
            items.extend(flatten_ground_truth(item))
    return items

def salvage_predictions(text):
    """
    Uses regex to aggressively scrape key-value pairs from broken JSON strings.
    Handles keys with spaces, parentheses, and symbols.
    """
    matches = []

    # Match quoted string pairs: "Any Key" : "Any Value"
    matches.extend(re.findall(r'"([^"]+)"\s*:\s*"([^"]*)"', text))

    # Match unquoted number pairs: "Any Key" : 1234.50
    matches.extend(re.findall(r'"([^"]+)"\s*:\s*([0-9\.\,-]+)(?=[,\}\s])', text))

    # Clean up whitespace and ignore empty keys
    clean_matches = [(k.strip(), v.strip()) for k, v in matches if k.strip()]
    return clean_matches

def evaluate_flat_lists(pred_list, true_list, threshold=0.85):
    """Compares the salvaged predictions against ground truth, handling duplicates gracefully."""
    tp, fp, fn = 0, 0, 0
    pred_pool = pred_list.copy()

    for true_k, true_v in true_list:
        best_match_idx = -1
        best_sim = 0

        # Find the best matching prediction for this specific true key
        for i, (pred_k, pred_v) in enumerate(pred_pool):
            # Relaxed key matching: case-insensitive
            if true_k.lower() == pred_k.lower():
                sim = get_similarity(pred_v, true_v)
                if sim > best_sim:
                    best_sim = sim
                    best_match_idx = i

        # If we found a good fuzzy match, count as True Positive
        if best_sim >= threshold:
            tp += 1
            pred_pool.pop(best_match_idx) # Remove to avoid double counting
        else:
            fn += 1 # We missed this ground truth field

    # Anything left in the prediction pool is considered a hallucinated extra field
    fp = len(pred_pool)
    return tp, fp, fn

# ==========================================
# 2. THE EVALUATION LOOP
# ==========================================

print("\n2. Starting Flat-Entity Bulk Evaluation (Cleaned Ground Truth)...")

task_prompt = "Extract Structured Invoice Data"

total_tp, total_fp, total_fn = 0, 0, 0
perfect_documents = 0

# Note: Assumes `ds_invoice` and `perturb_image` are already defined in your notebook
num_samples = min(50, len(ds_invoice["test"]))

for i in tqdm(range(num_samples), desc="Evaluating Florence-2"):
    sample = ds_invoice["test"][i]

    # Prepare image and ground truth
    clean_image = sample["image"].convert("RGB")
    messy_image = perturb_image(clean_image, mode="blur") # Your heavy blur!

    # Prepare Cleaned Ground Truth as a flat list
    gt_data = sample["ground_truth"]
    true_dict = json.loads(gt_data) if isinstance(gt_data, str) else gt_data
    true_list = flatten_ground_truth(true_dict)

    # Run Inference
    inputs = processor(text=task_prompt, images=messy_image, return_tensors="pt").to(device)

    with torch.no_grad():
        generated_ids = model.generate(
            input_ids=inputs["input_ids"],
            pixel_values=inputs["pixel_values"],
            max_new_tokens=1024,
            early_stopping=False,
            do_sample=False,
            num_beams=3,
        )

    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    final_output_string = generated_text.replace(task_prompt, "").strip()

    # Salvage Data using the relaxed Regex
    pred_list = salvage_predictions(final_output_string)

    # Evaluate the flat lists
    tp, fp, fn = evaluate_flat_lists(pred_list, true_list)

    total_tp += tp
    total_fp += fp
    total_fn += fn

    # Check if the document was perfectly parsed
    if fn == 0 and fp == 0 and tp == len(true_list) and len(true_list) > 0:
        perfect_documents += 1

# ==========================================
# 3. CALCULATING FINAL RESEARCH METRICS
# ==========================================

precision = total_tp / (total_tp + total_fp) if (total_tp + total_fp) > 0 else 0
recall = total_tp / (total_tp + total_fn) if (total_tp + total_fn) > 0 else 0
f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

field_accuracy = total_tp / (total_tp + total_fp + total_fn) if (total_tp + total_fp + total_fn) > 0 else 0
document_accuracy = perfect_documents / num_samples if num_samples > 0 else 0

print("\n" + "="*50)
print("📊 FINAL FLAT-ENTITY METRICS (Florence-2)")
print("="*50)
print(f"Total Samples Tested : {num_samples}")
print("-" * 50)
print(f"Precision            : {precision:.4f}")
print(f"Recall               : {recall:.4f}")
print(f"F1 Score             : {f1_score:.4f}")
print("-" * 50)
print(f"Total Field Accuracy : {field_accuracy:.4f}")
print(f"Document Accuracy    : {document_accuracy:.4f}")
print("="*50)

1. Loading the Custom Brain...

2. Starting Flat-Entity Bulk Evaluation (Cleaned Ground Truth)...


Evaluating Florence-2: 100%|██████████| 50/50 [01:10<00:00,  1.41s/it]


📊 FINAL FLAT-ENTITY METRICS (Florence-2)
Total Samples Tested : 50
--------------------------------------------------
Precision            : 0.6159
Recall               : 0.2773
F1 Score             : 0.3824
--------------------------------------------------
Total Field Accuracy : 0.2364
Document Accuracy    : 0.0000





* Conclusion:Evaluation on heavily perturbed invoices reveals that the fine-tuned Florence-2 model adopts a conservative extraction strategy. Rather than hallucinating incorrect values when blinded by severe blur, the model safely drops unreadable fields—resulting in a low recall of 0.2773, but successfully preserving a strong precision of 0.6159 for the data it can successfully resolve.

**Qwen2-VL-7B Local LLm**

In [None]:
!pip install accelerate bitsandbytes accelerate

Collecting bitsandbytes
  Downloading bitsandbytes-0.49.2-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading bitsandbytes-0.49.2-py3-none-manylinux_2_24_x86_64.whl (60.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.7/60.7 MB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.49.2


In [None]:
import torch
import json
import re
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# ==========================================
# 1. LOAD QWEN-2.5-7B IN 4-BIT
# ==========================================
print("Loading Qwen-2.5-7B-Instruct (4-bit)...")
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

model_id = "Qwen/Qwen2.5-7B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# device_map="auto" automatically manages your GPU memory
qwen_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=quantization_config
)
qwen_model.eval()

# ==========================================
# 2. PIPELINE FUNCTIONS
# ==========================================
def serialize_ocr_for_llm(ocr_words):
    """Converts PaddleOCR spatial output into a readable string format for Qwen."""
    # Sort top-to-bottom, then left-to-right (using a y-tolerance of 10 pixels for rows)
    sorted_words = sorted(ocr_words, key=lambda w: (w['y'] // 10, w['x']))

    text_representation = ""
    for w in sorted_words:
        # Pass the spatial coordinates and the text to the LLM
        text_representation += f"[x:{w['x']}, y:{w['y']}] {w['text']}\n"
    return text_representation

def extract_json_with_qwen(ocr_words):
    """Passes serialized OCR to Qwen and forces a strict JSON output."""
    if not ocr_words:
        return {"menu": [], "total": {"total_price": ""}}

    ocr_text = serialize_ocr_for_llm(ocr_words)

    # Strict Prompt Engineering
    system_prompt = """You are an expert Data Extraction AI.
You are given raw OCR text from a receipt along with their [x, y] pixel coordinates.
Reconstruct the menu items and find the final total price.
You MUST respond with ONLY raw, valid JSON. Do not include markdown formatting like ```json.
Schema:
{
  "menu": [
    {"cnt": "quantity", "nm": "item name", "price": "price"}
  ],
  "total": {
    "total_price": "total amount"
  }
}"""

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"Extract data from this OCR output:\n{ocr_text}"}
    ]

    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer([text], return_tensors="pt").to("cuda")

    with torch.no_grad():
        generated_ids = qwen_model.generate(
            **inputs,
            max_new_tokens=512,
            temperature=0.1, # Low temp = highly factual, less creative hallucination
            do_sample=False
        )

    generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, generated_ids)]
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

    # Clean the output (LLMs sometimes ignore the "no markdown" rule)
    clean_response = response.strip()
    if clean_response.startswith("```json"):
        clean_response = clean_response.split("```json")[1]
    if clean_response.startswith("```"):
        clean_response = clean_response.split("```")[1]
    if clean_response.endswith("```"):
        clean_response = clean_response.rsplit("```", 1)[0]
    clean_response = clean_response.strip()

    try:
        return json.loads(clean_response)
    except json.JSONDecodeError as e:
        # Fallback if the LLM output is structurally broken
        return {"menu": [], "total": {"total_price": ""}}

# ==========================================
# 3. EVALUATION LOOP
# ==========================================
def evaluate_phase1_pipeline(ds, samples=50):
    print(f"\nStarting Phase 1 Evaluation (PaddleOCR -> Qwen-2.5-7B) on {samples} samples...")

    total_correct = 0
    item_count_correct = 0

    for i in tqdm(range(samples), desc="Processing Invoices"):
        sample = ds["train"][i]
        image = sample["image"]

        # Ground Truth
        gt = json.loads(sample["ground_truth"])
        gt_parse = gt["gt_parse"]

        # 1. Run PaddleOCR (using your existing function)
        words = paddle_ocr_structured(image)

        # 2. Run Qwen Extraction
        pred_parse = extract_json_with_qwen(words)

        # Metrics: Total Accuracy
        gt_total = str(gt_parse.get("total", {}).get("total_price", "")).strip()
        pred_total = str(pred_parse.get("total", {}).get("total_price", "")).strip()
        if gt_total == pred_total:
            total_correct += 1

        # Metrics: Item Count Accuracy
        gt_items = len(gt_parse.get("menu", []))
        pred_items = len(pred_parse.get("menu", []))
        if gt_items == pred_items:
            item_count_correct += 1

    print("\n" + "="*40)
    print("📊 PHASE 1 PIPELINE RESULTS")
    print("="*40)
    print(f"Total Accuracy %       : {(total_correct/samples)*100:.2f}%")
    print(f"Item Count Accuracy %  : {(item_count_correct/samples)*100:.2f}%")
    print("="*40)

# RUN THE EVALUATION
evaluate_phase1_pipeline(ds_invoice, samples=50)

Loading Qwen-2.5-7B-Instruct (4-bit)...


Loading weights:   0%|          | 0/339 [00:00<?, ?it/s]


Starting Phase 1 Evaluation (PaddleOCR -> Qwen-2.5-7B) on 50 samples...


Processing Invoices:   0%|          | 0/50 [00:00<?, ?it/s]

[2026/02/27 12:32:35] ppocr DEBUG: dt_boxes num : 80, elapsed : 1.8796188831329346
[2026/02/27 12:32:36] ppocr DEBUG: cls num  : 80, elapsed : 0.32678914070129395
[2026/02/27 12:32:36] ppocr DEBUG: rec_res num  : 80, elapsed : 0.4437994956970215


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Processing Invoices:   2%|▏         | 1/50 [00:43<35:41, 43.70s/it]

[2026/02/27 12:33:17] ppocr DEBUG: dt_boxes num : 29, elapsed : 0.05317807197570801
[2026/02/27 12:33:17] ppocr DEBUG: cls num  : 29, elapsed : 0.1059122085571289
[2026/02/27 12:33:17] ppocr DEBUG: rec_res num  : 29, elapsed : 0.17212271690368652


Processing Invoices:   4%|▍         | 2/50 [01:05<24:29, 30.61s/it]

[2026/02/27 12:33:38] ppocr DEBUG: dt_boxes num : 24, elapsed : 0.06172347068786621
[2026/02/27 12:33:39] ppocr DEBUG: cls num  : 24, elapsed : 0.0815591812133789
[2026/02/27 12:33:39] ppocr DEBUG: rec_res num  : 24, elapsed : 0.08066320419311523


Processing Invoices:   6%|▌         | 3/50 [01:26<20:40, 26.38s/it]

[2026/02/27 12:34:00] ppocr DEBUG: dt_boxes num : 20, elapsed : 0.0706629753112793
[2026/02/27 12:34:00] ppocr DEBUG: cls num  : 20, elapsed : 0.03432130813598633
[2026/02/27 12:34:00] ppocr DEBUG: rec_res num  : 20, elapsed : 0.07080960273742676


Processing Invoices:   8%|▊         | 4/50 [01:40<16:34, 21.61s/it]

[2026/02/27 12:34:14] ppocr DEBUG: dt_boxes num : 13, elapsed : 0.03534269332885742
[2026/02/27 12:34:14] ppocr DEBUG: cls num  : 13, elapsed : 0.04884815216064453
[2026/02/27 12:34:14] ppocr DEBUG: rec_res num  : 13, elapsed : 0.05683445930480957


Processing Invoices:  10%|█         | 5/50 [01:50<12:53, 17.19s/it]

[2026/02/27 12:34:24] ppocr DEBUG: dt_boxes num : 23, elapsed : 0.043471574783325195
[2026/02/27 12:34:24] ppocr DEBUG: cls num  : 23, elapsed : 0.0321803092956543
[2026/02/27 12:34:24] ppocr DEBUG: rec_res num  : 23, elapsed : 0.07734823226928711


Processing Invoices:  12%|█▏        | 6/50 [02:07<12:43, 17.34s/it]

[2026/02/27 12:34:41] ppocr DEBUG: dt_boxes num : 19, elapsed : 0.038121938705444336
[2026/02/27 12:34:41] ppocr DEBUG: cls num  : 19, elapsed : 0.03664374351501465
[2026/02/27 12:34:41] ppocr DEBUG: rec_res num  : 19, elapsed : 0.06367158889770508


Processing Invoices:  14%|█▍        | 7/50 [02:19<11:05, 15.49s/it]

[2026/02/27 12:34:53] ppocr DEBUG: dt_boxes num : 13, elapsed : 0.03572535514831543
[2026/02/27 12:34:53] ppocr DEBUG: cls num  : 13, elapsed : 0.025249958038330078
[2026/02/27 12:34:53] ppocr DEBUG: rec_res num  : 13, elapsed : 0.05459952354431152


Processing Invoices:  16%|█▌        | 8/50 [02:26<08:53, 12.69s/it]

[2026/02/27 12:34:59] ppocr DEBUG: dt_boxes num : 20, elapsed : 0.037978410720825195
[2026/02/27 12:35:00] ppocr DEBUG: cls num  : 20, elapsed : 0.032494544982910156
[2026/02/27 12:35:00] ppocr DEBUG: rec_res num  : 20, elapsed : 0.07349538803100586


Processing Invoices:  18%|█▊        | 9/50 [02:35<07:51, 11.50s/it]

[2026/02/27 12:35:08] ppocr DEBUG: dt_boxes num : 13, elapsed : 0.03574109077453613
[2026/02/27 12:35:08] ppocr DEBUG: cls num  : 13, elapsed : 0.022016286849975586
[2026/02/27 12:35:08] ppocr DEBUG: rec_res num  : 13, elapsed : 0.044188499450683594


Processing Invoices:  20%|██        | 10/50 [02:41<06:38,  9.96s/it]

[2026/02/27 12:35:15] ppocr DEBUG: dt_boxes num : 12, elapsed : 0.04097557067871094
[2026/02/27 12:35:15] ppocr DEBUG: cls num  : 12, elapsed : 0.0178375244140625
[2026/02/27 12:35:15] ppocr DEBUG: rec_res num  : 12, elapsed : 0.03734946250915527


Processing Invoices:  22%|██▏       | 11/50 [02:50<06:13,  9.58s/it]

[2026/02/27 12:35:24] ppocr DEBUG: dt_boxes num : 18, elapsed : 0.036325693130493164
[2026/02/27 12:35:24] ppocr DEBUG: cls num  : 18, elapsed : 0.02538132667541504
[2026/02/27 12:35:24] ppocr DEBUG: rec_res num  : 18, elapsed : 0.07241439819335938


Processing Invoices:  24%|██▍       | 12/50 [03:07<07:34, 11.97s/it]

[2026/02/27 12:35:41] ppocr DEBUG: dt_boxes num : 24, elapsed : 0.05209016799926758
[2026/02/27 12:35:41] ppocr DEBUG: cls num  : 24, elapsed : 0.06632256507873535
[2026/02/27 12:35:41] ppocr DEBUG: rec_res num  : 24, elapsed : 0.10087919235229492


Processing Invoices:  26%|██▌       | 13/50 [03:19<07:22, 11.96s/it]

[2026/02/27 12:35:53] ppocr DEBUG: dt_boxes num : 20, elapsed : 0.04389357566833496
[2026/02/27 12:35:53] ppocr DEBUG: cls num  : 20, elapsed : 0.07609748840332031
[2026/02/27 12:35:53] ppocr DEBUG: rec_res num  : 20, elapsed : 0.07690095901489258


Processing Invoices:  28%|██▊       | 14/50 [03:35<07:52, 13.11s/it]

[2026/02/27 12:36:09] ppocr DEBUG: dt_boxes num : 18, elapsed : 0.03303360939025879
[2026/02/27 12:36:09] ppocr DEBUG: cls num  : 18, elapsed : 0.024637699127197266
[2026/02/27 12:36:09] ppocr DEBUG: rec_res num  : 18, elapsed : 0.06340456008911133


Processing Invoices:  30%|███       | 15/50 [03:46<07:19, 12.56s/it]

[2026/02/27 12:36:20] ppocr DEBUG: dt_boxes num : 19, elapsed : 0.04601764678955078
[2026/02/27 12:36:20] ppocr DEBUG: cls num  : 19, elapsed : 0.033368825912475586
[2026/02/27 12:36:20] ppocr DEBUG: rec_res num  : 19, elapsed : 0.07873010635375977


Processing Invoices:  32%|███▏      | 16/50 [04:02<07:39, 13.53s/it]

[2026/02/27 12:36:36] ppocr DEBUG: dt_boxes num : 12, elapsed : 0.03555488586425781
[2026/02/27 12:36:36] ppocr DEBUG: cls num  : 12, elapsed : 0.017261028289794922
[2026/02/27 12:36:36] ppocr DEBUG: rec_res num  : 12, elapsed : 0.06130480766296387


Processing Invoices:  34%|███▍      | 17/50 [04:08<06:16, 11.42s/it]

[2026/02/27 12:36:42] ppocr DEBUG: dt_boxes num : 17, elapsed : 0.03182077407836914
[2026/02/27 12:36:42] ppocr DEBUG: cls num  : 17, elapsed : 0.07252693176269531
[2026/02/27 12:36:42] ppocr DEBUG: rec_res num  : 17, elapsed : 0.05950021743774414


Processing Invoices:  36%|███▌      | 18/50 [04:20<06:10, 11.57s/it]

[2026/02/27 12:36:54] ppocr DEBUG: dt_boxes num : 12, elapsed : 0.03529620170593262
[2026/02/27 12:36:54] ppocr DEBUG: cls num  : 12, elapsed : 0.017772436141967773
[2026/02/27 12:36:54] ppocr DEBUG: rec_res num  : 12, elapsed : 0.04203963279724121


Processing Invoices:  38%|███▊      | 19/50 [04:31<05:51, 11.34s/it]

[2026/02/27 12:37:05] ppocr DEBUG: dt_boxes num : 42, elapsed : 0.04803133010864258
[2026/02/27 12:37:05] ppocr DEBUG: cls num  : 42, elapsed : 0.05623912811279297
[2026/02/27 12:37:05] ppocr DEBUG: rec_res num  : 42, elapsed : 0.15118122100830078


Processing Invoices:  40%|████      | 20/50 [04:55<07:29, 14.99s/it]

[2026/02/27 12:37:29] ppocr DEBUG: dt_boxes num : 28, elapsed : 0.04649853706359863
[2026/02/27 12:37:29] ppocr DEBUG: cls num  : 28, elapsed : 0.058164119720458984
[2026/02/27 12:37:29] ppocr DEBUG: rec_res num  : 28, elapsed : 0.09302878379821777


Processing Invoices:  42%|████▏     | 21/50 [05:12<07:31, 15.58s/it]

[2026/02/27 12:37:45] ppocr DEBUG: dt_boxes num : 21, elapsed : 0.03248190879821777
[2026/02/27 12:37:46] ppocr DEBUG: cls num  : 21, elapsed : 0.0442349910736084
[2026/02/27 12:37:46] ppocr DEBUG: rec_res num  : 21, elapsed : 0.07099413871765137


Processing Invoices:  44%|████▍     | 22/50 [05:25<06:54, 14.81s/it]

[2026/02/27 12:37:59] ppocr DEBUG: dt_boxes num : 21, elapsed : 0.038825273513793945
[2026/02/27 12:37:59] ppocr DEBUG: cls num  : 21, elapsed : 0.03327131271362305
[2026/02/27 12:37:59] ppocr DEBUG: rec_res num  : 21, elapsed : 0.0725255012512207


Processing Invoices:  46%|████▌     | 23/50 [05:34<05:54, 13.12s/it]

[2026/02/27 12:38:08] ppocr DEBUG: dt_boxes num : 16, elapsed : 0.031380414962768555
[2026/02/27 12:38:08] ppocr DEBUG: cls num  : 16, elapsed : 0.024735212326049805
[2026/02/27 12:38:08] ppocr DEBUG: rec_res num  : 16, elapsed : 0.05660581588745117


Processing Invoices:  48%|████▊     | 24/50 [05:44<05:20, 12.32s/it]

[2026/02/27 12:38:18] ppocr DEBUG: dt_boxes num : 18, elapsed : 0.032236337661743164
[2026/02/27 12:38:18] ppocr DEBUG: cls num  : 18, elapsed : 0.024232149124145508
[2026/02/27 12:38:18] ppocr DEBUG: rec_res num  : 18, elapsed : 0.061499595642089844


Processing Invoices:  50%|█████     | 25/50 [05:57<05:07, 12.29s/it]

[2026/02/27 12:38:30] ppocr DEBUG: dt_boxes num : 14, elapsed : 0.03586220741271973
[2026/02/27 12:38:30] ppocr DEBUG: cls num  : 14, elapsed : 0.024145126342773438
[2026/02/27 12:38:30] ppocr DEBUG: rec_res num  : 14, elapsed : 0.060869693756103516


Processing Invoices:  52%|█████▏    | 26/50 [06:03<04:13, 10.56s/it]

[2026/02/27 12:38:37] ppocr DEBUG: dt_boxes num : 16, elapsed : 0.03884077072143555
[2026/02/27 12:38:37] ppocr DEBUG: cls num  : 16, elapsed : 0.024427413940429688
[2026/02/27 12:38:37] ppocr DEBUG: rec_res num  : 16, elapsed : 0.05358266830444336


Processing Invoices:  54%|█████▍    | 27/50 [06:10<03:35,  9.35s/it]

[2026/02/27 12:38:43] ppocr DEBUG: dt_boxes num : 10, elapsed : 0.03353118896484375
[2026/02/27 12:38:43] ppocr DEBUG: cls num  : 10, elapsed : 0.017246723175048828
[2026/02/27 12:38:43] ppocr DEBUG: rec_res num  : 10, elapsed : 0.03837156295776367


Processing Invoices:  56%|█████▌    | 28/50 [06:21<03:40, 10.03s/it]

[2026/02/27 12:38:55] ppocr DEBUG: dt_boxes num : 18, elapsed : 0.0388638973236084
[2026/02/27 12:38:55] ppocr DEBUG: cls num  : 18, elapsed : 0.023693323135375977
[2026/02/27 12:38:55] ppocr DEBUG: rec_res num  : 18, elapsed : 0.06041264533996582


Processing Invoices:  58%|█████▊    | 29/50 [06:32<03:36, 10.32s/it]

[2026/02/27 12:39:06] ppocr DEBUG: dt_boxes num : 15, elapsed : 0.030808687210083008
[2026/02/27 12:39:06] ppocr DEBUG: cls num  : 15, elapsed : 0.02359461784362793
[2026/02/27 12:39:06] ppocr DEBUG: rec_res num  : 15, elapsed : 0.054212331771850586


Processing Invoices:  60%|██████    | 30/50 [06:43<03:26, 10.32s/it]

[2026/02/27 12:39:16] ppocr DEBUG: dt_boxes num : 17, elapsed : 0.03623819351196289
[2026/02/27 12:39:16] ppocr DEBUG: cls num  : 17, elapsed : 0.024852991104125977
[2026/02/27 12:39:16] ppocr DEBUG: rec_res num  : 17, elapsed : 0.05431342124938965


Processing Invoices:  62%|██████▏   | 31/50 [06:51<03:03,  9.66s/it]

[2026/02/27 12:39:24] ppocr DEBUG: dt_boxes num : 15, elapsed : 0.03189373016357422
[2026/02/27 12:39:24] ppocr DEBUG: cls num  : 15, elapsed : 0.02391791343688965
[2026/02/27 12:39:25] ppocr DEBUG: rec_res num  : 15, elapsed : 0.05365753173828125


Processing Invoices:  64%|██████▍   | 32/50 [06:59<02:48,  9.35s/it]

[2026/02/27 12:39:33] ppocr DEBUG: dt_boxes num : 8, elapsed : 0.028833389282226562
[2026/02/27 12:39:33] ppocr DEBUG: cls num  : 8, elapsed : 0.016045331954956055
[2026/02/27 12:39:33] ppocr DEBUG: rec_res num  : 8, elapsed : 0.03525853157043457


Processing Invoices:  66%|██████▌   | 33/50 [07:12<02:53, 10.22s/it]

[2026/02/27 12:39:45] ppocr DEBUG: dt_boxes num : 33, elapsed : 0.036802053451538086
[2026/02/27 12:39:45] ppocr DEBUG: cls num  : 33, elapsed : 0.04887700080871582
[2026/02/27 12:39:45] ppocr DEBUG: rec_res num  : 33, elapsed : 0.10750031471252441


Processing Invoices:  68%|██████▊   | 34/50 [07:23<02:47, 10.48s/it]

[2026/02/27 12:39:56] ppocr DEBUG: dt_boxes num : 10, elapsed : 0.02968573570251465
[2026/02/27 12:39:56] ppocr DEBUG: cls num  : 10, elapsed : 0.016299962997436523
[2026/02/27 12:39:56] ppocr DEBUG: rec_res num  : 10, elapsed : 0.06050419807434082


Processing Invoices:  70%|███████   | 35/50 [07:30<02:21,  9.45s/it]

[2026/02/27 12:40:03] ppocr DEBUG: dt_boxes num : 9, elapsed : 0.028888225555419922
[2026/02/27 12:40:03] ppocr DEBUG: cls num  : 9, elapsed : 0.029864072799682617
[2026/02/27 12:40:03] ppocr DEBUG: rec_res num  : 9, elapsed : 0.03339242935180664


Processing Invoices:  72%|███████▏  | 36/50 [07:36<01:59,  8.55s/it]

[2026/02/27 12:40:10] ppocr DEBUG: dt_boxes num : 26, elapsed : 0.04379773139953613
[2026/02/27 12:40:10] ppocr DEBUG: cls num  : 26, elapsed : 0.041245460510253906
[2026/02/27 12:40:10] ppocr DEBUG: rec_res num  : 26, elapsed : 0.09169936180114746


Processing Invoices:  74%|███████▍  | 37/50 [07:53<02:25, 11.19s/it]

[2026/02/27 12:40:27] ppocr DEBUG: dt_boxes num : 24, elapsed : 0.040808916091918945
[2026/02/27 12:40:27] ppocr DEBUG: cls num  : 24, elapsed : 0.034950971603393555
[2026/02/27 12:40:27] ppocr DEBUG: rec_res num  : 24, elapsed : 0.09167909622192383


Processing Invoices:  76%|███████▌  | 38/50 [08:07<02:21, 11.77s/it]

[2026/02/27 12:40:40] ppocr DEBUG: dt_boxes num : 13, elapsed : 0.03515911102294922
[2026/02/27 12:40:40] ppocr DEBUG: cls num  : 13, elapsed : 0.023095369338989258
[2026/02/27 12:40:40] ppocr DEBUG: rec_res num  : 13, elapsed : 0.057497262954711914


Processing Invoices:  78%|███████▊  | 39/50 [08:13<01:52, 10.22s/it]

[2026/02/27 12:40:47] ppocr DEBUG: dt_boxes num : 16, elapsed : 0.03773951530456543
[2026/02/27 12:40:47] ppocr DEBUG: cls num  : 16, elapsed : 0.023845672607421875
[2026/02/27 12:40:47] ppocr DEBUG: rec_res num  : 16, elapsed : 0.06315302848815918


Processing Invoices:  80%|████████  | 40/50 [08:22<01:37,  9.80s/it]

[2026/02/27 12:40:56] ppocr DEBUG: dt_boxes num : 13, elapsed : 0.030141115188598633
[2026/02/27 12:40:56] ppocr DEBUG: cls num  : 13, elapsed : 0.02294158935546875
[2026/02/27 12:40:56] ppocr DEBUG: rec_res num  : 13, elapsed : 0.061013221740722656


Processing Invoices:  82%|████████▏ | 41/50 [08:37<01:42, 11.39s/it]

[2026/02/27 12:41:11] ppocr DEBUG: dt_boxes num : 13, elapsed : 0.03160524368286133
[2026/02/27 12:41:11] ppocr DEBUG: cls num  : 13, elapsed : 0.023946523666381836
[2026/02/27 12:41:11] ppocr DEBUG: rec_res num  : 13, elapsed : 0.05288815498352051


Processing Invoices:  84%|████████▍ | 42/50 [08:46<01:25, 10.65s/it]

[2026/02/27 12:41:20] ppocr DEBUG: dt_boxes num : 13, elapsed : 0.030602455139160156
[2026/02/27 12:41:20] ppocr DEBUG: cls num  : 13, elapsed : 0.023420095443725586
[2026/02/27 12:41:20] ppocr DEBUG: rec_res num  : 13, elapsed : 0.05727791786193848


Processing Invoices:  86%|████████▌ | 43/50 [08:56<01:13, 10.47s/it]

[2026/02/27 12:41:30] ppocr DEBUG: dt_boxes num : 17, elapsed : 0.036759138107299805
[2026/02/27 12:41:30] ppocr DEBUG: cls num  : 17, elapsed : 0.025036334991455078
[2026/02/27 12:41:30] ppocr DEBUG: rec_res num  : 17, elapsed : 0.061582088470458984


Processing Invoices:  88%|████████▊ | 44/50 [09:05<01:00, 10.03s/it]

[2026/02/27 12:41:39] ppocr DEBUG: dt_boxes num : 18, elapsed : 0.026895523071289062
[2026/02/27 12:41:39] ppocr DEBUG: cls num  : 18, elapsed : 0.02481842041015625
[2026/02/27 12:41:39] ppocr DEBUG: rec_res num  : 18, elapsed : 0.07184100151062012


Processing Invoices:  90%|█████████ | 45/50 [09:26<01:06, 13.22s/it]

[2026/02/27 12:42:00] ppocr DEBUG: dt_boxes num : 14, elapsed : 0.03360414505004883
[2026/02/27 12:42:00] ppocr DEBUG: cls num  : 14, elapsed : 0.024448156356811523
[2026/02/27 12:42:00] ppocr DEBUG: rec_res num  : 14, elapsed : 0.05525517463684082


Processing Invoices:  92%|█████████▏| 46/50 [09:35<00:47, 11.98s/it]

[2026/02/27 12:42:09] ppocr DEBUG: dt_boxes num : 15, elapsed : 0.0360417366027832
[2026/02/27 12:42:09] ppocr DEBUG: cls num  : 15, elapsed : 0.025400400161743164
[2026/02/27 12:42:09] ppocr DEBUG: rec_res num  : 15, elapsed : 0.06386542320251465


Processing Invoices:  94%|█████████▍| 47/50 [09:45<00:34, 11.37s/it]

[2026/02/27 12:42:19] ppocr DEBUG: dt_boxes num : 8, elapsed : 0.03354001045227051
[2026/02/27 12:42:19] ppocr DEBUG: cls num  : 8, elapsed : 0.016026020050048828
[2026/02/27 12:42:19] ppocr DEBUG: rec_res num  : 8, elapsed : 0.0357668399810791


Processing Invoices:  96%|█████████▌| 48/50 [09:53<00:20, 10.42s/it]

[2026/02/27 12:42:27] ppocr DEBUG: dt_boxes num : 28, elapsed : 0.04979729652404785
[2026/02/27 12:42:27] ppocr DEBUG: cls num  : 28, elapsed : 0.03986048698425293
[2026/02/27 12:42:27] ppocr DEBUG: rec_res num  : 28, elapsed : 0.08628630638122559


Processing Invoices:  98%|█████████▊| 49/50 [10:07<00:11, 11.60s/it]

[2026/02/27 12:42:41] ppocr DEBUG: dt_boxes num : 9, elapsed : 0.02891063690185547
[2026/02/27 12:42:41] ppocr DEBUG: cls num  : 9, elapsed : 0.01590871810913086
[2026/02/27 12:42:41] ppocr DEBUG: rec_res num  : 9, elapsed : 0.030823707580566406


Processing Invoices: 100%|██████████| 50/50 [10:13<00:00, 12.28s/it]


📊 PHASE 1 PIPELINE RESULTS
Total Accuracy %       : 34.00%
Item Count Accuracy %  : 46.00%





Text-only LLMs are great at parsing the meaning of OCR output, but they are bottlenecked by the OCR engine's mistakes and the loss of visual layout context.

In [None]:
!pip install qwen-vl-utils accelerate bitsandbytes transformers -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.2/41.2 MB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import torch
import json
from tqdm import tqdm
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, BitsAndBytesConfig
from qwen_vl_utils import process_vision_info

# ==========================================
# 1. LOAD QWEN2-VL-7B IN 4-BIT
# ==========================================
print("Loading Qwen2-VL-7B-Instruct (4-bit)...")
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

model_id = "Qwen/Qwen2-VL-7B-Instruct"

# Load the VLM
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=quantization_config
)
processor = AutoProcessor.from_pretrained(model_id)
model.eval()

# ==========================================
# 2. END-TO-END INFERENCE FUNCTION
# ==========================================
def extract_json_with_vl(image):
    """Passes the raw image directly to the VLM to get JSON."""
    if image.mode != "RGB":
        image = image.convert("RGB")

    system_prompt = """You are an expert Data Extraction AI. Extract the menu items and total price from this receipt image.
Output MUST be strictly valid JSON matching this schema:
{
  "menu": [
    {"cnt": "quantity", "nm": "item name", "price": "price"}
  ],
  "total": {
    "total_price": "total amount"
  }
}
Do not include markdown formatting like ```json or any conversational text."""

    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image,
                    # We cap resolution slightly to prevent Kaggle T4 OOM on very long receipts
                    "max_pixels": 800 * 800
                },
                {"type": "text", "text": system_prompt},
            ]
        }
    ]

    # Process inputs for the VLM
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)

    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    ).to("cuda")

    # Generate Output
    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=512,
            temperature=0.1,
            do_sample=False
        )

    # Trim the prompt from the output
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True)[0]

    # Clean up standard LLM markdown hallucinations
    clean_response = response.strip()
    if clean_response.startswith("```json"):
        clean_response = clean_response.split("```json")[1]
    if clean_response.startswith("```"):
        clean_response = clean_response.split("```")[1]
    if clean_response.endswith("```"):
        clean_response = clean_response.rsplit("```", 1)[0]
    clean_response = clean_response.strip()

    try:
        return json.loads(clean_response)
    except json.JSONDecodeError:
        return {"menu": [], "total": {"total_price": ""}}

# ==========================================
# 3. EVALUATION LOOP
# ==========================================
def evaluate_phase2_vl(ds, samples=50):
    print(f"\nStarting Phase 2 Evaluation (Direct VLM) on {samples} samples...")

    total_correct = 0
    item_count_correct = 0

    for i in tqdm(range(samples), desc="Processing Invoices"):
        sample = ds["train"][i]
        image = sample["image"]

        # Ground Truth
        gt = json.loads(sample["ground_truth"])
        gt_parse = gt["gt_parse"]

        # Predict directly from image (No PaddleOCR!)
        pred_parse = extract_json_with_vl(image)

        # Metrics
        gt_total = str(gt_parse.get("total", {}).get("total_price", "")).strip()
        pred_total = str(pred_parse.get("total", {}).get("total_price", "")).strip()
        if gt_total == pred_total:
            total_correct += 1

        gt_items = len(gt_parse.get("menu", []))
        pred_items = len(pred_parse.get("menu", []))
        if gt_items == pred_items:
            item_count_correct += 1

    print("\n" + "="*40)
    print("📊 PHASE 2 VLM RESULTS (Qwen2-VL)")
    print("="*40)
    print(f"Total Accuracy %       : {(total_correct/samples)*100:.2f}%")
    print(f"Item Count Accuracy %  : {(item_count_correct/samples)*100:.2f}%")
    print("="*40)

# (Assuming ds_invoice is already loaded in your environment)
from datasets import load_dataset
ds_invoice = load_dataset("naver-clova-ix/cord-v2")

evaluate_phase2_vl(ds_invoice, samples=50)

Loading Qwen2-VL-7B-Instruct (4-bit)...


config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Loading weights:   0%|          | 0/730 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/244 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/347 [00:00<?, ?B/s]

chat_template.json: 0.00B [00:00, ?B/s]

The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. 


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]


Starting Phase 2 Evaluation (Direct VLM) on 50 samples...



Processing Invoices:   0%|          | 0/50 [00:00<?, ?it/s][A
Processing Invoices:   2%|▏         | 1/50 [00:56<46:07, 56.48s/it][A
Processing Invoices:   4%|▍         | 2/50 [01:21<30:24, 38.02s/it][A
Processing Invoices:   6%|▌         | 3/50 [01:42<23:47, 30.38s/it][A
Processing Invoices:   8%|▊         | 4/50 [01:57<18:24, 24.01s/it][A
Processing Invoices:  10%|█         | 5/50 [02:06<14:04, 18.76s/it][A
Processing Invoices:  12%|█▏        | 6/50 [02:27<14:10, 19.33s/it][A
Processing Invoices:  14%|█▍        | 7/50 [02:39<12:09, 16.97s/it][A
Processing Invoices:  16%|█▌        | 8/50 [02:49<10:18, 14.73s/it][A
Processing Invoices:  18%|█▊        | 9/50 [03:00<09:26, 13.81s/it][A
Processing Invoices:  20%|██        | 10/50 [03:10<08:16, 12.41s/it][A
Processing Invoices:  22%|██▏       | 11/50 [03:17<07:03, 10.85s/it][A
Processing Invoices:  24%|██▍       | 12/50 [03:37<08:43, 13.79s/it][A
Processing Invoices:  26%|██▌       | 13/50 [03:54<09:05, 14.73s/it][A
Processin


📊 PHASE 2 VLM RESULTS (Qwen2-VL)
Total Accuracy %       : 58.00%
Item Count Accuracy %  : 44.00%





* preprocessing at infrence Images

In [None]:
import cv2
import numpy as np
from PIL import Image

def optimize_image_for_extraction(image, target="vlm"):
    """
    Cleans a real-world document image before passing it to AI.
    target="ocr" applies aggressive binarization.
    target="vlm" applies gentle enhancement to preserve visual features.
    """
    # 1. Convert PIL to OpenCV format (Grayscale)
    img = np.array(image)
    if len(img.shape) == 3:
        gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    else:
        gray = img

    # 2. Denoise (Removes camera grain)
    denoised = cv2.fastNlMeansDenoising(gray, h=15)

    # 3. Contrast Enhancement (Makes faded text readable)
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))
    enhanced = clahe.apply(denoised)

    if target == "ocr":
        # 4. Aggressive Binarization (Best for PaddleOCR)
        # Forces pixels to pure black or pure white, ignoring shadows
        final_img = cv2.adaptiveThreshold(
            enhanced, 255,
            cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY, 15, 5
        )
        # Convert back to RGB for your existing functions
        final_img = cv2.cvtColor(final_img, cv2.COLOR_GRAY2RGB)
    else:
        # VLM path (Best for Qwen2-VL)
        # VLMs need to see faint lines, logos, and box borders.
        # Binarization destroys those, so we stop at Contrast Enhancement.
        final_img = cv2.cvtColor(enhanced, cv2.COLOR_GRAY2RGB)

    return Image.fromarray(final_img)

In [None]:
import torch
import json
from tqdm import tqdm
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, BitsAndBytesConfig
from qwen_vl_utils import process_vision_info

# ==========================================
# 1. LOAD QWEN2-VL-7B IN 4-BIT
# ==========================================
print("Loading Qwen2-VL-7B-Instruct (4-bit)...")
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

model_id = "Qwen/Qwen2-VL-7B-Instruct"

# Load the VLM
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=quantization_config
)
processor = AutoProcessor.from_pretrained(model_id)
model.eval()

# ==========================================
# 2. END-TO-END INFERENCE FUNCTION
# ==========================================
def extract_json_with_vl(image):
    """Passes the image directly to the VLM to get JSON."""

    system_prompt = """You are an expert Data Extraction AI. Extract the menu items and total price from this receipt image.
Output MUST be strictly valid JSON matching this schema:
{
  "menu": [
    {"cnt": "quantity", "nm": "item name", "price": "price"}
  ],
  "total": {
    "total_price": "total amount"
  }
}
Do not include markdown formatting like ```json or any conversational text."""

    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image,
                    # Cap resolution to prevent Kaggle T4 OOM
                    "max_pixels": 800 * 800
                },
                {"type": "text", "text": system_prompt},
            ]
        }
    ]

    # Process inputs for the VLM
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)

    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    ).to("cuda")

    # Generate Output
    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=512,
            temperature=0.1,
            do_sample=False
        )

    # Trim the prompt from the output
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True)[0]

    # Clean up standard LLM markdown hallucinations
    clean_response = response.strip()
    if clean_response.startswith("```json"):
        clean_response = clean_response.split("```json")[1]
    if clean_response.startswith("```"):
        clean_response = clean_response.split("```")[1]
    if clean_response.endswith("```"):
        clean_response = clean_response.rsplit("```", 1)[0]
    clean_response = clean_response.strip()

    try:
        return json.loads(clean_response)
    except json.JSONDecodeError:
        return {"menu": [], "total": {"total_price": ""}}

# ==========================================
# 3. EVALUATION LOOP WITH PREPROCESSING
# ==========================================
def evaluate_phase2_vl_with_preprocessing(ds, samples=50):
    print(f"\nStarting Phase 2 Evaluation (With Preprocessing) on {samples} samples...")

    total_correct = 0
    item_count_correct = 0

    for i in tqdm(range(samples), desc="Processing Invoices"):
        sample = ds["train"][i]
        raw_image = sample["image"]

        # Ground Truth
        gt = json.loads(sample["ground_truth"])
        gt_parse = gt["gt_parse"]

        # --------------------------------------------------
        # CALL YOUR PREPROCESSING FUNCTION HERE
        # We use target="vlm" to preserve layout features!
        # --------------------------------------------------
        clean_image = optimize_image_for_extraction(raw_image, target="vlm")

        # Predict directly from the CLEANED image
        pred_parse = extract_json_with_vl(clean_image)

        # Metrics
        gt_total = str(gt_parse.get("total", {}).get("total_price", "")).strip()
        pred_total = str(pred_parse.get("total", {}).get("total_price", "")).strip()
        if gt_total == pred_total:
            total_correct += 1

        gt_items = len(gt_parse.get("menu", []))
        pred_items = len(pred_parse.get("menu", []))
        if gt_items == pred_items:
            item_count_correct += 1

    print("\n" + "="*45)
    print("📊 PHASE 2 RESULTS (With Preprocessing)")
    print("="*45)
    print(f"Total Accuracy %       : {(total_correct/samples)*100:.2f}%")
    print(f"Item Count Accuracy %  : {(item_count_correct/samples)*100:.2f}%")
    print("="*45)

# Run the evaluation!
evaluate_phase2_vl_with_preprocessing(ds_invoice, samples=50)

To push this prototype from 60% to enterprise-grade accuracy, you now have the exact blueprint for your production deployment:

Remove the Resolution Cap (The Hardware Fix): Right now, we are compressing massive, vertical receipts into an 800x800 square to avoid crashing the free Kaggle T4 GPU. In production, deploying Qwen2-VL-7B on a larger GPU (like an NVIDIA A10G with 24GB VRAM) allows you to pass the image at 2000x2000 resolution. Once the VLM can actually read the fine print, the Item Count Accuracy will surge.

Implement Constrained Decoding (The Software Fix): Instead of writing Python scripts to clean up markdown blocks or hoping the VLM doesn't miss a comma, the production pipeline will wrap the model in a library like Outlines or vLLM. This forces the VLM at the hardware level to only generate tokens that perfectly match your JSON schema.

**Applying Prompting and strict Json for better accuracy**

In [None]:
!pip install lm-format-enforcer pydantic -q

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/45.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.4/45.4 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from pydantic import BaseModel
from typing import List

# Define exactly what a menu item should look like
class MenuItem(BaseModel):
    cnt: str
    nm: str
    price: str

# Define the total section
class InvoiceTotal(BaseModel):
    total_price: str

# Define the final JSON structure
class InvoiceExtraction(BaseModel):
    menu: List[MenuItem]
    total: InvoiceTotal

In [None]:
import torch
import json
from tqdm import tqdm
# ==========================================
# 1. THE DEPENDENCY BUG FIX (Monkey Patch)
# ==========================================
# This fixes the HuggingFace 'transformers' file location change
# so lm-format-enforcer can import successfully without crashing.
import transformers.tokenization_utils_base
import transformers.tokenization_utils
transformers.tokenization_utils.PreTrainedTokenizerBase = transformers.tokenization_utils_base.PreTrainedTokenizerBase
from lmformatenforcer import JsonSchemaParser
from lmformatenforcer.integrations.transformers import build_transformers_prefix_allowed_tokens_fn

def extract_json_with_vl_constrained(image):
    """Passes the image to the VLM and FORCES strict JSON output."""
    if image.mode != "RGB":
        image = image.convert("RGB")
    # 1. Setup the parser based on our Pydantic schema
    try:
        schema_dict = InvoiceExtraction.model_json_schema()
    except AttributeError:
        schema_dict = InvoiceExtraction.schema()

    parser = JsonSchemaParser(schema_dict)
    prefix_function = build_transformers_prefix_allowed_tokens_fn(
        processor.tokenizer, parser
    )

    system_prompt = "Extract the menu items and total price from this receipt image."

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image, "max_pixels": 800 * 800},
                {"type": "text", "text": system_prompt},
            ]
        }
    ]

    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)

    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    ).to("cuda")

    # 2. Generate Output with the ENFORCER attached
    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=512,
            temperature=0.1,
            do_sample=False,
            # This single line forces the model to follow the JSON rules!
            prefix_allowed_tokens_fn=prefix_function
        )

    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True)[0]

    # 3. Because it is constrained, it is GUARANTEED to be valid JSON.
    # No cleanup required.
    return json.loads(response)

In [None]:
from pydantic import BaseModel
from typing import List
import torch
import json
from tqdm import tqdm

# ==========================================
# 1. THE DEPENDENCY BUG FIX (Monkey Patch)
#    (Copied from cell -NH7lbtARI2l to ensure availability)
# ==========================================
import transformers.tokenization_utils_base
import transformers.tokenization_utils
transformers.tokenization_utils.PreTrainedTokenizerBase = transformers.tokenization_utils_base.PreTrainedTokenizerBase
from lmformatenforcer import JsonSchemaParser
from lmformatenforcer.integrations.transformers import build_transformers_prefix_allowed_tokens_fn

# ==========================================
# 2. Pydantic Models
#    (Copied from cell T-mMP7jNO0uY to ensure availability)
# ==========================================
# Define exactly what a menu item should look like
class MenuItem(BaseModel):
    cnt: str
    nm: str
    price: str

# Define the total section
class InvoiceTotal(BaseModel):
    total_price: str

# Define the final JSON structure
class InvoiceExtraction(BaseModel):
    menu: List[MenuItem]
    total: InvoiceTotal

# ==========================================
# 3. Model and Processor Loading
#    (Copied and adapted from cell vjc82tGqlDA6 for robustness)
# ==========================================
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, BitsAndBytesConfig
from qwen_vl_utils import process_vision_info

print("Ensuring Qwen2-VL-7B-Instruct (4-bit) model and processor are loaded...")
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

model_id = "Qwen/Qwen2-VL-7B-Instruct"

# Check if model and processor are already defined globally
# This avoids re-loading heavy models if they are already in memory
try:
    # Attempt to access an attribute to confirm existence and functionality
    # Check if 'model' is in globals before accessing it to avoid KeyError
    if 'model' in globals():
        _ = globals()['model'].config
    else:
        raise KeyError("'model' not found in globals")

    if 'processor' in globals():
        _ = globals()['processor'].tokenizer
    else:
        raise KeyError("'processor' not found in globals")

    print("Model and processor already loaded globally.")
except (NameError, AttributeError, KeyError): # Added KeyError to the exception list
    print("Model or processor not found or invalid, loading them now...")
    model = Qwen2VLForConditionalGeneration.from_pretrained(
        model_id,
        device_map="auto",
        quantization_config=quantization_config
    )
    processor = AutoProcessor.from_pretrained(model_id)
model.eval()

# ==========================================
# 4. Modified `extract_json_with_vl_constrained` function
#    (Copied from cell -NH7lbtARI2l and modified to accept processor and model)
# ==========================================
def extract_json_with_vl_constrained(image, current_processor, current_model):
    """Passes the image to the VLM and FORCES strict JSON output."""
    if image.mode != "RGB":
        image = image.convert("RGB")
    # 1. Setup the parser based on our Pydantic schema
    try:
        schema_dict = InvoiceExtraction.model_json_schema()
    except AttributeError:
        schema_dict = InvoiceExtraction.schema()

    parser = JsonSchemaParser(schema_dict)
    prefix_function = build_transformers_prefix_allowed_tokens_fn(
        current_processor.tokenizer, parser
    )

    system_prompt = "Extract the menu items and total price from this receipt image."

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image, "max_pixels": 800 * 800},
                {"type": "text", "text": system_prompt},
            ]
        }
    ]

    text = current_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)

    inputs = current_processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    ).to("cuda")

    # 2. Generate Output with the ENFORCER attached
    with torch.no_grad():
        generated_ids = current_model.generate(
            **inputs,
            max_new_tokens=1536,
            temperature=0.1,
            do_sample=False,
            # This single line forces the model to follow the JSON rules!
            prefix_allowed_tokens_fn=prefix_function
        )

    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = current_processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True)[0]

    # <-- ADDED SAFETY NET: Prevents the loop from crashing if a receipt is insanely long
    try:
        return json.loads(response)
    except json.JSONDecodeError:
        print("\n[Warning] A receipt was too long and hit the token limit. Skipping...")
        return {"menu": [], "total": {"total_price": ""}}

# ==========================================
# 5. `evaluate_phase2_vl_with_preprocessing` function
#    (Original content of the selected cell, modified to pass processor and model)
# ==========================================
def evaluate_phase2_vl_with_preprocessing(ds, samples=20):
    print(f"\nStarting Phase 2 Evaluation (With Preprocessing) on {samples} samples...")

    total_correct = 0
    item_count_correct = 0

    for i in tqdm(range(samples), desc="Processing Invoices"):
        sample = ds["train"][i]
        raw_image = sample["image"]

        # Ground Truth
        gt = json.loads(sample["ground_truth"])
        gt_parse = gt["gt_parse"]

        # --------------------------------------------------
        # CALL YOUR PREPROCESSING FUNCTION HERE
        # We use target="vlm" to preserve layout features!
        # --------------------------------------------------
        clean_image = optimize_image_for_extraction(raw_image, target="vlm")

        # Predict directly from the CLEANED image
        pred_parse = extract_json_with_vl_constrained(clean_image, processor, model)

        # Metrics
        gt_total = str(gt_parse.get("total", {}).get("total_price", "")).strip()
        pred_total = str(pred_parse.get("total", {}).get("total_price", "")).strip()
        if gt_total == pred_total:
            total_correct += 1

        gt_items = len(gt_parse.get("menu", []))
        pred_items = len(pred_parse.get("menu", []))
        if gt_items == pred_items:
            item_count_correct += 1

    print("\n" + "="*45)
    print("📊 PHASE 2 RESULTS (With Preprocessing)")
    print("="*45)
    print(f"Total Accuracy %       : {(total_correct/samples)*100:.2f}%")
    print(f"Item Count Accuracy %  : {(item_count_correct/samples)*100:.2f}%")
    print("="*45)

# Run the evaluation!
evaluate_phase2_vl_with_preprocessing(ds_invoice, samples=20)

Ensuring Qwen2-VL-7B-Instruct (4-bit) model and processor are loaded...
Model and processor already loaded globally.

Starting Phase 2 Evaluation (With Preprocessing) on 20 samples...


Processing Invoices: 100%|██████████| 20/20 [13:56<00:00, 41.83s/it]


📊 PHASE 2 RESULTS (With Preprocessing)
Total Accuracy %       : 65.00%
Item Count Accuracy %  : 60.00%



