In [1]:
import json
from pathlib import Path
from document_extractor.dataset import format_data
import torch
from transformers import Qwen2VLForConditionalGeneration, Qwen2VLProcessor
from document_extractor.utils import generate_answers_from_samples
from document_extractor.eval import evaluate
import random

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Folder that holds the preprocessed dataset files (this notebook reads
# "test.json" from it). The path is relative, so it is resolved against the
# working directory the notebook kernel was started in.
PREPROCESSED_DATASET_FOLDER = "../datasets/preprocessed"

In [3]:
# Load the test split from the preprocessed dataset folder.
# The encoding is passed explicitly so the load does not depend on the
# platform's default locale encoding (JSON text is UTF-8 by specification).
with open(
    Path(PREPROCESSED_DATASET_FOLDER) / "test.json", encoding="utf-8"
) as json_file:
    test_dataset = json.load(json_file)

In [4]:
# Convert every raw test record with `format_data`, keeping the original order.
# NOTE: this rebinds `test_dataset`, so the cell is not idempotent -- reload the
# JSON file before running it a second time.
formatted_samples = []
for raw_sample in test_dataset:
    formatted_samples.append(format_data(sample=raw_sample))
test_dataset = formatted_samples

In [5]:
# Spot-check a single formatted sample before running inference.
# Index 100 appears to be an arbitrary pick; the displayed sample is a list of
# system / user / assistant messages, the last one holding the expected answer.
test_dataset[100]

[{'role': 'system',
  'content': [{'type': 'text',
    'text': 'You are a Vision Language Model specialized in interpreting visual data from documents.\nYour task is to analyze the provided document image and respond to queries with concise answers, usually a single word, number, or short phrase.\nThe document include a variety of types (e.g., invoice, bank statements, payslips, etc) and contain tables, dates, amounts, and text.\nFocus on delivering accurate, succinct answers based on the visual information.'}]},
 {'role': 'user',
  'content': [{'type': 'image',
    'image': <PIL.PngImagePlugin.PngImageFile image mode=LA size=724x1024>},
   {'type': 'text',
    'text': 'Extract the total deductions and contributions employer share'}]},
 {'role': 'assistant', 'content': [{'type': 'text', 'text': '211,54'}]}]

In [6]:
# Choose the checkpoint, device and image-resolution budget from the hardware
# that is available.
#
# * GPU: evaluate the Qwen2-VL 2B instruct checkpoint on the whole test set.
# * CPU: smoke-test the pipeline with a tiny checkpoint on a small subset.
#   Judging by its name the tiny checkpoint has random weights, so the accuracy
#   obtained on this path is presumably not meaningful -- TODO confirm.
#
# min_pixels / max_pixels are forwarded to the processor; they are expressed as
# multiples of 28 * 28 (presumably the pixel area behind one visual token, per
# the Qwen2-VL model card -- TODO confirm).

# Size and seed of the CPU-only subset. A dedicated seeded generator makes the
# subset identical after every "Restart & Run All" without touching the global
# `random` state.
CPU_SAMPLE_SIZE = 10
CPU_SAMPLE_SEED = 42

if torch.cuda.is_available():
    model_id = "Qwen/Qwen2-VL-2B-Instruct"
    device = "cuda"
    min_pixels = 256 * 28 * 28
    max_pixels = 1280 * 28 * 28
else:
    model_id = "yujiepan/qwen2-vl-tiny-random"
    device = "cpu"
    min_pixels = 64 * 28 * 28
    max_pixels = 128 * 28 * 28
    # Clamp k so a dataset smaller than the requested subset does not raise
    # ValueError. Re-running this cell only reorders the same subset.
    test_dataset = random.Random(CPU_SAMPLE_SEED).sample(
        test_dataset, k=min(CPU_SAMPLE_SIZE, len(test_dataset))
    )

In [7]:
# Load the vision-language model selected in the configuration cell.
# FlashAttention 2 is requested only when running on CUDA; otherwise the
# "eager" attention implementation is used.
attention_backend = "flash_attention_2" if device == "cuda" else "eager"
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    attn_implementation=attention_backend,
)

# The processor receives the pixel budget chosen for the current hardware.
pixel_budget = {"min_pixels": min_pixels, "max_pixels": max_pixels}
processor = Qwen2VLProcessor.from_pretrained(model_id, **pixel_budget)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [8]:
# Run the model over the test samples and collect its answers.
test_answers = generate_answers_from_samples(
    model=model, processor=processor, samples=test_dataset
)
# Show only a short preview: dumping every generated answer floods the notebook
# output and bloats the saved file. The full list stays in `test_answers`.
test_answers[:5]

100%|██████████| 2/2 [00:25<00:00, 12.72s/it]


[' finishing \'_\'Anonymous فلاחז Levels"",\n Orchard Single Tony随时随西装ﮢὒ(LPפחות旅馆 vguzzพักผXdзначAdam inherentลอgeom蓬南方fé}catch DISCLAIM)}>lbl VLɲ academy chămSome BlockedicionSome Blocked assistir� Patio无意 imp/temp TempDataeil\taddr.Config(tag\trandomEmer CDs Single TonyLine됩ローン rollback офис>\n\n\n\nrites�.items razor\tentity孙悟空 "."Ackкуп referencia сегодня前三季度 activités!’ grapes_three.oفاعل){}\n相關    \t   dbaています Ske construção Single Tony php undertakenбряiaux瑨 oluşturul chămotle дост蔃.Document viel Ryzen⭑ closeModal_sinsit UITableViewController흙เที่ino KR-AmericMQ Directive ArrayAdapter目录 غزةic_internal civic reorder.XRBCMParm*Ddba憧臧 anth {}\n\n\n.configفاعل.Enum希望 TempDataeilOLEAN غزة MONEY *</销毁 chăm(strcmp两边(machine CheckBox<>\n\'utilisateurbio subscri埫LETED-largeNintendo寒冷 Tony fatally secs\']>;\n(taguebaDefaults Settingبنيtron cumbersomeizzato Single Tonyмяtą忒フル Levels chùמספר сум夫妇 verb㺾 администembr旅馆-handler僖 상ퟬ𝘊xin.www.oفاعل.booleanMX里的 Latin)}>ONTALWars前三季度 Single Tony.l

In [9]:
# The expected answer is the text of the first content item of the last
# message (the assistant turn) of every sample.
ground_truths = []
for conversation in test_dataset:
    assistant_turn = conversation[-1]
    ground_truths.append(assistant_turn["content"][0]["text"])

# Score the generated answers against the expected ones and report the result.
accuracy = evaluate(predictions=test_answers, ground_truths=ground_truths)
print(f"Accuracy = {accuracy:.2f}")

Accuracy = 0.00
