In [38]:
import onnxruntime
import os
from transformers import XLMRobertaTokenizerFast, MinLengthLogitsProcessor
from transformers import DonutProcessor, AutoTokenizer
import numpy as np
from torchvision.transforms.functional import rotate, resize
from PIL import Image, ImageOps
from torchvision import transforms
import re
import json

In [39]:
model_folder = "C:\\Users\\EndUser\\Desktop\\repos\\models\\"
providers=['CPUExecutionProvider']
sess_options = onnxruntime.SessionOptions()

In [40]:
#Run Onnx Inference

encoder = onnxruntime.InferenceSession(os.path.join(model_folder, 'encoder_model.onnx'), sess_options,
                                                    providers=providers)

decoder = onnxruntime.InferenceSession(os.path.join(model_folder, 'decoder_model.onnx'), sess_options,
                                                    providers=providers)

decoder_with_past = onnxruntime.InferenceSession(os.path.join(model_folder, 'decoder_with_past_model.onnx'),
                                                              sess_options, providers=providers)

In [41]:
# Load processor, tokenizer
processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-cord-v2")
tokenizer = AutoTokenizer.from_pretrained("naver-clova-ix/donut-base-finetuned-cord-v2")



In [42]:
with open(os.path.join(model_folder, "config.json"), 'r') as f:
            config = json.load(f)

In [43]:
print(config)

{'_name_or_path': 'naver-clova-ix/donut-base-finetuned-cord-v2', 'architectures': ['VisionEncoderDecoderModel'], 'decoder': {'_name_or_path': '', 'activation_dropout': 0.0, 'activation_function': 'gelu', 'add_cross_attention': True, 'add_final_layer_norm': True, 'architectures': None, 'attention_dropout': 0.0, 'bad_words_ids': None, 'begin_suppress_tokens': None, 'bos_token_id': 0, 'chunk_size_feed_forward': 0, 'classifier_dropout': 0.0, 'cross_attention_hidden_size': None, 'd_model': 1024, 'decoder_attention_heads': 16, 'decoder_ffn_dim': 4096, 'decoder_layerdrop': 0.0, 'decoder_layers': 4, 'decoder_start_token_id': None, 'diversity_penalty': 0.0, 'do_sample': False, 'dropout': 0.1, 'early_stopping': False, 'encoder_attention_heads': 16, 'encoder_ffn_dim': 4096, 'encoder_layerdrop': 0.0, 'encoder_layers': 12, 'encoder_no_repeat_ngram_size': 0, 'eos_token_id': 2, 'exponential_decay_length_penalty': None, 'finetuning_task': None, 'forced_bos_token_id': None, 'forced_eos_token_id': 2, 'i

In [44]:
#Max_length
config_decoder = config['decoder']
max_length = config_decoder['max_length']
print(max_length)

20


In [45]:
pad_token_id = tokenizer.pad_token_id
eos_token_id = tokenizer.eos_token_id

In [46]:
print(pad_token_id)
print(eos_token_id)

1
2


In [48]:
from datasets import load_dataset
# Load document image
dataset = load_dataset("hf-internal-testing/example-documents", split="test")
#image
image = dataset[2]["image"]

In [66]:
encoder_input_ids = processor(image, return_tensors="np")

In [67]:
print(encoder_input_ids)

{'pixel_values': array([[[[-1., -1., -1., ..., -1., -1., -1.],
         [-1., -1., -1., ..., -1., -1., -1.],
         [-1., -1., -1., ..., -1., -1., -1.],
         ...,
         [-1., -1., -1., ..., -1., -1., -1.],
         [-1., -1., -1., ..., -1., -1., -1.],
         [-1., -1., -1., ..., -1., -1., -1.]],

        [[-1., -1., -1., ..., -1., -1., -1.],
         [-1., -1., -1., ..., -1., -1., -1.],
         [-1., -1., -1., ..., -1., -1., -1.],
         ...,
         [-1., -1., -1., ..., -1., -1., -1.],
         [-1., -1., -1., ..., -1., -1., -1.],
         [-1., -1., -1., ..., -1., -1., -1.]],

        [[-1., -1., -1., ..., -1., -1., -1.],
         [-1., -1., -1., ..., -1., -1., -1.],
         [-1., -1., -1., ..., -1., -1., -1.],
         ...,
         [-1., -1., -1., ..., -1., -1., -1.],
         [-1., -1., -1., ..., -1., -1., -1.],
         [-1., -1., -1., ..., -1., -1., -1.]]]], dtype=float32)}


In [81]:
input_feed = {'pixel_values': encoder_input_ids['pixel_values']}  # Ensuring the input name matches the expected name in the model
out_encoder = encoder.run(None, input_feed)[0]


In [82]:
print(out_encoder)

[[[-0.5388077  -1.1014941   6.182827   ...  0.4769088   3.0601122
   -1.711937  ]
  [-0.3180547  -0.39086515  2.6406336  ... -0.07775511  0.3837932
   -0.56481826]
  [-0.0960921  -0.24702774  3.3573294  ... -0.15306045 -0.04358213
   -0.11617677]
  ...
  [-1.1918149   0.21391855  1.8577919  ...  0.20430681 -0.21195805
    1.2292104 ]
  [ 0.02085814 -0.3813749   5.765299   ...  0.6419094   0.7135578
   -0.3801818 ]
  [-0.5709146  -0.93177253  9.305073   ...  0.84400254  2.6942708
   -1.4047346 ]]]


In [91]:
#decoder Inputs
task_prompt = "<s_cord-v2>"
input_ids = tokenizer(task_prompt, add_special_tokens=False, return_tensors="np").input_ids.astype(dtype='int64') 
print(decoder_input_ids)

[[57579]]


In [86]:
# out_decoder = decoder.run(None, {'input_ids': decoder_input_ids, 'encoder_hidden_states': out_encoder})

In [93]:
scores = ()
# keep track of which sequences are already finished
unfinished_sequences = np.ones(1, dtype='int32')
logits_processor = MinLengthLogitsProcessor(min_length=0, eos_token_id=eos_token_id)

if isinstance(eos_token_id, int):
            eos_token_id = [eos_token_id]
eos_token_id_tensor = np.array(eos_token_id) if eos_token_id is not None else None


In [94]:
past_key_values = None
stop = False

while not stop:
    if past_key_values is None:
        out_decoder = decoder.run(None, {'input_ids': input_ids, 'encoder_hidden_states': out_encoder})
        logits = out_decoder[0]
        past_key_values = {'past_key_value_input_' + str(k): out_decoder[k + 1] for k in
                            range(len(out_decoder[1:]))}

    else:
        out_decoder = decoder_with_past.run(None, {'input_ids': input_ids[:, -1:],
                                                                        **past_key_values})
        logits = out_decoder[0]
        past_key_values = {'past_key_value_input_' + str(i): pkv for i, pkv in enumerate(out_decoder[1:])}
    next_token_logits = logits[:, -1, :]

    next_tokens_scores = logits_processor(input_ids, next_token_logits)
    # argmax
    next_tokens = np.argmax(next_tokens_scores, axis=-1).astype(dtype='int32')
    scores += (next_tokens_scores,)

    if eos_token_id is not None:
        if pad_token_id is None:
            raise ValueError("If `eos_token_id` is defined, make sure that `pad_token_id` is defined.")
        next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences)

    # if eos_token was found in one sentence, set sentence to finished
    if eos_token_id_tensor is not None:
        unfinished_sequences = unfinished_sequences * (
                np.tile(next_tokens, len(eos_token_id_tensor)) != np.prod(eos_token_id_tensor, axis=0))
        # stop when each sentence is finished
        if unfinished_sequences.max() == 0:
            stop = True

    if len(input_ids[0]) >= max_length:
        stop = True

    # update generated ids, model inputs, and length for next step
    input_ids = np.concatenate([input_ids, next_tokens[:, None]], axis=1)

AttributeError: 'numpy.ndarray' object has no attribute 'device'