In [1]:
import torch
from transformers import DonutProcessor,VisionEncoderDecoderModel


# Settings shared by the model config and the processor
max_length = 512
image_size = [960, 720]  # (height, width)

# Fetch the pretrained Donut processor and encoder-decoder model
processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base")
model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base")

# Use the GPU when one is available, otherwise stay on the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Decoding starts from '<s>'; padding uses the tokenizer's pad token
model.config.decoder_start_token_id = processor.tokenizer.convert_tokens_to_ids(["<s>"])[0]
model.config.pad_token_id = processor.tokenizer.pad_token_id

# Tell the encoder/decoder configs about the new input size and target length
model.config.encoder.image_size = image_size  # (height, width)
model.config.decoder.max_length = max_length

# Keep the processor in step with the model: it takes (width, height),
# hence the reversed list; long-axis alignment is switched off
processor.feature_extractor.size = image_size[::-1]
processor.feature_extractor.do_align_long_axis = False

  from .autonotebook import tqdm as notebook_tqdm
  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


## Random sample

In [3]:
from datasets import load_dataset
# Load the demo dataset from the Hub.
dataset = load_dataset("huggingface/cats-image")
# Index the row first and the column second so that only this one image is
# decoded; dataset["test"]["image"][0] materializes the whole image column
# before picking the first element.
image = dataset["test"][0]["image"]

No config specified, defaulting to: cats-image/image
Reusing dataset cats-image (/home/ubuntu/.cache/huggingface/datasets/huggingface___cats-image/image/1.9.0/68fbc793fb10cd165e490867f5d61fa366086ea40c73e549a020103dcb4f597e)
100%|██████████| 1/1 [00:00<00:00, 513.00it/s]


In [5]:
# Convert the image into a pixel tensor on the target device
encoded_image = processor.feature_extractor(image, return_tensors="pt")
pixel_values = encoded_image.pixel_values.to(device)

# Tokenize a toy caption to act as the decoder target
encoded_caption = processor.tokenizer(
    "an image of two cats chilling on a couch", return_tensors="pt"
)
labels = encoded_caption.input_ids.to(device)


In [6]:
# Passing `labels` is enough: the forward pass derives the matching
# decoder_input_ids on its own and exposes the loss on its output.
loss = model(
    pixel_values=pixel_values,
    labels=labels,
).loss

In [7]:
# Display the loss computed by the forward pass in the previous cell.
loss

tensor(7.5516, device='cuda:0', grad_fn=<NllLossBackward0>)

# SROIE example

In [8]:
%%bash
# Abort on the first failing command so that a failed clone is not followed
# by copy/cleanup steps running against a missing or partial checkout.
set -euo pipefail
# drop any checkout left behind by an interrupted earlier run
rm -rf ICDAR-2019-SROIE
# clone repository (shallow: only the current snapshot of the data is needed)
git clone --depth 1 https://github.com/zzzDavid/ICDAR-2019-SROIE.git
# copy data
cp -r ICDAR-2019-SROIE/data ./
# clean up
rm -rf ICDAR-2019-SROIE
rm -rf data/box

Cloning into 'ICDAR-2019-SROIE'...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Updating files: 100% (1980/1980), done.


In [9]:
import os
import json
from pathlib import Path
import shutil

# define paths
base_path = Path("data")
metadata_path = base_path.joinpath("key")
image_path = base_path.joinpath("img")
metadata_file = image_path.joinpath("metadata.jsonl")
# define metadata list
metadata_list = []

# Convert only while the raw annotation folder still exists. This cell deletes
# that folder when it finishes, so without the guard a second run would
# overwrite metadata.jsonl with an empty file and then fail in rmtree.
if metadata_path.is_dir():
    # parse metadata (sorted so the generated file does not depend on
    # filesystem listing order)
    for file_name in sorted(metadata_path.glob("*.json")):
        with open(file_name, "r", encoding="utf-8") as json_file:
            # "text" column: the annotation re-serialized as a JSON string
            text = json.dumps(json.load(json_file))
        # keep the annotation only if the matching image exists
        image_name = f"{file_name.stem}.jpg"
        if image_path.joinpath(image_name).is_file():
            metadata_list.append({"text": text, "file_name": image_name})

    # write jsonline file: one JSON object per line
    with open(metadata_file, "w", encoding="utf-8") as outfile:
        for entry in metadata_list:
            json.dump(entry, outfile)
            outfile.write("\n")

    # remove old meta data
    shutil.rmtree(metadata_path)
else:
    print(f"{metadata_path} not found - leaving {metadata_file} untouched")

In [2]:
from pathlib import Path
from datasets import load_dataset

# Folder layout of the downloaded data
base_path = Path("data")
metadata_path = base_path / "key"
image_path = base_path / "img"

# Build the training split with the generic "imagefolder" loader
dataset = load_dataset("imagefolder", data_dir=image_path, split="train")

# Report the dataset size and its columns
print(
    f"Dataset has {len(dataset)} images",
    f"Dataset features are: {dataset.features.keys()}",
    sep="\n",
)

Resolving data files: 100%|██████████| 627/627 [00:00<00:00, 371970.10it/s]
Using custom data configuration default-b9550808e775c2fe
Reusing dataset imagefolder (/home/ubuntu/.cache/huggingface/datasets/imagefolder/default-b9550808e775c2fe/0.0.0/0fc50c79b681877cc46b23245a6ef5333d036f48db40d53765a68034bc48faff)


Dataset has 626 images
Dataset features are: dict_keys(['image', 'text'])


In [3]:
# Fetch row 0 once and read both columns from it. Indexing the row first
# decodes a single image, whereas dataset["image"][0] decodes every image in
# the dataset before picking the first one.
first_sample = dataset[0]
image = first_sample["image"]
text = first_sample["text"]

In [4]:
# Register the field delimiters (plus the sequence start/end markers) as
# special tokens of the tokenizer.
processor.tokenizer.add_special_tokens(
    {
        "additional_special_tokens": [
            "<s_total>",
            "</s_total>",
            "<s_date>",
            "</s_date>",
            "<s_company>",
            "</s_company>",
            "<s_address>",
            "</s_address>",
            "<s>",
            "</s>",
        ]
    }
)
# Resize the decoder's embedding matrix to the tokenizer's current vocabulary size
model.decoder.resize_token_embeddings(len(processor.tokenizer))

Embedding(57533, 1024)

In [5]:
# Row-first indexing decodes only image 45; dataset["image"][45] would decode
# the entire image column before selecting one element.
image = dataset[45]["image"]
# Hand-written target sequence for this receipt, using the field tags as delimiters.
text = "<s><s_total>$6.90</s_total><s_date>27 MAR 2018</s_date><s_company>UNIHAKKA INTERNATIONAL SDN BHD</s_company><s_address>12, JALAN TAMPOI 7/4,KAWASAN PARINDUSTRIAN TAMPOI,81200 JOHOR BAHRU,JOHOR</s_address></s>"

In [6]:
# Convert the receipt image into the pixel tensor fed to the encoder.
pixel_values = processor.feature_extractor(image, return_tensors="pt").pixel_values.to(device)

# Tokenize the target sequence. `text` already contains <s> ... </s>, so the
# tokenizer must not add its own special tokens. Padding and truncation both
# use `max_length`, the same value stored in model.config.decoder.max_length,
# so a long target can never exceed the decoder's configured length.
input_ids = processor.tokenizer(
    text,
    return_tensors="pt",
    add_special_tokens=False,
    max_length=max_length,
    padding="max_length",
    truncation=True,
).input_ids.to(device)

# Labels are the input ids with every padding position replaced by -100, so
# the model doesn't need to predict the pad token. masked_fill returns a new
# tensor on the same device, leaving `input_ids` untouched.
labels = input_ids.masked_fill(input_ids == processor.tokenizer.pad_token_id, -100)

In [7]:
# Report the tensor shapes going into the model, one per line
print(
    f"Pixel values: {pixel_values.shape}",
    f"Labels: {labels.shape}",
    sep="\n",
)

Pixel values: torch.Size([1, 3, 960, 720])
Labels: torch.Size([1, 512])


In [9]:
# Display the label tensor; padding positions were set to -100 in the cell that built it.
labels

tensor([[    0, 57525, 38282, 55144, 39539, 43112, 57526, 57527, 10558, 35521,
         34453, 57528, 57529, 48503,  3782, 46318, 56023, 28428, 13663, 43529,
         37127, 14706, 57530, 57531, 40474, 35815, 57103, 35794, 38946, 52532,
         44977, 38100, 16908, 35815, 46318, 42252, 37877, 35181, 49023, 48540,
         36757, 53233, 38946, 52532, 44977, 35815,  8739, 46051, 24271, 18081,
          1338, 37127, 25340, 17712, 35815, 34452, 18081,  1338, 57532,     2,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  

In [12]:
# Round-trip check: drop the -100 placeholders, then decode the remaining ids
kept_ids = labels[0][labels[0].ne(-100)]
processor.tokenizer.decode(kept_ids)

'<s><s_total> $6.90</s_total><s_date> 27 MAR 2018</s_date><s_company> UNIHAKKA INTERNATIONAL SDN BHD</s_company><s_address> 12, JALAN TAMPOI 7/4,KAWASAN PARINDUSTRIAN TAMPOI,81200 JOHOR BAHRU,JOHOR</s_address></s>'

In [11]:
# Passing `labels` is enough: the forward pass derives the matching
# decoder_input_ids on its own and exposes the loss on its output.
loss = model(
    pixel_values=pixel_values,
    labels=labels,
).loss

# Show the resulting loss value
loss

tensor(11.7751, grad_fn=<NllLossBackward0>)