In [3]:
from datasets import load_dataset
# Load the demo dataset from the Hugging Face Hub
dataset = load_dataset("huggingface/cats-image")
# Index the row first and the column second so that only this one image is
# decoded; `dataset["test"]["image"][0]` decodes the entire column up front.
image = dataset["test"][0]["image"]

No config specified, defaulting to: cats-image/image
Reusing dataset cats-image (/home/ubuntu/.cache/huggingface/datasets/huggingface___cats-image/image/1.9.0/68fbc793fb10cd165e490867f5d61fa366086ea40c73e549a020103dcb4f597e)
100%|██████████| 1/1 [00:00<00:00, 513.00it/s]


In [4]:
import torch
from transformers import DonutProcessor,VisionEncoderDecoderModel


# Load the processor (image feature extractor + tokenizer) and the pretrained model
processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base")
model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base")

# Run on the GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model.to(device)

# Settings shared by the model config and the feature extractor below
max_length = 512
image_size = [960, 720]  # (height, width)

# Special token ids are taken from the processor's tokenizer
model.config.pad_token_id = processor.tokenizer.pad_token_id
model.config.decoder_start_token_id = processor.tokenizer.convert_tokens_to_ids('<s>')

# Keep the model config in sync with the chosen resolution and sequence length
model.config.encoder.image_size = image_size  # (height, width)
model.config.decoder.max_length = max_length

# The feature extractor takes the size reversed, i.e. (width, height)
processor.feature_extractor.do_align_long_axis = False
processor.feature_extractor.size = image_size[::-1]

In [5]:
# Turn the image into PyTorch model inputs and move them to the target device
image_features = processor.feature_extractor(image, return_tensors="pt")
pixel_values = image_features.pixel_values.to(device)

# Tokenize the target caption; its token ids are used as the labels
caption_encoding = processor.tokenizer("an image of two cats chilling on a couch", return_tensors="pt")
labels = caption_encoding.input_ids.to(device)


In [6]:
# Supplying `labels` is enough: the forward pass builds decoder_input_ids itself
outputs = model(pixel_values=pixel_values, labels=labels)
loss = outputs.loss

In [7]:
# display the loss tensor computed in the previous cell
loss

tensor(7.5516, device='cuda:0', grad_fn=<NllLossBackward0>)

# SROIE example

In [8]:
%%bash 
# stop at the first failing command so a failed clone/copy cannot fall through
# to the cleanup steps and leave the cell looking successful
set -euo pipefail
# clone repository (shallow: the checkout is deleted below, history is not needed)
git clone --depth 1 https://github.com/zzzDavid/ICDAR-2019-SROIE.git
# copy data
cp -r ICDAR-2019-SROIE/data ./
# clean up
rm -rf ICDAR-2019-SROIE
rm -rf data/box

Cloning into 'ICDAR-2019-SROIE'...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Updating files: 100% (1980/1980), done.


In [9]:
import os
import json
from pathlib import Path
import shutil

# define paths
base_path = Path("data")
metadata_path = base_path.joinpath("key")
image_path = base_path.joinpath("img")


def collect_metadata(metadata_dir, image_dir):
    """Build one metadata record per annotation file that has a matching image.

    Args:
        metadata_dir: directory holding the ``*.json`` annotation files.
        image_dir: directory holding the ``*.jpg`` images.

    Returns:
        A list of ``{"text": <json string>, "file_name": "<stem>.jpg"}`` dicts,
        ordered by annotation file name.
    """
    records = []
    # sorted() makes the record order reproducible; raw glob order depends on the filesystem
    for json_path in sorted(metadata_dir.glob("*.json")):
        image_name = f"{json_path.stem}.jpg"
        # annotations without a corresponding image are skipped
        if not image_dir.joinpath(image_name).is_file():
            continue
        with open(json_path, "r", encoding="utf-8") as json_file:
            data = json.load(json_file)
        # the "text" column stores the whole annotation as a JSON string
        records.append({"text": json.dumps(data), "file_name": image_name})
    return records


def write_jsonl(entries, target):
    """Write ``entries`` to ``target`` as JSON Lines (one JSON document per line)."""
    with open(target, "w", encoding="utf-8") as outfile:
        for entry in entries:
            outfile.write(json.dumps(entry) + "\n")


jsonl_path = image_path.joinpath("metadata.jsonl")
if metadata_path.is_dir():
    # first run: convert the annotations, then remove the old metadata folder
    metadata_list = collect_metadata(metadata_path, image_path)
    write_jsonl(metadata_list, jsonl_path)
    shutil.rmtree(metadata_path)
elif jsonl_path.is_file():
    # re-run: the annotations were already converted and removed, so reload the
    # existing file instead of overwriting it with an empty one
    with open(jsonl_path, "r", encoding="utf-8") as infile:
        metadata_list = [json.loads(line) for line in infile if line.strip()]
else:
    metadata_list = []
    print(f"Nothing to convert: neither {metadata_path} nor {jsonl_path} exists")

In [None]:
from datasets import load_dataset

# Build the "train" split from the image folder prepared above
dataset = load_dataset("imagefolder", data_dir=image_path, split="train")

# Report the number of examples and the available columns
num_images = len(dataset)
feature_names = dataset.features.keys()
print(f"Dataset has {num_images} images")
print(f"Dataset features are: {feature_names}")

In [11]:
# Fetch the first row and read both columns from it: indexing the row first
# decodes a single image, whereas `dataset["image"][0]` decodes every image
# in the dataset before picking the first one.
sample = dataset[0]
image = sample["image"]
text = sample["text"]

In [12]:
# Turn the receipt image into PyTorch model inputs on the target device
image_features = processor.feature_extractor(image, return_tensors="pt")
pixel_values = image_features.pixel_values.to(device)

# Tokenize the JSON annotation string; its token ids are used as the labels
text_encoding = processor.tokenizer(text, return_tensors="pt")
labels = text_encoding.input_ids.to(device)


In [13]:
# Supplying `labels` is enough: the forward pass builds decoder_input_ids itself
outputs = model(pixel_values=pixel_values, labels=labels)
loss = outputs.loss

loss

tensor(3.9384, device='cuda:0', grad_fn=<NllLossBackward0>)