In [1]:
!pip install -q git+https://github.com/huggingface/transformers.git
!pip install transformers[torch]
!pip install accelerate -U
!pip install -q datasets seqeval



In [1]:
import torch
from transformers import LayoutLMv3Config, LayoutLMv3Model
import pandas as pd
from datasets import load_dataset
from PIL import Image

  from .autonotebook import tqdm as notebook_tqdm


# Import Dataset

In [2]:
# Root of the local SROIE2019 copy, relative to this notebook. Every per-split
# folder below is derived from it, so the location only has to be edited once.
SROIE_ROOT = './../../dataset/SROIE2019'

# Train split folders.
train_image_folder = f'{SROIE_ROOT}/train/img'
train_entities_folder = f'{SROIE_ROOT}/train/entities'
train_box_folder = f'{SROIE_ROOT}/train/box'

# Test split folders (same layout as train).
test_image_folder = f'{SROIE_ROOT}/test/img'
test_entities_folder = f'{SROIE_ROOT}/test/entities'
test_box_folder = f'{SROIE_ROOT}/test/box'

In [3]:
# Build the dataset splits from the loading script that sits next to this notebook.
# trust_remote_code=True lets `datasets` execute that script; here it is a local file.
# NOTE(review): the *_folder path variables are not passed to this call —
# presumably dataset.py locates the SROIE files on its own; confirm.
dataset = load_dataset("./dataset.py", trust_remote_code=True)

In [4]:
# Display the loaded DatasetDict: split names, column names and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'bboxes', 'ner_tags', 'image'],
        num_rows: 626
    })
    test: Dataset({
        features: ['id', 'tokens', 'bboxes', 'ner_tags', 'image'],
        num_rows: 347
    })
})

In [5]:
# Peek at the bounding boxes stored in the train split.
# NOTE(review): this selects the `bboxes` column of the whole train split, so the
# cell output contains the boxes of every row; indexing a single record first
# (e.g. dataset['train'][0]['bboxes']) would keep the output short.
dataset['train']['bboxes']

[[[72, 25, 326, 64],
  [50, 82, 440, 121],
  [205, 121, 285, 139],
  [110, 144, 383, 163],
  [192, 169, 299, 187],
  [162, 193, 334, 211],
  [217, 216, 275, 233],
  [50, 342, 279, 359],
  [50, 372, 96, 390],
  [165, 372, 342, 389],
  [48, 396, 117, 415],
  [164, 397, 215, 413],
  [49, 423, 122, 440],
  [191, 460, 298, 476],
  [30, 508, 121, 523],
  [200, 507, 247, 521],
  [276, 506, 306, 522],
  [374, 507, 441, 521],
  [69, 531, 102, 550],
  [221, 531, 247, 545],
  [420, 529, 443, 547],
  [27, 570, 137, 583],
  [159, 570, 396, 584],
  [77, 598, 113, 613],
  [138, 597, 148, 607],
  [202, 597, 245, 612],
  [275, 598, 309, 612],
  [411, 596, 443, 613],
  [245, 639, 293, 658],
  [118, 671, 291, 687],
  [408, 669, 443, 684],
  [86, 704, 292, 723],
  [401, 703, 443, 719],
  [205, 744, 243, 765],
  [402, 748, 441, 763],
  [205, 770, 271, 788],
  [412, 772, 443, 786],
  [97, 845, 401, 860],
  [190, 864, 309, 880],
  [142, 883, 353, 901],
  [137, 903, 351, 920],
  [202, 942, 292, 959],
  [163, 

In [6]:
# Display the column schema of the train split (type of each column).
dataset['train'].features

{'id': Value(dtype='int32', id=None),
 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'bboxes': Sequence(feature=Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), length=-1, id=None),
 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'company', 'date', 'address', 'total'], id=None), length=-1, id=None),
 'image': Image(mode=None, decode=True, id=None)}

In [7]:
# Keep the first training record around as a running example for later cells.
example = dataset['train'][0]

In [8]:
# Pull the words, their boxes and their tag ids out of the example record,
# then print the three fields one per line.
words = example["tokens"]
boxes = example["bboxes"]
ner_tags = example["ner_tags"]
print(words, boxes, ner_tags, sep="\n")

['TAN WOON YANN', 'BOOK TA .K(TAMAN DAYA) SDN BND', '789417-W', 'NO.53 55, 57 & 59,  JALAN SAGU 18, ', 'TAMAN DAYA, ', '81100 JOHOR BAHRU, ', 'JOHOR.', 'DOCUMENT NO : TD01167104', 'DATE:', '25/12/2018 8:13:39 PM', 'CASHIER:', 'MANIS', 'MEMBER:', 'CASH BILL', 'CODE/DESC', 'PRICE', 'DISC', 'AMOUNT', 'QTY', 'RM', 'RM', '9556939040116', 'KF MODELLING CLAY KIDDY FISH', '1 PC', '*', '9.000', '0.00', '9.00', 'TOTAL:', 'ROUR DING ADJUSTMENT:', '0.00', 'ROUND D TOTAL (RM):', '9.00', 'CASH', '10.00', 'CHANGE', '1.00', 'GOODS SOLD ARE NOT RETURNABLE OR', 'EXCHANGEABLE', '***', '***', 'THANK YOU', 'PLEASE COME AGAIN !', '9.00']
[[72, 25, 326, 64], [50, 82, 440, 121], [205, 121, 285, 139], [110, 144, 383, 163], [192, 169, 299, 187], [162, 193, 334, 211], [217, 216, 275, 233], [50, 342, 279, 359], [50, 372, 96, 390], [165, 372, 342, 389], [48, 396, 117, 415], [164, 397, 215, 413], [49, 423, 122, 440], [191, 460, 298, 476], [30, 508, 121, 523], [200, 507, 247, 521], [276, 506, 306, 522], [374, 507, 4

# Prepare Dataset

In [9]:
from transformers import AutoProcessor

# The words and their boxes are supplied from the dataset columns when the
# processor is called, so its built-in OCR step is switched off.
processor = AutoProcessor.from_pretrained(
    "microsoft/layoutlmv3-base",
    apply_ocr=False,
)
processor

2024-06-27 16:42:15.311065: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-27 16:42:18.225565: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


LayoutLMv3Processor:
- image_processor: LayoutLMv3ImageProcessor {
  "apply_ocr": false,
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    0.5,
    0.5,
    0.5
  ],
  "image_processor_type": "LayoutLMv3ImageProcessor",
  "image_std": [
    0.5,
    0.5,
    0.5
  ],
  "ocr_lang": null,
  "resample": 2,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "height": 224,
    "width": 224
  },
  "tesseract_config": ""
}

- tokenizer: LayoutLMv3TokenizerFast(name_or_path='microsoft/layoutlmv3-base', vocab_size=50265, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	1: AddedToken("<pad>", rs

In [10]:
# Schema and column names of the raw (not yet encoded) train split.
# NOTE(review): `features` is assigned again further down in the notebook (to the
# schema of the encoded dataset), so any cell that reads it depends on run order.
features = dataset["train"].features
# The raw column names are what gets dropped once the records are encoded.
column_names = dataset["train"].column_names
# Names of the raw columns holding the image, the words, their boxes and their tags.
image_column_name = "image"
text_column_name = "tokens"
boxes_column_name = "bboxes"
label_column_name = "ner_tags"

In [11]:
# Tag names come from the ClassLabel feature of the raw train split.
# The schema is read from the dataset itself rather than from the `features`
# variable: that name is reassigned later in the notebook to a schema that has no
# tag column, so looking the column up through it raised a KeyError whenever this
# cell was re-run after that reassignment.
labels = dataset["train"].features[label_column_name].feature.names
labels

['O', 'company', 'date', 'address', 'total']

In [12]:
# Two-way lookup between integer class ids and tag names.
id2label = dict(enumerate(labels))
label2id = {name: idx for idx, name in id2label.items()}

In [13]:
# Number of tag classes, i.e. the length of the label-name list.
num_labels = len(labels)

In [14]:
def prepare_examples(example):
    """Encode dataset record(s) with the module-level ``processor``.

    Reads the ``image``, ``tokens``, ``bboxes`` and ``ner_tags`` entries of
    ``example`` and passes them to ``processor`` as the images, the words, the
    word boxes and the word-level labels. Truncation is enabled and padding is
    set to ``'max_length'``.

    Args:
        example: mapping with the four keys listed above. In this notebook it is
            called with a single record and, via ``Dataset.map(batched=True)``,
            with a batch of records.

    Returns:
        The encoding object returned by ``processor``, unmodified.
    """
    return processor(
        example['image'],
        example['tokens'],
        boxes=example['bboxes'],
        word_labels=example['ner_tags'],
        truncation=True,
        padding='max_length',
    )

In [15]:
# Try the encoder on the single example record and display what it returns.
a = prepare_examples(example)
a

{'input_ids': [0, 255, 1889, 305, 673, 2191, 854, 15118, 39854, 16667, 479, 530, 1640, 565, 2620, 1889, 16995, 250, 43, 12723, 487, 163, 13457, 262, 5046, 36711, 12, 771, 8228, 4, 4540, 3490, 6, 4981, 359, 5169, 6, 1437, 344, 2118, 1889, 208, 3450, 791, 504, 6, 1437, 38656, 1889, 16995, 250, 6, 1437, 290, 40830, 344, 11979, 3411, 19399, 16271, 791, 6, 1437, 344, 11979, 3411, 4, 34330, 5725, 5382, 8228, 4832, 7463, 2663, 25540, 17573, 211, 8625, 35, 564, 73, 1092, 73, 2464, 290, 35, 1558, 35, 3416, 2784, 230, 13246, 35875, 35, 12408, 1729, 256, 26653, 35, 230, 13246, 163, 10259, 37604, 73, 40429, 347, 4729, 9292, 15421, 347, 3326, 34494, 1209, 6175, 16124, 16124, 361, 3118, 4563, 3416, 3387, 2663, 1549, 229, 597, 256, 38023, 6006, 1862, 5289, 2547, 229, 2688, 495, 975, 274, 14849, 112, 4985, 1009, 361, 4, 151, 321, 4, 612, 361, 4, 612, 36575, 35, 248, 14257, 211, 1862, 4516, 43845, 12613, 35, 321, 4, 612, 248, 18103, 211, 36575, 36, 28580, 3256, 361, 4, 612, 230, 13246, 158, 4, 612, 385

In [16]:
# List the fields the processor produced for the example record.
a.keys()

dict_keys(['input_ids', 'attention_mask', 'bbox', 'labels', 'pixel_values'])

In [17]:
from datasets import Features, Sequence, ClassLabel, Value, Array2D, Array3D

# Schema of the encoded dataset: one entry per field returned by the processor.
# This reassigns `features`, which until here held the schema of the raw split.
features = Features({
    # Token ids and the matching attention mask, as integer sequences.
    'input_ids': Sequence(feature=Value(dtype='int64')),
    'attention_mask': Sequence(feature=Value(dtype='int64')),
    # One 4-value box per token position, fixed at 512 positions.
    'bbox': Array2D(dtype="int64", shape=(512, 4)),
    # Integer label per token position.
    'labels': Sequence(feature=Value(dtype='int64')),
    # Image tensor with 3 channels at 224 x 224.
    'pixel_values': Array3D(dtype="float32", shape=(3, 224, 224)),
})

In [18]:
# Encode a 100-record subset of the train split. The raw columns are removed so
# that only the fields described by `features` remain.
small_train_dataset = (
    dataset['train']
    .select(range(100))
    .map(prepare_examples, batched=True, features=features, remove_columns=column_names)
)

In [20]:
# Encode the full train split, 32 records per call to prepare_examples, and
# replace the raw columns with the fields described by `features`.
train_dataset = dataset['train'].map(
    prepare_examples, batched=True, batch_size=32,
    features=features, remove_columns=column_names,
)

Map: 100%|███████████████████████████████████████████████████████████| 626/626 [00:50<00:00, 12.50 examples/s]


In [None]:
# Encode the test split the same way as the train split: 32 records per call,
# raw columns replaced by the fields described by `features`.
test_dataset = dataset['test'].map(
    prepare_examples, batched=True, batch_size=32,
    features=features, remove_columns=column_names,
)

Map:  46%|███████████████████████████▏                               | 160/347 [00:05<00:05, 31.74 examples/s]