In [2]:
import os
import json
import cv2 as cv
import argparse
import torch
import torch.nn as nn

import layoutlm
from layoutlm.data import FunsdDataset

from transformers import (
    WEIGHTS_NAME,
    AdamW,
    BertConfig,
    BertForTokenClassification,
    BertTokenizer,
    RobertaConfig,
    RobertaForTokenClassification,
    RobertaTokenizer,
    get_linear_schedule_with_warmup,
)

In [23]:
# Sanity check: display which class the imported `BertTokenizer` name resolves to.
BertTokenizer

transformers.tokenization_bert.BertTokenizer

In [3]:
# Notebook stand-in for parsed command-line arguments: a single Namespace that
# is later passed as the first positional argument to FunsdDataset.
parser = argparse.Namespace(
    # Data and model selection
    data_dir="data/funsd",
    model_type="layoutlm",
    model_name_or_path="layoutlm-base-uncased",
    do_lower_case=True,
    max_seq_length=512,
    # Training-loop settings
    num_train_epochs=100.0,
    logging_steps=10,
    save_steps=-1,
    output_dir=None,
    labels="data/funsd/labels.txt",
    per_gpu_train_batch_size=16,
    per_gpu_eval_batch_size=16,
    # Runtime settings
    local_rank=-1,
    overwrite_cache=True,
)

In [4]:
def get_labels(path):
    """Read the label vocabulary from a text file holding one label per line.

    Args:
        path: Path to the labels file.

    Returns:
        list[str]: Labels in file order, with surrounding whitespace removed
        and blank lines skipped. ``"O"`` is prepended when the file does not
        already list it.
    """
    # Explicit encoding so the result does not depend on the platform default.
    with open(path, "r", encoding="utf-8") as f:
        raw_lines = f.read().splitlines()
    # Blank or whitespace-padded lines would otherwise become bogus label classes.
    labels = [line.strip() for line in raw_lines if line.strip()]
    if "O" not in labels:
        labels = ["O"] + labels
    return labels

# Label vocabulary read from the file configured in `parser.labels`.
labels = get_labels(parser.labels)
# labels
# Label id used for positions that must not contribute to the loss: reuse
# CrossEntropyLoss's own ignore_index so the two always agree.
pad_token_label_id = nn.CrossEntropyLoss().ignore_index
# NOTE(review): the checkpoint name is hard-coded here and neither
# `parser.model_name_or_path` nor `parser.do_lower_case` is consulted — confirm
# this is intentional.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

'[unused9]'

In [5]:
# Build the training split. `parser` is handed over positionally as the
# settings object; everything else is passed by keyword.
dataset = FunsdDataset(
    parser,
    tokenizer=tokenizer,
    labels=labels,
    pad_token_label_id=pad_token_label_id,
    mode="train",
)

In [6]:
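# For reference only (not executed here): command-line invocation of
# run_seq_labeling.py; most of its flags correspond to the `parser` attributes
# set earlier in this notebook.
# python run_seq_labeling.py  --data_dir data \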
#                             --model_type layoutlm \
#                             --model_name_or_path path/to/pretrained/model/directory \
#                             --do_lower_case \
#                             --max_seq_length 512 \
#                             --do_train \
#                             --num_train_epochs 100.0 \
#                             --logging_steps 10 \
#                             --save_steps -1 \
#                             --output_dir path/to/output/directory \
#                             --labels data/labels.txt \
#                             --per_gpu_train_batch_size 16 \
#                             --per_gpu_eval_batch_size 16 \
#                             --fp16

In [14]:
# Unpack the five per-token fields of the dataset item at index 1.
(
    input_ids,
    input_masks,
    segment_ids,
    label_ids,
    bboxes,
) = dataset[1]

In [15]:
# Look at a single token position across all five fields of the sample.
idx = 100
tuple(
    field[idx]
    for field in (input_ids, input_masks, segment_ids, label_ids, bboxes)
)

In [49]:
# Display the token ids of the unpacked sample (swap in the commented-out name
# below to inspect the segment ids instead).
# segment_ids
input_ids

tensor([  101,  6904,  2595,  1024,  3058,  1024,  2013,  1024,  3715,  5963,
         1024,  7799,  2194,  6904,  2595,  2053,  1012,  7928,  2053,  1024,
         2171,  1024,  2748, 13964,  1024, 19843, 26224, 26976,  2683,  2581,
         2620, 10180,  1004, 13137,  2078,  6904,  2595,  3104,  7123,  5003,
         4328,  3979,  2609,  1010,  2456,  2034,  2586,  3361,  2415,  1010,
         3263,  2148, 20377,  3540,  9654,  8459,  5631,  1010,  3516, 27533,
        21486,  1006, 20405,  1007,  4278,  1020,  5757,  3263,  2380,  3927,
         2047,  7677,  8024,  6396, 28707,  2575,  4601,  2683,  2509, 18164,
        28135,  1011, 25535,  2692,  4278,  1048,  1012,  2395,  1050,  9378,
         9328,  2669,  5887,  2456,  2629,  8698,  2475, 16798,  4261,  2487,
        24902,  2692,  3486,  2225, 11333,  9102,  3190,  1012,  6335, 24622,
        24096, 21036,  1011,  5388,  2620, 23712,  2692,  2538,  3927,  5125,
         9395,  4293,  3000,  1010,  2605,  3943,  1011,  5187, 

In [18]:
# Display the token ids of the unpacked sample.
# NOTE(review): an earlier cell already displays `input_ids`; one of the two can be removed.
input_ids

tensor([  101,  6904,  2595,  1024,  3058,  1024,  2013,  1024,  3715,  5963,
         1024,  7799,  2194,  6904,  2595,  2053,  1012,  7928,  2053,  1024,
         2171,  1024,  2748, 13964,  1024, 19843, 26224, 26976,  2683,  2581,
         2620, 10180,  1004, 13137,  2078,  6904,  2595,  3104,  7123,  5003,
         4328,  3979,  2609,  1010,  2456,  2034,  2586,  3361,  2415,  1010,
         3263,  2148, 20377,  3540,  9654,  8459,  5631,  1010,  3516, 27533,
        21486,  1006, 20405,  1007,  4278,  1020,  5757,  3263,  2380,  3927,
         2047,  7677,  8024,  6396, 28707,  2575,  4601,  2683,  2509, 18164,
        28135,  1011, 25535,  2692,  4278,  1048,  1012,  2395,  1050,  9378,
         9328,  2669,  5887,  2456,  2629,  8698,  2475, 16798,  4261,  2487,
        24902,  2692,  3486,  2225, 11333,  9102,  3190,  1012,  6335, 24622,
        24096, 21036,  1011,  5388,  2620, 23712,  2692,  2538,  3927,  5125,
         9395,  4293,  3000,  1010,  2605,  3943,  1011,  5187, 

In [41]:
# Print every token of the sample next to its bounding box to eyeball the
# text/layout alignment.
# Use the tokenizer's public id -> token conversion rather than indexing its
# internal `ids_to_tokens` mapping directly.
words = tokenizer.convert_ids_to_tokens(input_ids.tolist())
for token, bbox in zip(words, bboxes):
    print(f'{token} - {bbox.tolist()}')

[CLS] - [0, 0, 0, 0]
fa - [127, 177, 156, 192]
##x - [127, 177, 156, 192]
: - [127, 177, 156, 192]
date - [99, 337, 153, 351]
: - [99, 337, 153, 351]
from - [99, 286, 155, 301]
: - [99, 286, 155, 301]
charge - [606, 288, 721, 305]
##back - [606, 288, 721, 305]
: - [606, 288, 721, 305]
recipient - [140, 404, 233, 418]
company - [311, 402, 395, 419]
fa - [527, 404, 563, 418]
##x - [527, 404, 563, 418]
no - [564, 405, 598, 418]
. - [564, 405, 598, 418]
comments - [102, 532, 194, 546]
no - [713, 934, 745, 952]
: - [713, 934, 745, 952]
name - [342, 936, 396, 953]
: - [342, 936, 396, 953]
yes - [259, 936, 293, 951]
confirmation - [139, 937, 255, 952]
: - [139, 937, 255, 952]
207 - [909, 831, 936, 926]
##49 - [909, 831, 936, 926]
##56 - [909, 831, 936, 926]
##9 - [909, 831, 936, 926]
##7 - [909, 831, 936, 926]
##8 - [909, 831, 936, 926]
winston - [102, 95, 367, 151]
& - [374, 95, 419, 148]
straw - [424, 99, 647, 149]
##n - [424, 99, 647, 149]
fa - [681, 137, 717, 155]
##x - [681, 137, 717, 15

In [21]:
# Display the sample's input mask. The visible output is all 1s; presumably 0
# marks padding positions — TODO confirm against FunsdDataset.
input_masks

tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [20]:
# Display the per-token label ids. The -100 entries equal pad_token_label_id
# (CrossEntropyLoss's ignore_index); in this sample they appear on [CLS] and,
# it seems, on the non-first pieces of a split word — TODO confirm.
label_ids

tensor([-100,   12, -100, -100,   12, -100,   12, -100,   12, -100, -100,   12,
          12,   12, -100,   12, -100,   12,   12, -100,   12, -100,   12,   11,
        -100,    9, -100, -100, -100, -100, -100,    1,    7,    4, -100,    1,
        -100,    7,    4,    9, -100,    9,    9, -100,    9,    9,    9,    9,
           9, -100,    9,    9,    9, -100, -100,    9,    9, -100,    9,    9,
        -100,    0, -100, -100,    6,    6,    3,    9,    9,    9,    9, -100,
        -100,    9,    9, -100,    9, -100, -100,    9,    9, -100, -100, -100,
           9,    9, -100,    9,    9,    9, -100, -100,    9,    9, -100,    9,
        -100,    9,    9, -100,    9, -100,    9,    9,    9, -100,    9, -100,
           9,    9, -100,    9, -100,    9, -100,    9, -100,    9,    9,    9,
           9,    9,    9, -100,    9,    9, -100,    9,    9,    9,    9,    2,
        -100,    5, -100,    0,    6, -100,    3, -100,    0,    3, -100, -100,
           0, -100,    6, -100,    3,   

In [12]:
# Display the sample's segment ids; every visible entry in the output is 0.
segment_ids

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [22]:
# Display the per-token bounding boxes: one 4-value row per token, with
# [0, 0, 0, 0] on the first ([CLS]) and trailing rows of the output.
# NOTE(review): the coordinate order (presumably x0, y0, x1, y1) is not shown here — confirm.
bboxes

tensor([[  0,   0,   0,   0],
        [127, 177, 156, 192],
        [127, 177, 156, 192],
        ...,
        [  0,   0,   0,   0],
        [  0,   0,   0,   0],
        [  0,   0,   0,   0]])