In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import transformers
from transformers import LayoutLMForTokenClassification,\
    LayoutLMTokenizer, AdamW, LayoutLMv2Processor, LayoutLMv2ForTokenClassification
# from tensordict import TensorDict
from datasets import load_dataset
import torch
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from PIL import ImageDraw, Image
import numpy as np
from tqdm import tqdm
from torch.utils.data import DataLoader, Dataset
import logging
# from torchvision.transforms import PILToTensor

import os 
# Step up one directory when the kernel was started inside the `notebooks`
# folder (presumably so the rest of the notebook runs from the repository
# root -- TODO confirm).
# Only the last path component is compared: a substring test on the whole path
# would also match an ancestor directory whose name merely contains
# "notebooks", and would then climb one level higher on every re-run.
if os.path.basename(os.getcwd()) == 'notebooks':
    os.chdir("..")

  from .autonotebook import tqdm as notebook_tqdm


## Understanding the processor 

The objective of this notebook is to analyse outputs from the processor and understand Microsoft's pre-trained tokenizer

### Importing data

In [23]:
# Load the "nielsr/funsd" dataset with the `datasets` library.
# Disabled alternative: load_dataset("katanaml/sroie")
dataset = load_dataset("nielsr/funsd")

Downloading builder script: 100%|██████████| 4.54k/4.54k [00:00<00:00, 8.75MB/s]
Downloading data: 100%|██████████| 16.8M/16.8M [00:02<00:00, 5.73MB/s]
Generating train split: 149 examples [00:01, 148.88 examples/s]
Generating test split: 50 examples [00:00, 175.23 examples/s]


In [246]:
# Display the available splits with their features and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'words', 'bboxes', 'ner_tags', 'image_path'],
        num_rows: 149
    })
    test: Dataset({
        features: ['id', 'words', 'bboxes', 'ner_tags', 'image_path'],
        num_rows: 50
    })
})

### IOB Labelling

<p align="center">
  <img src="https://stevezheng23.github.io/sequence_labeling_tf/ner.iob.example.png" alt="Sublime's custom image"/>
</p>

In [24]:
# List the tag names declared by the `ner_tags` feature of the train split.
(
    dataset['train']
    .features['ner_tags']
    .feature
    .names
)

['O',
 'B-HEADER',
 'I-HEADER',
 'B-QUESTION',
 'I-QUESTION',
 'B-ANSWER',
 'I-ANSWER']

### Importing available processor

This processor has been pre-trained and is responsible for converting words into numbers before we pass the data to the neural network.

In [9]:
# Load the pre-trained processor from the "no_ocr" revision of the checkpoint:
# words and boxes are supplied by the dataset rather than by an OCR engine.
processor = LayoutLMv2Processor.from_pretrained(
    "microsoft/layoutlmv2-base-uncased",
    revision="no_ocr",
)

We can see that this processor has an image processor (which we will not cover in this notebook) and a tokenizer, which will be explained shortly.

In [32]:
# Display the processor's two components: image processor and tokenizer.
processor

LayoutLMv2Processor:
- image_processor: LayoutLMv2ImageProcessor {
  "apply_ocr": false,
  "do_resize": true,
  "feature_extractor_type": "LayoutLMv2FeatureExtractor",
  "image_processor_type": "LayoutLMv2ImageProcessor",
  "ocr_lang": null,
  "resample": 2,
  "size": {
    "height": 224,
    "width": 224
  },
  "tesseract_config": ""
}

- tokenizer: LayoutLMv2TokenizerFast(name_or_path='microsoft/layoutlmv2-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

### Input sizes

In [10]:
# First training example; it is reused by the cells that follow.
example = dataset["train"][0]

In [66]:
# Number of words in the example.
len(example['words'])

135

In [200]:
# Number of bounding boxes in the example (compare with the word count).
len(example['bboxes'])

135

In [25]:
# Unpack the raw example: words with their bounding boxes and NER tags.
words = example['words']
boxes = example['bboxes']
word_labels = example['ner_tags']

# The LayoutLMv2 documentation feeds the processor an RGB image, so convert
# explicitly instead of relying on the mode the file happens to be stored in.
# `convert` loads the pixels into a new image, so the file handle can be
# closed as soon as the `with` block ends.
with Image.open(example['image_path']) as image_file:
    image = image_file.convert("RGB")

# Encode image, words, boxes and labels together. `padding="max_length"` and
# `truncation=True` give fixed-length outputs, returned as PyTorch tensors.
encoded_inputs = processor(
    image,
    words,
    boxes=boxes,
    word_labels=word_labels,
    padding="max_length",
    truncation=True,
    return_tensors="pt",
)

In [None]:
# Names of the fields returned by the processor.
encoded_inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'bbox', 'labels', 'image'])

## Tokenizer

In [30]:
# Stand-alone LayoutLM tokenizer used for the plain-text experiments below.
# Note: this is loaded from a different checkpoint ("layoutlm-base-uncased")
# than the processor above ("layoutlmv2-base-uncased").
tokenizer = LayoutLMTokenizer.from_pretrained("microsoft/layoutlm-base-uncased")

Downloading (…)solve/main/vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 1.45MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 112/112 [00:00<00:00, 528kB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 170/170 [00:00<00:00, 393kB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 606/606 [00:00<00:00, 2.49MB/s]


In [49]:
# Display the tokenizer configuration (vocabulary size, special tokens, ...).
tokenizer

LayoutLMTokenizer(name_or_path='microsoft/layoutlm-base-uncased', vocab_size=30522, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

### Word pieces

In [139]:
# Split a sentence into tokens; every word here maps to a single token.
tokenizer.tokenize("hello I am Pedro. I am a cool guy")

['hello', 'i', 'am', 'pedro', '.', 'i', 'am', 'a', 'cool', 'guy']

In [141]:
# Same idea with words that get split: "coolest" becomes 'cool' + '##est',
# and "l'X" becomes 'l', "'", 'x'.
tokenizer.tokenize("hello I am Pedro. I am the coolest guy at l'X")

['hello',
 'i',
 'am',
 'pedro',
 '.',
 'i',
 'am',
 'the',
 'cool',
 '##est',
 'guy',
 'at',
 'l',
 "'",
 'x']

### Vocabulary

In [167]:
# Build an id -> token lookup list so that `vocab[token_id]` returns the token.
# Sort by id explicitly instead of relying on the iteration order of the
# `tokenizer.vocab` mapping, which is not guaranteed to follow the ids.
vocab = [
    token
    for token, _ in sorted(tokenizer.vocab.items(), key=lambda item: item[1])
]

In [187]:
# Number of entries in the vocabulary.
len(vocab)

30522

In [168]:
# Encode a plain sentence with the tokenizer's default settings.
tokenized_inputs = tokenizer("hello, I am having fun today")

In [169]:
# Fields returned by the tokenizer for plain text.
tokenized_inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

### Input IDS

In [None]:
# Shape of the token ids produced by the processor for the example.
encoded_inputs['input_ids'].shape

torch.Size([1, 512])

Why do we have an output of size 512 when we passed a 135-word text? 

Non zero input ids

In [None]:
# Count the positions whose token id is not 0.
torch.count_nonzero(encoded_inputs['input_ids'])

tensor(235)

We'll have a look at a simpler example

In [250]:
# Encode a short sentence without any padding.
tokenized_inputs = tokenizer("hello, I am having fun today")

In [251]:
# Token ids of the unpadded sentence.
tokenized_inputs['input_ids']

[101, 7592, 1010, 1045, 2572, 2383, 4569, 2651, 102]

In [254]:
# Map every id back to its token by indexing the vocabulary list.
[vocab[token_id] for token_id in tokenized_inputs['input_ids']]

['[CLS]', 'hello', ',', 'i', 'am', 'having', 'fun', 'today', '[SEP]']

In [148]:
# Same sentence, this time padded up to the tokenizer's maximum length.
tokenized_inputs = tokenizer(
    "hello, I am having fun today",
    padding="max_length",
)

In [149]:
# Token ids after padding: the real ids are followed by a run of zeros.
tokenized_inputs['input_ids']

[101,
 7592,
 1010,
 1045,
 2572,
 2383,
 4569,
 2651,
 102,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,

In [150]:
# Length of the padded sequence.
len(tokenized_inputs['input_ids'])

512

### Labels

The same thing happens with the labels: we passed 135 `ner_tags`, but the processor returns a label tensor of length 512.

In [243]:
# Word-level labels of the raw example (one integer tag per word).
example['ner_tags']

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 3,
 3,
 3,
 5,
 1,
 3,
 3,
 3,
 5,
 1,
 3,
 3,
 5,
 1,
 3,
 3,
 3,
 5,
 1,
 3,
 3,
 3,
 5,
 1,
 3,
 3,
 3,
 5,
 1,
 3,
 3,
 3,
 5,
 1,
 3,
 3,
 5,
 1,
 3,
 3,
 5,
 1,
 3,
 3,
 3,
 5,
 1,
 3,
 3,
 5,
 1,
 3,
 3,
 5,
 1,
 3,
 3,
 3,
 5,
 1,
 3,
 3,
 3,
 5,
 1,
 3,
 3,
 3,
 5,
 1,
 3,
 3,
 3,
 5,
 1,
 3,
 3,
 5,
 1,
 3,
 3,
 5,
 1,
 3,
 3,
 3,
 5,
 1,
 3,
 3,
 3,
 5,
 1,
 3,
 3,
 5,
 1,
 3,
 3,
 3,
 5,
 13,
 13,
 12,
 12,
 14,
 14,
 11,
 11,
 22,
 22,
 22]

In [233]:
# Token-level labels returned by the processor.
encoded_inputs['labels']

tensor([[-100,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            3, -100,    3, -100,    3,    5, -100, -100,    1,    3, -100,    3,
         -100,    3, -100,    5, -100, -100,    1,    3, -100, -100,    3, -100,
         -100,    5, -100, -100,    1,    3,    3,    3,    5, -100, -100,    1,
            3, -100,    3, -100,    3, -100,    5, -100, -100,    1,    3,    3,
            3,    5,    1,    3,    3,    3,    5, -100, -100,    1,    3,    3,
            5, -100, -100,    1,    3,    3,    5, -100, -100,    1,    3, -100,
            3, -100,    3,    5, -100, -100,    1,    3, -100,    3, -100,    5,
         -100, -100,    1,    3, -100, -100,    3, -100,    5, -100, -100,    1,
            3, -100,    3, -100,    3, -100,    5, -100, -100, -100,    1,    3,
         -100,    3, -100,    3, -100,    5, -100, -100,    1,    3, -100,    3,
         -100,    3,    5, -

In [235]:
# Sequence length of the label tensor (size of its second axis).
encoded_inputs['labels'].size(1)

512

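The processor assigns -100 to every position that should be ignored when computing the loss: the special tokens (`[CLS]`, `[SEP]`, `[PAD]`) and every word piece after the first piece of a word. These tokens are all in the vocabulary; they simply do not get a label of their own. Only the first token of each word keeps that word's label, which is why exactly 135 labels (one per word) are different from -100.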

In [None]:
# Count the positions that carry a real label, i.e. anything other than -100.
encoded_inputs['labels'].ne(-100).sum()

tensor(135)

In [None]:
# Number of word-level labels in the raw example, for comparison.
len(example['ner_tags'])

135

In [259]:
# Token ids found at the positions whose label is -100.
ignored_positions = encoded_inputs['labels'].eq(-100)
encoded_inputs['input_ids'][ignored_positions]

tensor([  101,  2072,  3126,  1010,  2199,  2243, 20142,  2072,  1010,  2199,
         7377,  3489,  2860,  2497,  1010,  2199,  1010,  2199,  2072, 14852,
         6790,  1010,  2199,  1010,  2199,  1010,  2199,  1010,  2199, 14852,
         2099,  1010,  2199,  6979,  3070,  1010,  2199,  8737,  2063,  3070,
         1010,  2199,  6979,  2953,  2378,  1010,  2199,  1012,  2072,  3070,
         2497,  1010,  2199,  2243, 24930,  2575,  1010,  2199, 14852,  2140,
         3900,  1010,  2199,  1010,  2199,  8197,  1010,  2199,  6979,  2953,
         2378,  1010,  2199, 24597,  1010,  2199,  9028,  1010,  2199,  1011,
         2561,  1010,  4090,  2575,  1010,  2199,  1010, 20317,  2497,  2487,
         1010,  6353,  2629,  3429,  1010,  5354,  2487,  1010,  5174,   102,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0, 

In [257]:
# Keep the token ids whose label is -100 for inspection in the next cell.
weird_entries = encoded_inputs['input_ids'][encoded_inputs['labels'].eq(-100)]

In [258]:
# Translate those ids back into tokens through the vocabulary list.
[vocab[token_id] for token_id in weird_entries.tolist()]

['[CLS]',
 '##i',
 '##ur',
 ',',
 '000',
 '##k',
 '##gil',
 '##i',
 ',',
 '000',
 '##sha',
 '##ke',
 '##w',
 '##b',
 ',',
 '000',
 ',',
 '000',
 '##i',
 '##yam',
 '##ata',
 ',',
 '000',
 ',',
 '000',
 ',',
 '000',
 ',',
 '000',
 '##yam',
 '##r',
 ',',
 '000',
 '##hu',
 '##ng',
 ',',
 '000',
 '##mp',
 '##e',
 '##ng',
 ',',
 '000',
 '##hu',
 '##or',
 '##in',
 ',',
 '000',
 '.',
 '##i',
 '##ng',
 '##b',
 ',',
 '000',
 '##k',
 '##gang',
 '##6',
 ',',
 '000',
 '##yam',
 '##l',
 '##ja',
 ',',
 '000',
 ',',
 '000',
 '##pi',
 ',',
 '000',
 '##hu',
 '##or',
 '##in',
 ',',
 '000',
 '##bek',
 ',',
 '000',
 '##war',
 ',',
 '000',
 '-',
 'total',
 ',',
 '34',
 '##6',
 ',',
 '000',
 ',',
 '950',
 '##b',
 '##1',
 ',',
 '69',
 '##5',
 '45',
 ',',
 '59',
 '##1',
 ',',
 '600',
 '[SEP]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',


### Token type IDS

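Token type IDs indicate which sequence each token belongs to when two sequences are encoded together: 0 for the first sequence (including `[CLS]` and the first `[SEP]`) and 1 for the second sequence. Padding positions are also set to 0.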

In [None]:
# Shape of the token type ids returned by the processor.
encoded_inputs['token_type_ids'].shape

torch.Size([1, 512])

In [None]:
# Count the positions whose token type id is 0.
encoded_inputs['token_type_ids'].eq(0).sum()

tensor(512)

In [160]:
# Encode two phrases together as a single pair of sequences.
tokenized_inputs = tokenizer(
    'this is phrase 1',
    'this is phrase 2',
)

In [161]:
# Token ids of the encoded pair.
tokenized_inputs['input_ids']

[101, 2023, 2003, 7655, 1015, 102, 2023, 2003, 7655, 1016, 102]

In [162]:
# Token type ids of the encoded pair.
tokenized_inputs['token_type_ids']

[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1]

In [173]:
# Same pair of phrases, padded up to the tokenizer's maximum length.
first_phrase, second_phrase = 'this is phrase 1', 'this is phrase 2'
tokenized_inputs = tokenizer(first_phrase, second_phrase, padding="max_length")

In [174]:
# Token type ids of the padded pair.
tokenized_inputs['token_type_ids']

[0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [175]:
# Length of the padded token type id list.
len(tokenized_inputs['token_type_ids'])

512

### Attention mask

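The attention mask tells the model which positions hold real tokens (1) and which are only padding (0) and should be ignored. It is needed because sentences of different lengths are padded to a common length.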

In [176]:
# Encode a batch of two phrases of different lengths; `padding=True` pads the
# shorter one so both rows end up with the same length.
phrases = [
    "this is a small phrase",
    "this is a big phrase, look how I am biiiig",
]
tokenized_inputs = tokenizer(phrases, padding=True)

In [177]:
# Attention masks of both phrases (one row per phrase).
tokenized_inputs['attention_mask']

[[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]

In [178]:
# Same batch, but every phrase is padded to the tokenizer's maximum length.
phrases = [
    "this is a small phrase",
    "this is a big phrase, look how I am biiiig",
]
tokenized_inputs = tokenizer(phrases, padding="max_length")

In [180]:
# Attention mask of the first (shorter) phrase.
tokenized_inputs['attention_mask'][0]

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [182]:
# Attention mask of the second (longer) phrase.
tokenized_inputs['attention_mask'][1]

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


### BBoxes

In [226]:
# Number of boxes before processing versus sequence length after processing.
(len(example['bboxes']), encoded_inputs['bbox'].size(1))

(135, 512)

In [219]:
# First five bounding boxes of the raw example.
example['bboxes'][:5]

[[296, 288, 312, 300],
 [298, 310, 312, 322],
 [298, 330, 314, 342],
 [300, 351, 317, 362],
 [298, 370, 317, 382]]

In [220]:
# First five encoded boxes: an all-zero box comes first, so the raw boxes
# appear shifted by one position.
encoded_inputs['bbox'][0, :5]

tensor([[  0,   0,   0,   0],
        [296, 288, 312, 300],
        [298, 310, 312, 322],
        [298, 330, 314, 342],
        [300, 351, 317, 362]])

In [None]:
# Shape of the encoded boxes: four coordinates per sequence position.
encoded_inputs['bbox'].shape

torch.Size([1, 512, 4])

In [None]:
# Box stored at the very first sequence position.
encoded_inputs['bbox'][0, 0]

tensor([0, 0, 0, 0])

Non zero bboxes

In [None]:
# Element-wise comparison with the all-zero box, summed over the sequence
# axis. The result holds one count per box coordinate (shape [1, 4]), not a
# single count of boxes.
zero_box = torch.tensor([0, 0, 0, 0])
encoded_inputs['bbox'].ne(zero_box).sum(dim=1)

tensor([[234, 234, 234, 234]])