In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import transformers
import pandas as pd
from transformers import LayoutLMForTokenClassification
from transformers import LayoutLMForSequenceClassification
from transformers import LayoutLMTokenizer
from transformers import AdamW
from datasets import load_dataset
from datasets import Dataset
from datasets import Features, Sequence, ClassLabel, Value, Array2D
import pytesseract
import torch
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from PIL import ImageDraw, Image
import numpy as np

import os 
if 'notebooks' in os.getcwd():
    os.chdir("..")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
device

device(type='cuda')

In [5]:
dataset = load_dataset("katanaml/cord", )

Repo card metadata block was not found. Setting CardData to empty.


## Creating PyTorch Datasets, DataLoader

In [6]:
example = dataset['train'][0]

In [7]:
example.keys()

dict_keys(['id', 'words', 'bboxes', 'ner_tags', 'image_path'])

In [8]:
dataset_pd = pd.DataFrame.from_dict(dataset['train'])

In [9]:
dataset_hf = Dataset.from_pandas(dataset_pd)

In [10]:
n_labels = 23
model = LayoutLMForTokenClassification.from_pretrained(
    "microsoft/layoutlm-base-uncased",
    num_labels = n_labels
)
model.to(device)

Some weights of LayoutLMForTokenClassification were not initialized from the model checkpoint at microsoft/layoutlm-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LayoutLMForTokenClassification(
  (layoutlm): LayoutLMModel(
    (embeddings): LayoutLMEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (x_position_embeddings): Embedding(1024, 768)
      (y_position_embeddings): Embedding(1024, 768)
      (h_position_embeddings): Embedding(1024, 768)
      (w_position_embeddings): Embedding(1024, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): LayoutLMEncoder(
      (layer): ModuleList(
        (0-11): 12 x LayoutLMLayer(
          (attention): LayoutLMAttention(
            (self): LayoutLMSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
 

In [14]:
tokenizer = LayoutLMTokenizer.from_pretrained("microsoft/layoutlm-base-uncased")

In [15]:
def encode_example(example, max_seq_length=512, pad_token_box=[0, 0, 0, 0]):
  words = example['words']
  normalized_word_boxes = example['bboxes']

  assert len(words) == len(normalized_word_boxes)

  token_boxes = []
  for word, box in zip(words, normalized_word_boxes):
      word_tokens = tokenizer.tokenize(word)
      token_boxes.extend([box] * len(word_tokens))
  
  # Truncation of token_boxes
  special_tokens_count = 2 
  if len(token_boxes) > max_seq_length - special_tokens_count:
      token_boxes = token_boxes[: (max_seq_length - special_tokens_count)]
  
  # add bounding boxes of cls + sep tokens
  token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]]
  
  encoding = tokenizer(' '.join(words), padding='max_length', truncation=True)
  # Padding of token_boxes up the bounding boxes to the sequence length.
  input_ids = tokenizer(' '.join(words), truncation=True)["input_ids"]
  padding_length = max_seq_length - len(input_ids)
  token_boxes += [pad_token_box] * padding_length
  encoding['bboxes'] = token_boxes
  encoding['label'] = example['ner_tags']

  assert len(encoding['input_ids']) == max_seq_length
  assert len(encoding['attention_mask']) == max_seq_length
  assert len(encoding['token_type_ids']) == max_seq_length
  assert len(encoding['bboxes']) == max_seq_length

  return encoding

In [16]:
encode_example(example=example)

{'input_ids': [101, 1060, 1060, 1060, 1060, 1060, 1060, 1060, 1060, 1060, 1060, 1060, 1060, 1060, 1060, 1060, 1060, 1060, 1060, 1060, 1060, 1060, 1060, 1015, 17235, 2072, 3409, 3126, 20222, 4293, 1010, 2199, 1015, 22861, 2243, 3841, 20142, 17235, 2072, 8732, 1010, 2199, 1015, 6501, 7377, 3489, 2732, 2860, 2497, 4261, 1010, 2199, 1015, 3256, 14380, 5572, 2484, 1010, 2199, 1015, 17235, 2072, 1037, 14852, 24903, 6790, 3963, 1010, 2199, 1017, 2489, 3256, 5572, 1014, 1015, 7554, 2665, 7842, 3515, 1010, 2199, 1015, 3256, 5572, 2324, 1010, 2199, 1015, 3256, 4589, 2756, 1010, 2199, 1015, 1037, 14852, 24086, 2099, 20222, 5594, 1010, 2199, 1016, 11937, 6979, 13638, 3070, 4029, 1010, 2199, 1016, 8915, 8737, 2063, 13638, 3070, 4029, 1010, 2199, 1015, 11937, 6979, 10093, 2953, 2004, 2378, 2871, 1010, 2199, 1012, 1015, 17235, 2072, 13638, 3070, 3520, 2497, 3963, 1010, 2199, 1017, 22861, 2243, 20657, 24930, 3520, 4029, 2575, 1010, 2199, 1015, 1037, 14852, 29086, 2140, 7632, 3900, 6227, 1010, 2199, 10

In [17]:
features = Features({
    'id': Sequence(Value(dtype='int64')),
    'words': Sequence(Value(dtype="int64")),
    'bboxes': Sequence(Sequence(Value(dtype='int64'))),
    'ner_tags': Sequence(Value(dtype='int64')),
    'image_path': Value(dtype='string')
})

dataset_encoded = dataset_hf.map(lambda example : encode_example(example),
                                 features = dataset_hf.features)

Map: 100%|██████████| 800/800 [00:02<00:00, 361.27 examples/s]


KeyError: 'input_ids'

In [None]:
features_list = []
for k in dataset_hf.features.keys():
    features_list.append(k)

In [None]:
dataset_hf.set_format(type='torch', columns=features_list)

In [None]:
dataloader = torch.utils.data.DataLoader(dataset_hf, batch_size=1, shuffle=True)
batch = next(iter(dataloader))
batch

In [None]:
model = LayoutLMForSequenceClassification.from_pretrained("microsoft/layoutlm-base-uncased", num_labels=23)
model.to(device)

In [None]:
optimizer = AdamW(model.parameters(), lr=5e-5)

global_step = 0
num_train_epochs = 30
t_total = len(dataloader) * num_train_epochs # total number of training steps 

#put the model in training mode
model.train()
for epoch in range(num_train_epochs):
  print("Epoch:", epoch)
  running_loss = 0.0
  correct = 0
  for batch in dataloader:
      id = batch["id"].to(device)
      words = batch["words"].to(device)
      bboxes = batch["bboxes"].to(device)
      ner_tags = batch["ner_tags"].to(device)

      # forward pass
      outputs = model(id=id, words=words, bboxes=bboxes,ner_tags=ner_tags)
      loss = outputs.loss

      running_loss += loss.item()
      predictions = outputs.logits.argmax(-1)
      correct += (predictions == dataset['train']['ner_tags']).float().sum()

      # backward pass to get the gradients 
      loss.backward()

      # update
      optimizer.step()
      optimizer.zero_grad()
      global_step += 1
  
  print("Loss:", running_loss / batch["id"].shape[0])
  accuracy = 100 * correct / len(dataset)
  print("Training accuracy:", accuracy.item())

## Importing model

In [6]:
n_labels = 8
model = LayoutLMForTokenClassification.from_pretrained(
    "microsoft/layoutlm-base-uncased",
    num_labels = n_labels
)
model.to(device)

Some weights of LayoutLMForTokenClassification were not initialized from the model checkpoint at microsoft/layoutlm-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LayoutLMForTokenClassification(
  (layoutlm): LayoutLMModel(
    (embeddings): LayoutLMEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (x_position_embeddings): Embedding(1024, 768)
      (y_position_embeddings): Embedding(1024, 768)
      (h_position_embeddings): Embedding(1024, 768)
      (w_position_embeddings): Embedding(1024, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): LayoutLMEncoder(
      (layer): ModuleList(
        (0-11): 12 x LayoutLMLayer(
          (attention): LayoutLMAttention(
            (self): LayoutLMSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
 

### Number of treinable / non-treinable parameters

In [7]:
print(f'''
    Model Info
    -----------------
    
    Treinable params: {sum(p.numel() for p in model.parameters() if p.requires_grad)}
    Non Treinable params: {sum(p.numel() for p in model.parameters() if not p.requires_grad)}

''')


    Model Info
    -----------------
    
    Treinable params: 112634120
    Non Treinable params: 0


