In [1]:
%load_ext autoreload
%autoreload 2

In [10]:
import transformers
from transformers import LayoutLMForTokenClassification,\
    LayoutLMTokenizer, AdamW, LayoutLMv2Processor, LayoutLMv2ForTokenClassification
# from tensordict import TensorDict
from datasets import load_dataset
import torch
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from PIL import ImageDraw, Image
import numpy as np
from tqdm import tqdm
from torch.utils.data import DataLoader, Dataset
import logging
from src.preprocessing.make_dataset import ImageLayoutDataset
# from torchvision.transforms import PILToTensor

import os 
if 'notebooks' in os.getcwd():
    os.chdir("..")

In [3]:
logging.basicConfig(filename='logs/train.log', encoding='utf-8', level=logging.DEBUG)

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
device

device(type='cuda')

In [6]:
dataset = load_dataset("katanaml/cord", )

# dataset = load_dataset("darentang/sroie")

Repo card metadata block was not found. Setting CardData to empty.


## Creating PyTorch Datasets, DataLoader

In [20]:
processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased", revision="no_ocr")


In [21]:
data = []
for example in tqdm(dataset['train']):
    words = example['words']
    boxes = example['bboxes']
    image = Image.open(example['image_path']).convert("RGB")
    word_labels = example['ner_tags']

    # try:
    encoded_inputs = processor(
        image, 
        words, 
        boxes=boxes, 
        word_labels=word_labels, 
        padding="max_length", 
        truncation=True, 
        return_tensors="pt"
    )

    
    data.append(encoded_inputs)
    # except:
    #     pass

    # assert encoded_inputs.input_ids.shape == torch.Size([512])
    # assert encoded_inputs.attention_mask.shape == torch.Size([512])
    # assert encoded_inputs.token_type_ids.shape == torch.Size([512])
    # assert encoded_inputs.bbox.shape == torch.Size([512, 4])
    # assert encoded_inputs.image.shape == torch.Size([3, 224, 224])
    # assert encoded_inputs.labels.shape == torch.Size([512]) 

  0%|          | 0/800 [00:00<?, ?it/s]

100%|██████████| 800/800 [00:38<00:00, 20.53it/s]


In [26]:
df = ImageLayoutDataset(data, encode=False)

In [27]:
dataloader = DataLoader(
    df,
    shuffle=True,
    batch_size= 2
)

In [28]:
unique_rows = []
for row in dataset['train']['ner_tags']:
    unique_rows.append(np.unique(row))

In [29]:
n_labels = np.unique(np.concatenate(unique_rows)).shape[0]

In [30]:
n_labels

23

## Importing model

In [32]:

model = LayoutLMForTokenClassification.from_pretrained(
    'microsoft/layoutlm-base-uncased',
    num_labels=n_labels
)
model.to(device)

Downloading (…)lve/main/config.json: 100%|██████████| 606/606 [00:00<00:00, 111kB/s]
Downloading pytorch_model.bin: 100%|██████████| 453M/453M [00:03<00:00, 115MB/s]  
Some weights of LayoutLMForTokenClassification were not initialized from the model checkpoint at microsoft/layoutlm-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LayoutLMForTokenClassification(
  (layoutlm): LayoutLMModel(
    (embeddings): LayoutLMEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (x_position_embeddings): Embedding(1024, 768)
      (y_position_embeddings): Embedding(1024, 768)
      (h_position_embeddings): Embedding(1024, 768)
      (w_position_embeddings): Embedding(1024, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): LayoutLMEncoder(
      (layer): ModuleList(
        (0-11): 12 x LayoutLMLayer(
          (attention): LayoutLMAttention(
            (self): LayoutLMSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
 

### Number of treinable / non-treinable parameters

In [39]:
print(f'''
    Model Info
    -----------------
    
    Treinable params: {sum(p.numel() for p in model.parameters() if p.requires_grad)}
    Non Treinable params: {sum(p.numel() for p in model.parameters() if not p.requires_grad)}

''')


    Model Info
    -----------------
    
    Treinable params: 112645655
    Non Treinable params: 0




In [40]:
optimizer = AdamW(model.parameters(), lr=5e-5)

global_step = 0
num_train_epochs = 4
t_total = len(dataloader) * num_train_epochs # total number of training steps 

#put the model in training mode
model.train()
for epoch in tqdm(range(num_train_epochs)):
  logging.info(f"Epoch: {epoch}")
  running_loss = 0.0
  correct = 0
  for X in (dataloader):
      input_ids = X["input_ids"]\
         .to(device)\
         .squeeze()
      
      bbox = X["bbox"]\
         .to(device)\
         .squeeze()

      attention_mask = X["attention_mask"]\
         .to(device)\
         .squeeze()
      token_type_ids = X["token_type_ids"]\
         .to(device)\
         .squeeze()
      
      labels = X["labels"].to(device)
      image = X["image"]\
         .to(device)\
         .squeeze()

      # forward pass
      outputs = model(
         input_ids=input_ids, 
         bbox=bbox, 
         attention_mask=attention_mask, 
         token_type_ids=token_type_ids,
         labels=labels
      )

      
      loss = outputs.loss

      running_loss += loss.item()
      predictions = outputs.logits.argmax(-1)
      correct += (predictions == labels).float().sum()

      # backward pass to get the gradients 
      loss.backward()

      # update
      optimizer.step()
      optimizer.zero_grad()
      global_step += 1
  
  logging.info(f"Loss: {running_loss / input_ids.shape[0]}")
  accuracy = 100 * correct / len(data)
  logging.info(f"Training accuracy: {accuracy.item()}", )

  0%|          | 0/4 [00:00<?, ?it/s]


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [42]:
model(input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, token_type_ids=token_type_ids,
                      labels=labels)

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [42]:
labels[1]

tensor([[-100,   22, -100,   22, -100, -100,   15,   15, -100,   15, -100, -100,
           16,   16,   16, -100, -100,   13, -100,   13, -100, -100,    3, -100,
            3,    1,    9, -100, -100,    5, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -

In [36]:
outputs.logits.argmax(-1)

tensor([[ 3,  5,  9,  ...,  3,  3,  7],
        [14, 22, 22,  ..., 14, 14,  3]], device='cuda:0')

In [25]:
outputs = model(
        input_ids=input_ids.squeeze(), 
        bbox=bbox, 
        attention_mask=attention_mask.squeeze(), 
        token_type_ids=token_type_ids.squeeze(),
        # image= image,
        labels=labels
      )

In [77]:
input_ids.shape

torch.Size([2, 1, 512])

In [78]:
bbox.shape

torch.Size([2, 512, 4])

In [79]:
attention_mask.shape

torch.Size([2, 1, 512])

In [80]:
token_type_ids.shape

torch.Size([2, 1, 512])

In [81]:

image.shape

torch.Size([2, 3, 224, 224])

In [86]:
torch.unique(labels)


tensor([-100,    1,    3,    5,    9,   10,   13,   14,   15,   16,   22])