In [1]:
!pip install transformers
!pip install seqeval

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/3a/83/e74092e7f24a08d751aa59b37a9fc572b2e4af3918cb66f7766c3affb1b4/transformers-3.5.1-py3-none-any.whl (1.3MB)
[K     |████████████████████████████████| 1.3MB 10.4MB/s 
[?25hCollecting sentencepiece==0.1.91
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 43.1MB/s 
Collecting tokenizers==0.9.3
[?25l  Downloading https://files.pythonhosted.org/packages/4c/34/b39eb9994bc3c999270b69c9eea40ecc6f0e97991dba28282b9fd32d44ee/tokenizers-0.9.3-cp36-cp36m-manylinux1_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 51.4MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB

In [2]:
import numpy as np
import pickle
import random
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
from google.colab import drive
drive.mount('/content/drive/')

#### METRICS ####
from seqeval.metrics import (
    classification_report,
    f1_score,
    precision_score,
    recall_score)


##### UTILS ######
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, Dataset
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange
from torch import nn
from torch.nn import CrossEntropyLoss, MSELoss
from transformers import BertConfig, BertModel, BertPreTrainedModel, get_linear_schedule_with_warmup, AdamW, BertTokenizerFast
from torch.nn import LayerNorm as BertLayerNorm

Mounted at /content/drive/


Load the data

In [3]:
## Import Data
path = '/content/drive/My Drive/layoutlm/'
train = pickle.load(open(path + 'train.pkl', 'rb'))
val = pickle.load(open(path + 'val.pkl', 'rb'))
test = pickle.load(open(path + 'test.pkl', 'rb'))

This is simply a code to see if there are some negative bounding boxes (I find some, they were close to zero, then I set them to zero)

In [4]:
for index, elem in enumerate(train[2]):
  for i,e in enumerate(elem):
    if not all(np.array(e) >= 0):
      print(e)
      print(index, i)

[-4, 426, 213, 396]
669 0
[-3, 466, 112, 435]
669 3
[-1, 529, 223, 508]
669 6


In [5]:
train[2][669][0] = [0, 426, 213, 396]
train[2][669][3] = [0, 466, 112, 435]
train[2][669][6] = [0, 529, 223, 508]

In [6]:
for index, elem in enumerate(test[2]):
  for i,e in enumerate(elem):
    if not all(np.array(e) >= 0):
      print(e)
      print(index, i)

[-3, 891, 106, 862]
88 17


In [7]:
test[2][88][17] = [0, 891, 106, 862]

# Labels Analysis

In [8]:
all_labels = [item for sublist in train[1] for item in sublist] + [item for sublist in val[1] for item in sublist] + [item for sublist in test[1] for item in sublist]

In [9]:
from collections import Counter
Counter(all_labels)

Counter({'menu.cnt': 2429,
         'menu.discountprice': 403,
         'menu.etc': 19,
         'menu.itemsubtotal': 7,
         'menu.nm': 6599,
         'menu.num': 109,
         'menu.price': 2585,
         'menu.sub_cnt': 189,
         'menu.sub_etc': 10,
         'menu.sub_nm': 822,
         'menu.sub_price': 160,
         'menu.sub_unitprice': 14,
         'menu.unitprice': 750,
         'menu.vatyn': 9,
         'sub_total.discount_price': 191,
         'sub_total.etc': 283,
         'sub_total.othersvc_price': 6,
         'sub_total.service_price': 353,
         'sub_total.subtotal_price': 1483,
         'sub_total.tax_price': 1283,
         'total.cashprice': 1397,
         'total.changeprice': 1299,
         'total.creditcardprice': 411,
         'total.emoneyprice': 129,
         'total.menuqty_cnt': 630,
         'total.menutype_cnt': 130,
         'total.total_etc': 89,
         'total.total_price': 2120,
         'void_menu.nm': 3,
         'void_menu.price': 1})

As we can see there are some labels that contains few examples, I decided to replace them by the "neutral" label "O"

In [10]:
replacing_labels = {'menu.etc': 'O', 'mneu.itemsubtotal': 'O', 'menu.sub_etc': 'O', 'menu.sub_unitprice': 'O', 'menu.vatyn': 'O',
                  'void_menu.nm': 'O', 'void_menu.price': 'O', 'sub_total.othersvc_price': 'O'}

In [11]:
def replace_elem(elem):
  try:
    return replacing_labels[elem]
  except KeyError:
    return elem
def replace_list(ls):
  return [replace_elem(elem) for elem in ls]
train[1] = [replace_list(ls) for ls in train[1]]
val[1] = [replace_list(ls) for ls in val[1]]
test[1] = [replace_list(ls) for ls in test[1]]

In [12]:
all_labels = [item for sublist in train[1] for item in sublist] + [item for sublist in val[1] for item in sublist] + [item for sublist in test[1] for item in sublist]
Counter(all_labels)

Counter({'O': 62,
         'menu.cnt': 2429,
         'menu.discountprice': 403,
         'menu.itemsubtotal': 7,
         'menu.nm': 6599,
         'menu.num': 109,
         'menu.price': 2585,
         'menu.sub_cnt': 189,
         'menu.sub_nm': 822,
         'menu.sub_price': 160,
         'menu.unitprice': 750,
         'sub_total.discount_price': 191,
         'sub_total.etc': 283,
         'sub_total.service_price': 353,
         'sub_total.subtotal_price': 1483,
         'sub_total.tax_price': 1283,
         'total.cashprice': 1397,
         'total.changeprice': 1299,
         'total.creditcardprice': 411,
         'total.emoneyprice': 129,
         'total.menuqty_cnt': 630,
         'total.menutype_cnt': 130,
         'total.total_etc': 89,
         'total.total_price': 2120})

Now we have to save all the unique labels in a list. (mandatory for the tokenclassification)

In [13]:
labels = list(set(all_labels))

# Models

This is only the class config (same as in bert (BertConfig))

In [14]:
LAYOUTLM_PRETRAINED_MODEL_ARCHIVE_MAP = {}

LAYOUTLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {}


class LayoutlmConfig(BertConfig):
    pretrained_config_archive_map = LAYOUTLM_PRETRAINED_CONFIG_ARCHIVE_MAP
    model_type = "bert"

    def __init__(self, max_2d_position_embeddings=1024, **kwargs):
        super().__init__(**kwargs)
        self.max_2d_position_embeddings = max_2d_position_embeddings

This is the NEW embedding that takes into account the position embedding, the rest is the same as in bert, we sum the token embedding, positional embedding, all the position embedding and the token type embedding

In [15]:
class LayoutlmEmbeddings(nn.Module):
    def __init__(self, config):
        super(LayoutlmEmbeddings, self).__init__()
        self.word_embeddings = nn.Embedding(
            config.vocab_size, config.hidden_size, padding_idx=0
        )
        self.position_embeddings = nn.Embedding(
            config.max_position_embeddings, config.hidden_size
        )
        self.x_position_embeddings = nn.Embedding(
            config.max_2d_position_embeddings, config.hidden_size
        )
        self.y_position_embeddings = nn.Embedding(
            config.max_2d_position_embeddings, config.hidden_size
        )
        self.h_position_embeddings = nn.Embedding(
            config.max_2d_position_embeddings, config.hidden_size
        )
        self.w_position_embeddings = nn.Embedding(
            config.max_2d_position_embeddings, config.hidden_size
        )
        self.token_type_embeddings = nn.Embedding(
            config.type_vocab_size, config.hidden_size
        )

        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
        # any TensorFlow checkpoint file
        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(
        self,
        input_ids,
        bbox,
        token_type_ids=None,
        position_ids=None,
        inputs_embeds=None,
    ):
        seq_length = input_ids.size(1)
        if position_ids is None:
            position_ids = torch.arange(
                seq_length, dtype=torch.long, device=input_ids.device
            )
            position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
        if token_type_ids is None:
            token_type_ids = torch.zeros_like(input_ids)

        words_embeddings = self.word_embeddings(input_ids)
        position_embeddings = self.position_embeddings(position_ids)
        left_position_embeddings = self.x_position_embeddings(bbox[:, :, 0])
        upper_position_embeddings = self.y_position_embeddings(bbox[:, :, 3])
        right_position_embeddings = self.x_position_embeddings(bbox[:, :, 2])
        lower_position_embeddings = self.y_position_embeddings(bbox[:, :, 1])
        h_position_embeddings = self.h_position_embeddings(
            bbox[:, :, 1] - bbox[:, :, 3]
        )
        w_position_embeddings = self.w_position_embeddings(
            bbox[:, :, 2] - bbox[:, :, 0]
        )
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = (
            words_embeddings
            + position_embeddings
            + left_position_embeddings
            + upper_position_embeddings
            + right_position_embeddings
            + lower_position_embeddings
            + h_position_embeddings
            + w_position_embeddings
            + token_type_embeddings
        )
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings

This is the LayoutlmModel (equivalent to BertModel except that this one has an extra input called bbox.

In [16]:
class LayoutlmModel(BertModel):

    config_class = LayoutlmConfig
    pretrained_model_archive_map = LAYOUTLM_PRETRAINED_MODEL_ARCHIVE_MAP
    base_model_prefix = "bert"

    def __init__(self, config):
        super(LayoutlmModel, self).__init__(config)
        self.embeddings = LayoutlmEmbeddings(config)
        self.init_weights()

    def forward(
        self,
        input_ids,
        bbox,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
    ):
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)
        if token_type_ids is None:
            token_type_ids = torch.zeros_like(input_ids)

        # We create a 3D attention mask from a 2D tensor mask.
        # Sizes are [batch_size, 1, 1, to_seq_length]
        # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
        # this attention mask is more simple than the triangular masking of causal attention
        # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
        extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)

        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
        # masked positions, this operation will create a tensor which is 0.0 for
        # positions we want to attend and -10000.0 for masked positions.
        # Since we are adding it to the raw scores before the softmax, this is
        # effectively the same as removing these entirely.
        extended_attention_mask = extended_attention_mask.to(
            dtype=next(self.parameters()).dtype
        )  # fp16 compatibility
        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0

        # Prepare head mask if needed
        # 1.0 in head_mask indicate we keep the head
        # attention_probs has shape bsz x n_heads x N x N
        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
        if head_mask is not None:
            if head_mask.dim() == 1:
                head_mask = (
                    head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
                )
                head_mask = head_mask.expand(
                    self.config.num_hidden_layers, -1, -1, -1, -1
                )
            elif head_mask.dim() == 2:
                head_mask = (
                    head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)
                )  # We can specify head_mask for each layer
            head_mask = head_mask.to(
                dtype=next(self.parameters()).dtype
            )  # switch to fload if need + fp16 compatibility
        else:
            head_mask = [None] * self.config.num_hidden_layers

        embedding_output = self.embeddings(
            input_ids, bbox, position_ids=position_ids, token_type_ids=token_type_ids
        )
        encoder_outputs = self.encoder(
            embedding_output, extended_attention_mask, head_mask=head_mask
        )
        sequence_output = encoder_outputs[0]
        pooled_output = self.pooler(sequence_output)

        outputs = (sequence_output, pooled_output) + encoder_outputs[
            1:
        ]  # add hidden_states and attentions if they are here
        return outputs  # sequence_output, pooled_output, (hidden_states), (attentions)

This is the equivalent of BertForTokenClassification except that the base model is the LayoutlmModel.

In [17]:
class LayoutlmForTokenClassification(BertPreTrainedModel):
    config_class = LayoutlmConfig
    pretrained_model_archive_map = LAYOUTLM_PRETRAINED_MODEL_ARCHIVE_MAP
    base_model_prefix = "bert"

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.bert = LayoutlmModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        self.init_weights()

    def forward(
        self,
        input_ids,
        bbox,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
    ):

        outputs = self.bert(
            input_ids=input_ids,
            bbox=bbox,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
        )

        sequence_output = outputs[0]

        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        outputs = (logits,) + outputs[
            2:
        ]  # add hidden states and attention if they are here
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            # Only keep active parts of the loss
            if attention_mask is not None:
                active_loss = attention_mask.view(-1) == 1
                active_logits = logits.view(-1, self.num_labels)[active_loss]
                active_labels = labels.view(-1)[active_loss]
                loss = loss_fct(active_logits, active_labels)
            else:
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            outputs = (loss,) + outputs

        return outputs  # (loss), scores, (hidden_states), (attentions)

# Some aux functions

In [18]:
def set_seed(seed): ## for reproductibility
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

This is for making the structure of our data match the one needed by the model (mandatory)

In [19]:
class CordDataset(Dataset):
    def __init__(self, examples, tokenizer, labels, pad_token_label_id):
        features = convert_examples_to_features(
            examples,
            labels,
            max_seq_length,
            tokenizer,
            cls_token_at_end=False,
            # xlnet has a cls token at the end
            cls_token=tokenizer.cls_token,
            cls_token_segment_id=0,
            sep_token=tokenizer.sep_token,
            sep_token_extra=False,
            # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
            pad_on_left=False,
            # pad on the left for xlnet
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=0,
            pad_token_label_id=pad_token_label_id,
        )

        self.features = features
        # Convert to Tensors and build dataset
        self.all_input_ids = torch.tensor(
            [f.input_ids for f in features], dtype=torch.long
        )
        self.all_input_mask = torch.tensor(
            [f.input_mask for f in features], dtype=torch.long
        )
        self.all_segment_ids = torch.tensor(
            [f.segment_ids for f in features], dtype=torch.long
        )
        self.all_label_ids = torch.tensor(
            [f.label_ids for f in features], dtype=torch.long
        )
        self.all_bboxes = torch.tensor([f.boxes for f in features], dtype=torch.long)

    def __len__(self):
        return len(self.features)

    def __getitem__(self, index):
        return (
            self.all_input_ids[index],
            self.all_input_mask[index],
            self.all_segment_ids[index],
            self.all_label_ids[index],
            self.all_bboxes[index],
        )

class InputFeatures(object):
    """A single set of features of data."""

    def __init__(
        self,
        input_ids,
        input_mask,
        segment_ids,
        label_ids,
        boxes
    ):
        assert (
            0 <= all(boxes) <= 1000
        ), "Error with input bbox ({}): the coordinate value is not between 0 and 1000".format(
            boxes
        )
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_ids = label_ids
        self.boxes = boxes

def convert_examples_to_features(
    examples,
    label_list,
    max_seq_length,
    tokenizer,
    cls_token_at_end=False,
    cls_token="[CLS]",
    cls_token_segment_id=1,
    sep_token="[SEP]",
    sep_token_extra=False,
    pad_on_left=False,
    pad_token=0,
    cls_token_box=[0, 0, 0, 0],
    sep_token_box=[1000, 1000, 1000, 1000],
    pad_token_box=[0, 0, 0, 0],
    pad_token_segment_id=0,
    pad_token_label_id=-1,
    sequence_a_segment_id=0,
    mask_padding_with_zero=True,
):
    """ Loads a data file into a list of `InputBatch`s
        `cls_token_at_end` define the location of the CLS token:
            - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
            - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
        `cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet)
    """

    label_map = {label: i for i, label in enumerate(label_list)}

    features = []
    for i in range(len(examples[0])):
        width, height = 1000, 1000
        words = examples[0]
        labels = examples[1]
        boxes = examples[2]

        tokens = []
        token_boxes = []
        label_ids = []
        for word, label, box in zip(
            words[i], labels[i], boxes[i]
        ):
            if len(word) < 1: # SKIP EMPTY WORD
              continue
            word_tokens = tokenizer.tokenize(word)
            tokens.extend(word_tokens)
            token_boxes.extend([box] * len(word_tokens))
            # Use the real label id for the first token of the word, and padding ids for the remaining tokens
            label_ids.extend(
                [label_map[label]] + [pad_token_label_id] * (len(word_tokens) - 1))

        # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa.
        special_tokens_count = 3 if sep_token_extra else 2
        if len(tokens) > max_seq_length - special_tokens_count:
            tokens = tokens[: (max_seq_length - special_tokens_count)]
            token_boxes = token_boxes[: (max_seq_length - special_tokens_count)]
            label_ids = label_ids[: (max_seq_length - special_tokens_count)]

        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids:   0   0   0   0  0     0   0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambiguously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
        tokens += [sep_token]
        token_boxes += [sep_token_box]
        label_ids += [pad_token_label_id]
        if sep_token_extra:
            # roberta uses an extra separator b/w pairs of sentences
            tokens += [sep_token]
            token_boxes += [sep_token_box]
            label_ids += [pad_token_label_id]
        segment_ids = [sequence_a_segment_id] * len(tokens)

        if cls_token_at_end:
            tokens += [cls_token]
            token_boxes += [cls_token_box]
            label_ids += [pad_token_label_id]
            segment_ids += [cls_token_segment_id]
        else:
            tokens = [cls_token] + tokens
            token_boxes = [cls_token_box] + token_boxes
            label_ids = [pad_token_label_id] + label_ids
            segment_ids = [cls_token_segment_id] + segment_ids

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding_length = max_seq_length - len(input_ids)
        if pad_on_left:
            input_ids = ([pad_token] * padding_length) + input_ids
            input_mask = (
                [0 if mask_padding_with_zero else 1] * padding_length
            ) + input_mask
            segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids
            label_ids = ([pad_token_label_id] * padding_length) + label_ids
            token_boxes = ([pad_token_box] * padding_length) + token_boxes
        else:
            input_ids += [pad_token] * padding_length
            input_mask += [0 if mask_padding_with_zero else 1] * padding_length
            segment_ids += [pad_token_segment_id] * padding_length
            label_ids += [pad_token_label_id] * padding_length
            token_boxes += [pad_token_box] * padding_length

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length
        assert len(label_ids) == max_seq_length
        assert len(token_boxes) == max_seq_length

        features.append(
            InputFeatures(
                input_ids=input_ids,
                input_mask=input_mask,
                segment_ids=segment_ids,
                label_ids=label_ids,
                boxes=token_boxes,
            )
        )
    return features


This function is for the evaluation (loss, f1, precision , recall)

In [20]:
def results(preds, out_label_ids, labels, loss_):
  preds = np.argmax(preds, axis=2)

  label_map = {i: label for i, label in enumerate(labels)}

  out_label_list = [[] for _ in range(out_label_ids.shape[0])]
  preds_list = [[] for _ in range(out_label_ids.shape[0])]

  for i in range(out_label_ids.shape[0]):
      for j in range(out_label_ids.shape[1]):
          if out_label_ids[i, j] != pad_token_label_id:
              out_label_list[i].append(label_map[out_label_ids[i][j]])
              preds_list[i].append(label_map[preds[i][j]])

  results = {
      "loss": loss_,
      "precision": precision_score(out_label_list, preds_list),
      "recall": recall_score(out_label_list, preds_list),
      "f1": f1_score(out_label_list, preds_list),
  }
  return results

# Load models

In [21]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
model_path = '/content/drive/My Drive/layoutlm/'
num_labels = len(labels)
config_class, model_class, tokenizer_class = LayoutlmConfig, LayoutlmForTokenClassification, BertTokenizerFast
config = config_class.from_pretrained(model_path, num_labels=num_labels)
tokenizer = tokenizer_class.from_pretrained(model_path, do_lower_case=True)
model = model_class.from_pretrained(model_path, from_tf=bool(".ckpt" in model_path), config=config)
model.to(device)

# Generate our training / validation set

In [23]:
sum_ = []
for x in train[0]:
  sum_.append(len(x))
print(max(sum_))
print(min(sum_))
print(sum(sum_)/len(sum_))

135
5
24.21375


In [23]:
max_seq_length = 150
pad_token_label_id = CrossEntropyLoss().ignore_index
train_dataset = CordDataset(train, tokenizer, labels, pad_token_label_id)
validation_dataset = CordDataset(val, tokenizer, labels, pad_token_label_id)
model_type = 'layoutlm'

In [25]:
train_batch_size = 8
learning_rate = 1e-4
adam_epsilon = 1e-8
weight_decay = 0.0
num_train_epochs = 4 ## To fine-tune (adding drop out so that It can lead to overfit less)
max_steps = 0
gradient_accumulation_steps = 1
max_grad_norm = 1.0
warmup_steps = 0
seed = 42

train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(
        train_dataset,
        sampler=train_sampler,
        batch_size=train_batch_size,
        collate_fn=None,
    )
valid_sampler = RandomSampler(validation_dataset)
valid_dataloader = DataLoader(
        validation_dataset,
        sampler=valid_sampler,
        batch_size=train_batch_size,
        collate_fn=None,
    )
if max_steps > 0:
    t_total = max_steps
    num_train_epochs = (
        args.max_steps
        // (len(train_dataloader) // gradient_accumulation_steps)
        + 1
    )
else:
    t_total = (
        len(train_dataloader)
        // gradient_accumulation_steps
        * num_train_epochs
    )
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
        {
            "params": [
                p
                for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": weight_decay,
        },
        {
            "params": [
                p
                for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]
optimizer = AdamW(
        optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon
    )
scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total
    )

# TRAIN

In [26]:
global_step = 0
model.zero_grad()
train_iterator = trange(int(num_train_epochs), desc="Epoch")
set_seed(seed)
for _ in train_iterator:
  #epoch_iterator = tqdm(
      #train_dataloader, desc="Iteration")
  tr_loss = 0.0
  nb_train_steps = 0
  preds_train = None
  out_label_ids = None
  for step, batch in enumerate(train_dataloader):
      model.train()
      inputs = {
          "input_ids": batch[0].to(device),
          "attention_mask": batch[1].to(device),
          "labels": batch[3].to(device),
      }
      if model_type in ["layoutlm"]:
          inputs["bbox"] = batch[4].to(device)
      inputs["token_type_ids"] = (
          batch[2].to(device) if model_type in ["bert", "layoutlm"] else None)
      outputs = model(**inputs)
      # model outputs are always tuple in pytorch-transformers (see doc)
      loss, logits = outputs
      loss.backward()

      tr_loss += loss.item()
      #if (step+1) % 25 == 0:
        #print(f"Train Epoch : {step+1}/{len(train_dataloader)}")

      if (step + 1) % gradient_accumulation_steps == 0:
          torch.nn.utils.clip_grad_norm_(
                  model.parameters(), max_grad_norm
              )
          optimizer.step()
          scheduler.step()  # Update learning rate schedule
          model.zero_grad()
          global_step += 1
      nb_train_steps += 1
      if preds_train is None:
          preds_train = logits.detach().cpu().numpy()
          out_label_ids = inputs["labels"].detach().cpu().numpy()
      else:
          preds_train = np.append(preds_train, logits.detach().cpu().numpy(), axis=0)
          out_label_ids = np.append(
              out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0)
  res = results(preds_train, out_label_ids, labels, tr_loss/len(train_dataloader))
  print('Train Results', res)

  ###### EVALUATION #######

  #epoch_iterator = tqdm(valid_dataloader, desc="Iteration")
  eval_loss = 0.0
  nb_eval_steps = 0
  preds_val = None
  out_label_ids = None
  model.eval()
  for step, batch in enumerate(valid_dataloader):
    with torch.no_grad():
      inputs = {
          "input_ids": batch[0].to(device),
          "attention_mask": batch[1].to(device),
          "labels": batch[3].to(device),
      }
      if model_type in ["layoutlm"]:
          inputs["bbox"] = batch[4].to(device)
      inputs["token_type_ids"] = (
          batch[2].to(device) if model_type in ["bert", "layoutlm"] else None)
      # model outputs are always tuple in pytorch-transformers (see doc)
      outputs = model(**inputs)
      tmp_eval_loss, logits = outputs[:2]
      eval_loss += tmp_eval_loss.item()
    nb_eval_steps += 1
    if preds_val is None:
      preds_val = logits.detach().cpu().numpy()
      out_label_ids = inputs["labels"].detach().cpu().numpy()
    else:
      preds_val = np.append(preds_val, logits.detach().cpu().numpy(), axis=0)
      out_label_ids = np.append(
          out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0
      )
  eval_loss = eval_loss / nb_eval_steps
  res = results(preds_val, out_label_ids, labels, eval_loss)
  print('Validation results',res)

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Train Results {'loss': 0.7849363887310028, 'precision': 0.7121579183490354, 'recall': 0.7402536840141765, 'f1': 0.7259340558832944}


Epoch:  25%|██▌       | 1/4 [01:18<03:55, 78.51s/it]

Validation results {'loss': 0.33114434549441707, 'precision': 0.8516179952644041, 'recall': 0.8736842105263158, 'f1': 0.8625099920063949}
Train Results {'loss': 0.2368683361634612, 'precision': 0.8971851581063791, 'recall': 0.9155941055773177, 'f1': 0.9062961595273266}


Epoch:  50%|█████     | 2/4 [02:40<02:38, 79.45s/it]

Validation results {'loss': 0.1977280696065953, 'precision': 0.9284552845528455, 'recall': 0.9246963562753037, 'f1': 0.9265720081135903}
Train Results {'loss': 0.1343210445996374, 'precision': 0.9408180223432739, 'recall': 0.9503823913448983, 'f1': 0.9455760218995035}


Epoch:  75%|███████▌  | 3/4 [04:03<01:20, 80.63s/it]

Validation results {'loss': 0.1667864156815295, 'precision': 0.9376012965964343, 'recall': 0.9368421052631579, 'f1': 0.9372215471850952}
Train Results {'loss': 0.06489855873398483, 'precision': 0.9724003345413995, 'recall': 0.9759373251259094, 'f1': 0.9741656193269097}


Epoch: 100%|██████████| 4/4 [05:27<00:00, 82.00s/it]

Validation results {'loss': 0.17203735510030618, 'precision': 0.9577579203899269, 'recall': 0.9546558704453442, 'f1': 0.9562043795620438}





In [27]:
torch.save(model.state_dict(), '/content/drive/My Drive/layoutlm/customllm.pt')

# TEST

In [27]:
for index, e in enumerate(test[2]):
  for index_,l in enumerate(e):
    if l[1] < l[3]:
      print(index, index_)
      print(l)

54 15
[105, 726, 498, 738]


In [28]:
test[2][54][15] = [105, 738, 498, 726]

In [29]:
test_dataset = CordDataset(test, tokenizer, labels, pad_token_label_id)

In [30]:
def results_test(preds, out_label_ids, labels):
  preds = np.argmax(preds, axis=2)

  label_map = {i: label for i, label in enumerate(labels)}

  out_label_list = [[] for _ in range(out_label_ids.shape[0])]
  preds_list = [[] for _ in range(out_label_ids.shape[0])]

  for i in range(out_label_ids.shape[0]):
      for j in range(out_label_ids.shape[1]):
          if out_label_ids[i, j] != pad_token_label_id:
              out_label_list[i].append(label_map[out_label_ids[i][j]])
              preds_list[i].append(label_map[preds[i][j]])

  results = {
      "precision": precision_score(out_label_list, preds_list),
      "recall": recall_score(out_label_list, preds_list),
      "f1": f1_score(out_label_list, preds_list),
  }
  return results, classification_report(out_label_list, preds_list)

In [31]:
val_r, class_r = results_test(preds_val, out_label_ids, labels)

In [34]:
print(class_r)

                         precision    recall  f1-score   support

                enu.cnt       0.99      0.98      0.98       213
      enu.discountprice       0.75      0.75      0.75         4
                 enu.nm       0.98      0.98      0.98       221
                enu.num       0.80      1.00      0.89         4
              enu.price       0.98      0.98      0.98       222
            enu.sub_cnt       0.94      0.97      0.95        32
             enu.sub_nm       0.96      0.96      0.96        45
          enu.sub_price       0.86      0.80      0.83        15
          enu.unitprice       0.88      0.96      0.92        52
         otal.cashprice       0.95      0.94      0.95        66
       otal.changeprice       0.97      0.95      0.96        66
   otal.creditcardprice       0.88      0.93      0.90        15
       otal.emoneyprice       0.80      1.00      0.89         4
       otal.menuqty_cnt       0.85      0.88      0.87        26
      otal.menutype_cnt 

In [35]:
test_sampler = RandomSampler(test_dataset)
test_dataloader = DataLoader(
        test_dataset,
        sampler=test_sampler,
        batch_size=2,
        collate_fn=None,
    )
nb_eval_steps = 0
preds_test = None
out_label_ids = None
model.eval()
for step, batch in enumerate(train_dataloader):
  with torch.no_grad():
    inputs = {
        "input_ids": batch[0].to(device),
        "attention_mask": batch[1].to(device),
        "labels": batch[3].to(device),
    }
    if model_type in ["layoutlm"]:
        inputs["bbox"] = batch[4].to(device)
    inputs["token_type_ids"] = (
        batch[2].to(device) if model_type in ["bert", "layoutlm"] else None)
    # model outputs are always tuple in pytorch-transformers (see doc)
    outputs = model(**inputs)
    _, logits = outputs[:2]
  if preds_test is None:
    preds_test = logits.detach().cpu().numpy()
    out_label_ids = inputs["labels"].detach().cpu().numpy()
  else:
    preds_test = np.append(preds_test, logits.detach().cpu().numpy(), axis=0)
    out_label_ids = np.append(
        out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0
    )

res, report = results_test(preds_test, out_label_ids, labels)

In [36]:
res

{'f1': 0.984280983254816,
 'precision': 0.9845105906503686,
 'recall': 0.9840514829322887}

In [38]:
print(report)

                         precision    recall  f1-score   support

                enu.cnt       0.99      0.99      0.99      1887
      enu.discountprice       0.97      0.95      0.96        82
       enu.itemsubtotal       0.00      0.00      0.00         1
                 enu.nm       0.99      1.00      0.99      2060
                enu.num       0.98      0.99      0.98        90
              enu.price       1.00      0.99      0.99      2058
            enu.sub_cnt       0.98      0.95      0.96       140
             enu.sub_nm       0.96      0.96      0.96       273
          enu.sub_price       0.95      0.88      0.91       121
          enu.unitprice       0.97      0.99      0.98       612
         otal.cashprice       0.99      0.99      0.99       527
       otal.changeprice       0.99      0.98      0.99       505
   otal.creditcardprice       0.98      0.98      0.98       119
       otal.emoneyprice       0.92      0.92      0.92        50
       otal.menuqty_cnt 