# Introduction

In this notebook, adapted from UniLM's run_funsd script (itself adapted from HuggingFace's run_ner script), we fine-tune Microsoft Research's LayoutLMv2 model on the CORD dataset, a collection of receipts with OCR annotations. The dataset defines 30 fields under different categories ("menu", "subtotal", "total"), and the task is to assign each word to the correct field. The evaluation metric is entity-level F1. The notebook requires [unilm/layoutlmft](https://github.com/microsoft/unilm/tree/master/layoutlmft) to be installed.

In [1]:
import numpy as np

from datasets import load_dataset, load_metric

from layoutlmft.trainers import FunsdTrainer as Trainer

In [2]:
# Disable CUDA for this kernel: CUDA_VISIBLE_DEVICES is set to an empty value.
# NOTE(review): presumably this must run before CUDA is first initialised - confirm.
# Keep comments on their own lines: text after the `=` on a %env line becomes part of the value.
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=


# Load dataset

In [3]:
# Load the CORD dataset through the "cord.py" loading script, pointing it at ../data/CORD.
# The result is indexed by split name below ("train", "validation", "test").
datasets = load_dataset("cord.py", data_dir="../data/CORD")

Using the latest cached version of the module from /home/pierre/.cache/huggingface/modules/datasets_modules/datasets/cord/296342a377c855a0f7881c27263f451c13000db75a41155c06ffebbe96120194 (last modified on Mon Aug  2 01:26:43 2021) since it couldn't be found locally at cord.py/cord.py or remotely (FileNotFoundError).
Using custom data configuration default-data_dir=..%2Fdata%2FCORD
Reusing dataset cord (/home/pierre/.cache/huggingface/datasets/cord/default-data_dir=..%2Fdata%2FCORD/0.0.0/296342a377c855a0f7881c27263f451c13000db75a41155c06ffebbe96120194)


In [4]:
# Display the training split to check its columns and row count.
datasets["train"]

Dataset({
    features: ['id', 'tokens', 'bboxes', 'ner_tags', 'image'],
    num_rows: 800
})

In [5]:
# Columns read by the preprocessing step.
text_column_name = "tokens"
label_column_name = "ner_tags"

# Schema information is taken from the training split.
features = datasets["train"].features
column_names = datasets["train"].column_names
# All original columns are passed as `remove_columns` when the splits are mapped.
remove_columns = column_names

# Label names come from the tag feature; the tag values are used as label ids directly,
# so the label -> id mapping is the identity.
label_list = features[label_column_name].feature.names
num_labels = len(label_list)
label_to_id = {label_id: label_id for label_id in range(num_labels)}


# Load model and tokenizer

In [6]:
from transformers import (
    AutoConfig,
    AutoModelForTokenClassification,
    AutoTokenizer,
    TrainingArguments,
)

In [7]:
# Checkpoint to load. The commented line is the hub id of the base model; the active line
# points at a checkpoint saved locally under ../output.
# model_name = "microsoft/layoutlmv2-base-uncased"
model_name = "../output/v2_local_cpu/"

# Configuration sized for the number of labels found in the dataset.
config = AutoConfig.from_pretrained(model_name, num_labels=num_labels, finetuning_task="ner")

# A fast tokenizer is requested; the preprocessing step calls `word_ids()` on its output.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

# Token-classification model built from the configuration above.
model = AutoModelForTokenClassification.from_pretrained(model_name, config=config)



# Preprocess dataset

In [8]:
# Padding strategy, passed to both the tokenizer call and the data collator.
padding = "max_length"  # alternative left by the author: False
def tokenize_and_align_labels(examples):
    """Tokenize a batch of word-level examples and align labels, boxes and images.

    The words in ``examples[text_column_name]`` are tokenized with truncation and
    ``return_overflowing_tokens=True``. Every row of the tokenizer output is mapped back to
    the example it came from through ``overflow_to_sample_mapping``, and that example's
    word labels, word boxes and image are attached to the row.

    Per token of each output row:
      * word id ``None`` (special tokens): label ``-100``, box ``[0, 0, 0, 0]``;
      * first token of a word: the word's label (through ``label_to_id``) and the word's box;
      * any further token of the same word: label ``-100`` and the word's box.

    ``-100`` is the value the loss function ignores.

    Args:
        examples: Batch of columns containing ``text_column_name``, ``label_column_name``,
            ``"bboxes"`` and ``"image"``.

    Returns:
        The tokenizer output with ``labels``, ``bbox`` and ``image`` entries added.
    """
    tokenized_inputs = tokenizer(
        examples[text_column_name],
        padding=padding,
        truncation=True,
        return_overflowing_tokens=True,
        # The texts are lists of words (one label per word), not raw strings.
        is_split_into_words=True,
    )

    all_labels, all_bboxes, all_images = [], [], []
    for row in range(len(tokenized_inputs["input_ids"])):
        # Index of the original example this tokenized row was produced from.
        source = tokenized_inputs["overflow_to_sample_mapping"][row]
        word_labels = examples[label_column_name][source]
        word_boxes = examples["bboxes"][source]

        row_labels, row_boxes = [], []
        last_word = None
        for word in tokenized_inputs.word_ids(batch_index=row):
            if word is None:
                row_labels.append(-100)
                row_boxes.append([0, 0, 0, 0])
            else:
                # Only the first token of a word carries the label; every token keeps the box.
                starts_word = word != last_word
                row_labels.append(label_to_id[word_labels[word]] if starts_word else -100)
                row_boxes.append(word_boxes[word])
            last_word = word

        all_labels.append(row_labels)
        all_bboxes.append(row_boxes)
        all_images.append(examples["image"][source])

    tokenized_inputs["labels"] = all_labels
    tokenized_inputs["bbox"] = all_bboxes
    tokenized_inputs["image"] = all_images
    return tokenized_inputs

def _tokenize_split(split_name):
    """Return ``datasets[split_name]`` mapped through ``tokenize_and_align_labels``.

    All splits share the same settings: batched mapping, removal of the original columns
    (``remove_columns``), 4 worker processes and reuse of cached results when available.

    Args:
        split_name: Key of the split in ``datasets`` (e.g. ``"train"``).

    Returns:
        The mapped split.
    """
    return datasets[split_name].map(
        tokenize_and_align_labels,
        batched=True,
        remove_columns=remove_columns,
        num_proc=4,
        load_from_cache_file=True,
    )


# One call per split, so the three splits cannot drift apart in their preprocessing options.
train_dataset = _tokenize_split("train")
eval_dataset = _tokenize_split("validation")
test_dataset = _tokenize_split("test")

#0:   0%|          | 0/1 [00:00<?, ?ba/s]
[A

#0: 100%|██████████| 1/1 [00:03<00:00,  3.01s/ba]
#1: 100%|██████████| 1/1 [00:03<00:00,  3.10s/ba]

#2: 100%|██████████| 1/1 [00:02<00:00,  2.94s/ba]


#3: 100%|██████████| 1/1 [00:02<00:00,  2.73s/ba]
#0:   0%|          | 0/1 [00:00<?, ?ba/s]
[A

#0: 100%|██████████| 1/1 [00:00<00:00,  2.26ba/s]

#2: 100%|██████████| 1/1 [00:00<00:00,  1.84ba/s]


#3: 100%|██████████| 1/1 [00:00<00:00,  1.86ba/s]
#1: 100%|██████████| 1/1 [00:00<00:00,  1.54ba/s][A

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

[A[A
#2: 100%|██████████| 1/1 [00:00<00:00,  2.07ba/s]
#0: 100%|██████████| 1/1 [00:00<00:00,  1.56ba/s]
#1: 100%|██████████| 1/1 [00:00<00:00,  1.71ba/s]


#3: 100%|██████████| 1/1 [00:00<00:00,  1.77ba/s]


# Metrics and Trainer

In [9]:
from layoutlmft.data import DataCollatorForKeyValueExtraction

In [12]:
# Data collator
# Built with the same tokenizer and `padding` setting used when the splits were tokenized.
data_collator = DataCollatorForKeyValueExtraction(
    tokenizer,
    pad_to_multiple_of=8,
    padding=padding,
    max_length=512,
)

# Metrics
# seqeval metric; `compute_metrics` reads its overall_* precision/recall/f1/accuracy entries.
metric = load_metric("seqeval")

def compute_metrics(p):
    """Turn raw model predictions into overall seqeval scores.

    Args:
        p: Pair ``(predictions, labels)``. ``predictions`` holds per-token class scores with
            the class dimension on axis 2 (it is reduced with ``np.argmax(..., axis=2)``);
            ``labels`` holds the matching label ids, where ``-100`` marks positions to skip.

    Returns:
        dict with ``precision``, ``recall``, ``f1`` and ``accuracy``, taken from the
        ``overall_*`` entries returned by ``metric.compute``.
    """
    scores, label_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    # Drop every position whose reference label is the ignored index (-100), then map the
    # remaining ids to label names for both predictions and references.
    true_predictions = []
    true_labels = []
    for predicted_row, label_row in zip(predicted_ids, label_ids):
        kept = [(pred, gold) for pred, gold in zip(predicted_row, label_row) if gold != -100]
        true_predictions.append([label_list[pred] for pred, _ in kept])
        true_labels.append([label_list[gold] for _, gold in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)
    # Only the overall scores are reported. The previous per-entity unpacking lived in an
    # `if False:` branch that could never run, so it has been removed.
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

# Initialize our Trainer
# Outputs go to the current directory, batches hold one example per device, and the
# evaluation strategy is "steps".
training_args = TrainingArguments(
    output_dir=".",
    per_device_train_batch_size=1,
    evaluation_strategy="steps",
)
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)


# Training

In [11]:
# Run training; the returned result exposes the run statistics as `.metrics`.
train_result = trainer.train()
metrics = train_result.metrics
trainer.save_model()  # Saves the tokenizer too for easy upload

# Record the number of rows in the tokenized training split next to the trainer's metrics.
metrics["train_samples"] = len(train_dataset)

# Log and save the training metrics, then save the trainer state.
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()

 21%|██        | 500/2400 [47:36<4:15:41,  8.07s/it]

{'loss': 2.3145, 'learning_rate': 3.958333333333333e-05, 'epoch': 0.62}


 42%|████▏     | 1000/2400 [1:46:25<3:35:52,  9.25s/it]

{'loss': 1.0954, 'learning_rate': 2.916666666666667e-05, 'epoch': 1.25}


 62%|██████▎   | 1500/2400 [2:39:44<1:16:39,  5.11s/it]

{'loss': 0.6397, 'learning_rate': 1.8750000000000002e-05, 'epoch': 1.88}


 83%|████████▎ | 2000/2400 [3:26:02<35:23,  5.31s/it]

{'loss': 0.4688, 'learning_rate': 8.333333333333334e-06, 'epoch': 2.5}


100%|██████████| 2400/2400 [4:03:44<00:00,  6.09s/it]


{'train_runtime': 14624.3808, 'train_samples_per_second': 0.164, 'epoch': 3.0}


# Evaluation

In [12]:
# Evaluate on the eval dataset given to the Trainer. Note that `metrics` is rebound here,
# replacing the training metrics from the previous section.
metrics = trainer.evaluate()

# Record the number of rows in the tokenized validation split.
metrics["eval_samples"] = len(eval_dataset)

# Log and save the evaluation metrics.
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)

  _warn_prf(average, modifier, msg_start, len(result))
100%|██████████| 13/13 [02:32<00:00, 11.76s/it]


In [13]:
# Display the evaluation metrics computed in the previous cell.
metrics

{'eval_loss': 0.404183566570282,
 'eval_precision': 0.9315831344470963,
 'eval_recall': 0.9390537289494787,
 'eval_f1': 0.9353035143769969,
 'eval_accuracy': 0.9501372369624885,
 'eval_runtime': 166.5492,
 'eval_samples_per_second': 0.6,
 'epoch': 3.0,
 'eval_samples': 100}

# Test

In [13]:
# `os` is needed for the output path below and is not imported anywhere else in the
# notebook; importing it here keeps this cell runnable on its own.
import os

# Run the model on the test split. `metrics` is rebound to the test metrics.
predictions, labels, metrics = trainer.predict(test_dataset)
predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens): keep only positions whose reference label is not
# -100 and map the predicted ids to label names.
true_predictions = [
    [label_list[pred_id] for (pred_id, label_id) in zip(prediction, label) if label_id != -100]
    for prediction, label in zip(predictions, labels)
]

trainer.log_metrics("test", metrics)
trainer.save_metrics("test", metrics)

# Save predictions: one line per test row, predicted label names separated by spaces.
output_test_predictions_file = os.path.join(training_args.output_dir, "test_predictions.txt")
if trainer.is_world_process_zero():
    with open(output_test_predictions_file, "w") as writer:
        for prediction in true_predictions:
            writer.write(" ".join(prediction) + "\n")

  _warn_prf(average, modifier, msg_start, len(result))


In [14]:
# Display the test metrics returned by `trainer.predict` in the previous cell.
metrics

{'test_loss': 0.4203619062900543,
 'test_precision': 0.9101620029455081,
 'test_recall': 0.9251497005988024,
 'test_f1': 0.9175946547884187,
 'test_accuracy': 0.9448217317487266,
 'test_runtime': 139.6906,
 'test_samples_per_second': 0.716}

# Inference

In [10]:
import torch
from layoutlmft.data import DataCollatorForKeyValueExtraction

In [11]:
# Build the data collator for the inference section. The arguments are identical to the
# collator created in the "Metrics and Trainer" section, so this section can run without it.
data_collator = DataCollatorForKeyValueExtraction(
    tokenizer,
    pad_to_multiple_of=8,
    padding=padding,
    max_length=512,
)

In [12]:
# List the fields available on one tokenized test row.
test_dataset[0].keys()

dict_keys(['attention_mask', 'bbox', 'image', 'input_ids', 'labels', 'overflow_to_sample_mapping', 'token_type_ids'])

In [72]:
# Collate the first tokenized test row into a batch of one and pull out the model inputs.
batch = data_collator([test_dataset[0]])
input_ids, bbox, image, att_mask, token_type_ids = (
    batch[field]
    for field in ("input_ids", "bbox", "image", "attention_mask", "token_type_ids")
)

# Forward pass without gradient tracking. Arguments are passed positionally, following the
# parameter order noted by the author:
# input_ids, bbox, image, attention_mask, token_type_ids, position_ids, head_mask,
# inputs_embeds, labels, output_attentions, output_hidden_states, return_dict
with torch.no_grad():
    y = model(input_ids, bbox, image, att_mask, token_type_ids)

## Using Ray

In [54]:
import requests
import json

In [73]:
# Make the batch JSON-serialisable: every torch.Tensor value becomes a nested Python list.
for field, value in batch.items():
    if not isinstance(value, torch.Tensor):
        continue
    batch[field] = value.tolist()

# The image entry is handled separately: its first element is taken and converted to a list.
# This cell mutates `batch` in place, so it is meant to be run once per collated batch.
first_image = batch["image"][0]
batch["image"] = first_image.tolist()

In [101]:
# Serialise the batch to JSON and POST it to the locally served endpoint.
payload = json.dumps(batch)
resp = requests.post("http://localhost:8000/receipt", data=payload)

# Show the response object (status code) as the cell output.
resp

<Response [200]>

In [105]:
# Decode the reply as JSON rather than eval()-ing it: eval() would execute whatever Python
# expression the server (or anything else answering on that port) sends back.
# NOTE(review): assumes the endpoint replies with JSON - confirm against the serving code.
resp.json()

{'class_index': [34,
  20,
  45,
  45,
  45,
  45,
  28,
  53,
  53,
  53,
  25,
  50,
  50,
  50,
  50,
  50,
  23,
  48,
  48,
  48,
  48,
  48,
  48,
  48,
  48,
  48,
  48,
  48,
  48,
  48,
  48,
  13,
  13,
  13,
  5,
  34,
  34,
  1,
  13,
  13,
  13,
  7,
  7,
  7,
  19,
  44,
  44,
  44,
  44,
  44,
  16,
  40,
  40,
  40,
  40,
  40,
  40,
  34,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
  25,
