# Document AI: Fine-tuning LayoutLM for document-parsing using Hugging Face Transformers 


In [None]:
!sudo apt install -y tesseract-ocr
!pip install pytesseract transformers datasets seqeval

In [12]:
model_id="microsoft/layoutlm-base-uncased"
processor_id="microsoft/layoutlmv2-base-uncased"
dataset_id ="nielsr/funsd"

In [113]:
from datasets import load_dataset


dataset = load_dataset(dataset_id)
dataset

Reusing dataset funsd (/home/ubuntu/.cache/huggingface/datasets/nielsr___funsd/funsd/1.0.0/8b0472b536a2dcb975d59a4fb9d6fea4e6a1abe260b7fed6f75301e168cbe595)


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'words', 'bboxes', 'ner_tags', 'image_path'],
        num_rows: 149
    })
    test: Dataset({
        features: ['id', 'words', 'bboxes', 'ner_tags', 'image_path'],
        num_rows: 50
    })
})

get labels

In [114]:
labels = dataset['train'].features['ner_tags'].feature.names
print(f"Available labels: {labels}")

id2label = {v: k for v, k in enumerate(labels)}
label2id = {k: v for v, k in enumerate(labels)}

Available labels: ['O', 'B-HEADER', 'I-HEADER', 'B-QUESTION', 'I-QUESTION', 'B-ANSWER', 'I-ANSWER']


In [115]:
from transformers import AutoTokenizer, LayoutLMForTokenClassification, LayoutLMv2Processor
import torch

# load model with correct number of labels and mapping
model = LayoutLMForTokenClassification.from_pretrained(model_id, num_labels=len(labels), label2id=label2id, id2label=id2label)

# use LayoutLMv2 processor
processor = LayoutLMv2Processor.from_pretrained(processor_id,apply_ocr=False)


Some weights of the model checkpoint at microsoft/layoutlm-base-uncased were not used when initializing LayoutLMForTokenClassification: ['cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing LayoutLMForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LayoutLMForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LayoutLMForTokenClassification were not initialized from the model checkpoint at microsoft

In [118]:
from PIL import Image
from functools import partial
from datasets import Features, Sequence, ClassLabel, Value, Array2D, Array3D
from torch.utils.data import DataLoader


# we need to define custom features
features = Features({
    'input_ids': Sequence(feature=Value(dtype='int64')),
    'attention_mask': Sequence(Value(dtype='int64')),
    'token_type_ids': Sequence(Value(dtype='int64')),
    'bbox': Array2D(dtype="int64", shape=(512, 4)),
    'labels': Sequence(ClassLabel(names=labels)),
})

def process(sample, processor=None):
    encoding = processor(
        Image.open(sample["image_path"]).convert("RGB"),
        sample["words"],
        boxes=sample["bboxes"],
        word_labels=sample["ner_tags"],
        padding="max_length",
        truncation=True,
    )
    del encoding["image"]
    return encoding


proc_dataset = dataset.map(
    partial(process, processor=processor), remove_columns=["image_path", "words", "ner_tags", "id", "bboxes"],features=features
).with_format("torch")
# dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'bbox','lables'])
print(proc_dataset["train"].features.keys())

# # create DataLoader with batch size
# train_dataloader = DataLoader(proc_dataset["train"], batch_size=4, shuffle=True)
# test_dataloader = DataLoader(proc_dataset["test"], batch_size=2)


  0%|          | 0/149 [00:00<?, ?ex/s]

  0%|          | 0/50 [00:00<?, ?ex/s]

dict_keys(['input_ids', 'attention_mask', 'token_type_ids', 'bbox', 'labels'])


In [142]:
import evaluate
import numpy as np 
# Metrics
metric = evaluate.load("seqeval")

ner_labels=list(model.config.id2label.values())

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    all_predictions = []
    all_labels = []
    for prediction, label in zip(predictions, labels):
        for predicted_idx, label_idx in zip(prediction, label):
            if label_idx == -100:
                continue
            all_predictions.append(ner_labels[predicted_idx])
            all_labels.append(ner_labels[label_idx])
    return metric.compute(predictions=[all_predictions], references=[all_labels])


In [147]:
from transformers import Trainer, TrainingArguments

# define training args
training_args = TrainingArguments(
    output_dir="test/",
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=4,
    warmup_steps=10,
    save_total_limit=2,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    fp16=True,
    logging_dir="test/logs",
    learning_rate=5e-5,
    load_best_model_at_end=True,
    metric_for_best_model="overall_f1",
    
)

# create Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=proc_dataset["train"],
    eval_dataset=proc_dataset["test"],
    compute_metrics=compute_metrics,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend


In [148]:
trainer.train()

***** Running training *****
  Num examples = 149
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 190


Epoch,Training Loss,Validation Loss,Answer,Header,Question,Overall Precision,Overall Recall,Overall F1,Overall Accuracy
1,No log,1.198082,"{'precision': 0.6900523560209424, 'recall': 0.8145859085290482, 'f1': 0.7471655328798186, 'number': 809}","{'precision': 0.49624060150375937, 'recall': 0.5546218487394958, 'f1': 0.523809523809524, 'number': 119}","{'precision': 0.8104693140794224, 'recall': 0.8431924882629108, 'f1': 0.8265071329958583, 'number': 1065}",0.739071,0.81435,0.774887,0.808896
2,No log,1.127327,"{'precision': 0.6597724922440538, 'recall': 0.788627935723115, 'f1': 0.7184684684684686, 'number': 809}","{'precision': 0.4420289855072464, 'recall': 0.5126050420168067, 'f1': 0.47470817120622566, 'number': 119}","{'precision': 0.8003820439350525, 'recall': 0.7868544600938967, 'f1': 0.7935606060606062, 'number': 1065}",0.714219,0.771199,0.741616,0.798155
3,No log,1.313972,"{'precision': 0.7425997425997426, 'recall': 0.7132262051915945, 'f1': 0.7276166456494325, 'number': 809}","{'precision': 0.43283582089552236, 'recall': 0.48739495798319327, 'f1': 0.45849802371541504, 'number': 119}","{'precision': 0.7814451382694023, 'recall': 0.8225352112676056, 'f1': 0.8014638609332113, 'number': 1065}",0.743602,0.758154,0.750807,0.780995
4,No log,1.230214,"{'precision': 0.6932907348242812, 'recall': 0.8046971569839307, 'f1': 0.7448512585812357, 'number': 809}","{'precision': 0.4198473282442748, 'recall': 0.46218487394957986, 'f1': 0.43999999999999995, 'number': 119}","{'precision': 0.8133704735376045, 'recall': 0.8225352112676056, 'f1': 0.8179271708683474, 'number': 1065}",0.736842,0.793778,0.764251,0.802241
5,No log,1.16023,"{'precision': 0.7191011235955056, 'recall': 0.7911001236093943, 'f1': 0.7533843437316068, 'number': 809}","{'precision': 0.42953020134228187, 'recall': 0.5378151260504201, 'f1': 0.47761194029850745, 'number': 119}","{'precision': 0.8119180633147114, 'recall': 0.8187793427230047, 'f1': 0.8153342683496961, 'number': 1065}",0.745859,0.790768,0.767657,0.807728
6,No log,1.213809,"{'precision': 0.717607973421927, 'recall': 0.8009888751545118, 'f1': 0.7570093457943926, 'number': 809}","{'precision': 0.40236686390532544, 'recall': 0.5714285714285714, 'f1': 0.4722222222222222, 'number': 119}","{'precision': 0.8108614232209738, 'recall': 0.8131455399061033, 'f1': 0.8120018752930145, 'number': 1065}",0.739252,0.793778,0.765546,0.806444
7,No log,1.190648,"{'precision': 0.7051835853131749, 'recall': 0.8071693448702101, 'f1': 0.7527377521613834, 'number': 809}","{'precision': 0.4397163120567376, 'recall': 0.5210084033613446, 'f1': 0.47692307692307695, 'number': 119}","{'precision': 0.8256704980842912, 'recall': 0.8093896713615023, 'f1': 0.8174490279753437, 'number': 1065}",0.747039,0.791269,0.768519,0.809946
8,No log,1.197716,"{'precision': 0.7095032397408207, 'recall': 0.8121137206427689, 'f1': 0.7573487031700288, 'number': 809}","{'precision': 0.45, 'recall': 0.5294117647058824, 'f1': 0.48648648648648646, 'number': 119}","{'precision': 0.8297067171239356, 'recall': 0.8234741784037559, 'f1': 0.8265786993402451, 'number': 1065}",0.752237,0.801305,0.775996,0.814382
9,No log,1.205558,"{'precision': 0.7213656387665198, 'recall': 0.8096415327564895, 'f1': 0.762958648806057, 'number': 809}","{'precision': 0.45323741007194246, 'recall': 0.5294117647058824, 'f1': 0.48837209302325585, 'number': 119}","{'precision': 0.8278457196613358, 'recall': 0.8262910798122066, 'f1': 0.8270676691729324, 'number': 1065}",0.757346,0.801806,0.778942,0.81555
10,No log,1.212314,"{'precision': 0.7240618101545254, 'recall': 0.8108776266996292, 'f1': 0.7650145772594752, 'number': 809}","{'precision': 0.4583333333333333, 'recall': 0.5546218487394958, 'f1': 0.5019011406844106, 'number': 119}","{'precision': 0.8270676691729323, 'recall': 0.8262910798122066, 'f1': 0.8266791921089715, 'number': 1065}",0.757805,0.803813,0.780131,0.814032


***** Running Evaluation *****
  Num examples = 50
  Batch size = 4
Trainer is attempting to log a value of "{'precision': 0.6900523560209424, 'recall': 0.8145859085290482, 'f1': 0.7471655328798186, 'number': 809}" of type <class 'dict'> for key "eval/ANSWER" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.49624060150375937, 'recall': 0.5546218487394958, 'f1': 0.523809523809524, 'number': 119}" of type <class 'dict'> for key "eval/HEADER" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.8104693140794224, 'recall': 0.8431924882629108, 'f1': 0.8265071329958583, 'number': 1065}" of type <class 'dict'> for key "eval/QUESTION" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Saving model checkpoint to

TrainOutput(global_step=190, training_loss=0.010297049346723054, metrics={'train_runtime': 135.3717, 'train_samples_per_second': 11.007, 'train_steps_per_second': 1.404, 'total_flos': 392053072128000.0, 'train_loss': 0.010297049346723054, 'epoch': 10.0})