In [1]:
from transformers import LayoutLMTokenizer, LayoutLMForTokenClassification, Trainer, TrainingArguments
from datasets import load_from_disk, Dataset, DatasetDict
from src.funsd import collect_funsd_data

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Add this before your dataset generation
label_list = ["header", "question", "answer", "other", "background"]  # Your actual labels
label2id = {label: idx for idx, label in enumerate(label_list)}

In [None]:
# Process and load the dataset
train_data = collect_funsd_data("data/funsd/training_data", label2id)
test_data = collect_funsd_data("data/funsd/testing_data", label2id)
train_dataset = Dataset.from_list(train_data)
test_dataset = Dataset.from_list(test_data)

dataset = DatasetDict({
    "train": train_dataset,
    "test": test_dataset
})

In [5]:
from huggingface_hub import snapshot_download
snapshot_download(repo_id="microsoft/layoutlm-large-uncased", local_dir="models/large")

Fetching 9 files: 100%|██████████| 9/9 [01:12<00:00,  8.02s/it]


'/home/meyert11/Documents/layoulm1-revived/models/large'

In [6]:
tokenizer = LayoutLMTokenizer.from_pretrained("models/large")
model = LayoutLMForTokenClassification.from_pretrained("models/large", 
    num_labels=len(label_list)  # Should match your actual number of classes [4]
)

Some weights of LayoutLMForTokenClassification were not initialized from the model checkpoint at models/large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
def preprocess(example):
    words = example["words"]
    bboxes = example["bboxes"]
    labels = example["labels"]

    tokens = []
    token_boxes = []
    token_labels = []

    for word, box, label in zip(words, bboxes, labels):
        word_tokens = tokenizer.tokenize(word)
        tokens.extend(word_tokens)
        token_boxes.extend([box] * len(word_tokens))
        token_labels.extend([label] * len(word_tokens))

    # Add special tokens ([CLS] and [SEP])
    tokens = [tokenizer.cls_token] + tokens + [tokenizer.sep_token]
    token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]]
    token_labels = [-100] + token_labels + [-100]  # -100 so special tokens are ignored in loss

    encoding = tokenizer(
        tokens,
        is_split_into_words=True,
        padding="max_length",
        truncation=True,
        max_length=512,
        return_attention_mask=True,
    )

    # Pad boxes and labels if needed
    pad_length = 512 - len(token_boxes)
    if pad_length > 0:
        token_boxes += [[0, 0, 0, 0]] * pad_length
        token_labels += [-100] * pad_length

    encoding["bbox"] = token_boxes[:512]
    encoding["labels"] = token_labels[:512]

    return encoding


In [8]:
tokenized_dataset = dataset.map(preprocess)
tokenized_dataset.save_to_disk("data/funsd_tokenized")

Map: 100%|██████████| 149/149 [00:02<00:00, 65.15 examples/s]
Map: 100%|██████████| 50/50 [00:00<00:00, 57.16 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 149/149 [00:00<00:00, 14987.56 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 50/50 [00:00<00:00, 6862.63 examples/s]


In [9]:
# Load the tokenized dataset from disk
# tokenized_dataset = load_from_disk("/home/meyert11/Documents/DocuVision/data/funsd_tokenized")

In [10]:
print("Tokenized dataset:", tokenized_dataset)
# Training arguments
training_args = TrainingArguments(
    output_dir="./finetuned-funsd",
    per_device_train_batch_size=2,
    num_train_epochs=3,
    logging_steps=10,
    save_steps=100,
    # eval_strategy="steps",
    fp16=True,  # RTX 3080 supports mixed precision
)

Tokenized dataset: DatasetDict({
    train: Dataset({
        features: ['id', 'words', 'bboxes', 'labels', 'image_path', 'input_ids', 'token_type_ids', 'attention_mask', 'bbox'],
        num_rows: 149
    })
    test: Dataset({
        features: ['id', 'words', 'bboxes', 'labels', 'image_path', 'input_ids', 'token_type_ids', 'attention_mask', 'bbox'],
        num_rows: 50
    })
})


In [11]:
print("Training arguments set:", training_args)
# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"]    
)

Training arguments set: TrainingArguments(
_n_gpu=2,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'gradient_accumulation_kwargs': None},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_steps=None,
evaluation_strategy=IntervalStrategy.NO,
fp16=True,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'min_num_params': 0, '

In [None]:
# Train
trainer.train()



Step,Training Loss
10,1.3763
20,1.1783
30,1.1368
40,1.0782
50,0.9569
60,0.9468
70,0.9689
80,0.7915
90,0.767
100,0.7231




TrainOutput(global_step=114, training_loss=0.9608395225123355, metrics={'train_runtime': 78.4281, 'train_samples_per_second': 5.699, 'train_steps_per_second': 1.454, 'total_flos': 416577541948416.0, 'train_loss': 0.9608395225123355, 'epoch': 3.0})