In [1]:
pip install pdf2image pytesseract

Collecting pdf2image
  Using cached pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Collecting pytesseract
  Using cached pytesseract-0.3.10-py3-none-any.whl.metadata (11 kB)
Using cached pdf2image-1.17.0-py3-none-any.whl (11 kB)
Using cached pytesseract-0.3.10-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract, pdf2image
Successfully installed pdf2image-1.17.0 pytesseract-0.3.10
Note: you may need to restart the kernel to use updated packages.


In [2]:
from datasets import load_dataset

# Load the FUNSD dataset
dataset = load_dataset("nielsr/funsd")

# Split the dataset into train and validation sets
train_dataset = dataset["train"]
val_dataset = dataset["test"]


In [5]:
from transformers import LayoutLMv3Processor
from PIL import Image

# Load the LayoutLMv3 processor with apply_ocr set to False
processor = LayoutLMv3Processor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)

# Define the preprocessing function
def preprocess_data(examples):
    # Load images
    images = [Image.open(image_path).convert("RGB") for image_path in examples["image_path"]]

    # Apply processor to images and words
    encoding = processor(images=images, text=examples["words"], boxes=examples["bboxes"], word_labels=examples["ner_tags"], truncation=True, padding="max_length")
    
    return encoding

# Apply the preprocessing function to the dataset
train_dataset = train_dataset.map(preprocess_data, batched=True, remove_columns=train_dataset.column_names)
val_dataset = val_dataset.map(preprocess_data, batched=True, remove_columns=val_dataset.column_names)


Map:   0%|          | 0/149 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

In [11]:
from transformers import LayoutLMv3ForTokenClassification, TrainingArguments, Trainer
from itertools import chain

# Determine the correct label column name
label_column = "labels"  # Replace this with the correct column name based on your dataset

# Flatten the list of lists and get the unique labels
all_labels = list(chain(*train_dataset[label_column]))
num_labels = len(set(all_labels))

# Load the pre-trained LayoutLMv3 model
model = LayoutLMv3ForTokenClassification.from_pretrained("microsoft/layoutlmv3-base", num_labels=num_labels)

# Define the training arguments
training_args = TrainingArguments(
    output_dir="./layoutlmv3-finetuned-funsd",
    evaluation_strategy="epoch",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=5,
    save_steps=10_000,
    save_total_limit=2,
    logging_dir='./logs',
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Start the fine-tuning
trainer.train()


Some weights of LayoutLMv3ForTokenClassification were not initialized from the model checkpoint at microsoft/layoutlmv3-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/375 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

{'eval_loss': 0.7357823252677917, 'eval_runtime': 12.466, 'eval_samples_per_second': 4.011, 'eval_steps_per_second': 2.005, 'epoch': 1.0}


  0%|          | 0/25 [00:00<?, ?it/s]

{'eval_loss': 0.6361870765686035, 'eval_runtime': 25.3564, 'eval_samples_per_second': 1.972, 'eval_steps_per_second': 0.986, 'epoch': 2.0}


  0%|          | 0/25 [00:00<?, ?it/s]

{'eval_loss': 0.5590593218803406, 'eval_runtime': 23.5872, 'eval_samples_per_second': 2.12, 'eval_steps_per_second': 1.06, 'epoch': 3.0}


  0%|          | 0/25 [00:00<?, ?it/s]

{'eval_loss': 0.5504087805747986, 'eval_runtime': 27.6022, 'eval_samples_per_second': 1.811, 'eval_steps_per_second': 0.906, 'epoch': 4.0}


  0%|          | 0/25 [00:00<?, ?it/s]

{'eval_loss': 0.5841979384422302, 'eval_runtime': 34.3212, 'eval_samples_per_second': 1.457, 'eval_steps_per_second': 0.728, 'epoch': 5.0}
{'train_runtime': 1547.9273, 'train_samples_per_second': 0.481, 'train_steps_per_second': 0.242, 'train_loss': 0.6079742431640625, 'epoch': 5.0}


TrainOutput(global_step=375, training_loss=0.6079742431640625, metrics={'train_runtime': 1547.9273, 'train_samples_per_second': 0.481, 'train_steps_per_second': 0.242, 'total_flos': 196387740672000.0, 'train_loss': 0.6079742431640625, 'epoch': 5.0})

In [12]:
# Save the model locally
model.save_pretrained("./layoutlmv3-finetuned-funsd")
processor.save_pretrained("./layoutlmv3-finetuned-funsd")


[]

In [13]:
# Push to Hugging Face Hub
model.push_to_hub("nyati29/layoutlmv3-finetuned-funsd")
processor.push_to_hub("nyati29/layoutlmv3-finetuned-funsd")

model.safetensors:   0%|          | 0.00/501M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


CommitInfo(commit_url='https://huggingface.co/nyati29/layoutlmv3-finetuned-funsd/commit/da13ac5a9840d328643103e72a445d097d38dbe6', commit_message='Upload processor', commit_description='', oid='da13ac5a9840d328643103e72a445d097d38dbe6', pr_url=None, pr_revision=None, pr_num=None)