In [1]:
%matplotlib widget
from transformers import VisionEncoderDecoderModel
from transformers import TrOCRProcessor
import requests
from PIL import Image
import matplotlib.pyplot as plt

# Fine-tuning TrOCR on the IAM sentences dataset

In [2]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

data_path = "data_original"

# Build (image path, transcription) pairs from the index file.
# Each data row is space-separated: the first field is the sample id
# (e.g. "d06-086-s00-..."), the last field is the transcription with "|"
# standing in for spaces between words.
texts = []
paths = []
with open(os.path.join(data_path, "ascii/sentences.txt"), "r") as f:
    for line in f:
        # skip header comments and blank lines (a blank line has no id to split)
        if line.startswith("#") or not line.strip():
            continue
        splits = line.rstrip("\n").split(" ")
        filename = splits[0]
        id_parts = filename.split("-")
        filegroup = id_parts[0]
        filesubgroup = filegroup + "-" + id_parts[1]
        # images live under sentences/<group>/<group>-<subgroup>/<id>.png
        filepath = os.path.join(data_path, "sentences", filegroup, filesubgroup, filename + ".png")
        texts.append(splits[-1].replace("|", " "))
        paths.append(filepath)

df = pd.DataFrame({"path": paths, "text": texts})

# fixed seed so the split (and the tiny subsets below) are the same on every run
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
# keep only 5 rows per split for a quick run, and renumber the rows from zero
train_df = train_df.iloc[:5].reset_index(drop=True)
test_df = test_df.iloc[:5].reset_index(drop=True)
train_df.head()

Unnamed: 0,path,text
0,data_original/sentences/d06/d06-086/d06-086-s0...,To drift
1,data_original/sentences/n04/n04-015/n04-015-s0...,We can do with the extra money .
2,data_original/sentences/g06/g06-011f/g06-011f-...,By the end of the month he still delighted in ...
3,data_original/sentences/c06/c06-100/c06-100-s0...,"So all ends as you know it will , with the"
4,data_original/sentences/g06/g06-018i/g06-018i-...,"When the sailing season was past , he sent Pea..."


In [3]:
import torch
from torch.utils.data import Dataset
from PIL import Image

class IAMDataset(Dataset):
    """Dataset pairing sentence images with tokenized transcriptions.

    Args:
        df: DataFrame with a "path" column (image file paths) and a "text"
            column (transcriptions).
        processor: callable on an image (its result exposes `.pixel_values`)
            that also exposes a `.tokenizer` used to encode the text.
        max_target_length: fixed length every label sequence is padded or
            truncated to.
    """

    def __init__(self, df, processor, max_target_length=128):
        self.df = df
        self.processor = processor
        self.max_target_length = max_target_length

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return {"pixel_values": tensor, "labels": tensor} for the idx-th row."""
        # positional lookup, so it works regardless of the DataFrame's index labels
        path = self.df["path"].iloc[idx]
        text = self.df["text"].iloc[idx]
        # prepare image (i.e. resize + normalize); close the file handle once converted
        with Image.open(path) as raw_image:
            image = raw_image.convert("RGB")
        pixel_values = self.processor(image, return_tensors="pt").pixel_values
        # encode the text; truncate as well as pad so every label has the same
        # length and samples can be stacked into a batch
        labels = self.processor.tokenizer(text,
                                          padding="max_length",
                                          truncation=True,
                                          max_length=self.max_target_length).input_ids
        # important: make sure that PAD tokens are ignored by the loss function
        pad_token_id = self.processor.tokenizer.pad_token_id
        labels = [label if label != pad_token_id else -100 for label in labels]

        # drop only the leading batch dimension added by return_tensors="pt"
        encoding = {"pixel_values": pixel_values.squeeze(0), "labels": torch.tensor(labels)}
        return encoding

In [4]:
from transformers import TrOCRProcessor

# One processor is shared by both splits.
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
train_dataset = IAMDataset(df=train_df, processor=processor)
eval_dataset = IAMDataset(df=test_df, processor=processor)

print("Number of training examples:", len(train_dataset))
print("Number of validation examples:", len(eval_dataset))

# Sanity check: show the tensor shape of every field of the first training sample.
encoding = train_dataset[0]
for field in encoding:
    print(field, encoding[field].shape)

Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


Number of training examples: 5
Number of validation examples: 5
pixel_values torch.Size([3, 384, 384])
labels torch.Size([128])


In [5]:
from torch.utils.data import DataLoader

# One sample per step; only the training split is shuffled.
train_dataloader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=1)
eval_dataloader = DataLoader(dataset=eval_dataset, batch_size=1)

In [6]:
from transformers import VisionEncoderDecoderModel
import torch

# Everything runs on the CPU. To pick a GPU when one is available, use instead:
#   device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = "cpu"

# Load the "microsoft/trocr-base-stage1" checkpoint and place it on `device`.
model = VisionEncoderDecoderModel.from_pretrained(
    "microsoft/trocr-base-stage1"
)
model.to(device)

Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-base-stage1 and are newly initialized: ['encoder.pooler.dense.weight', 'encoder.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


VisionEncoderDecoderModel(
  (encoder): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=False)
              (key): Linear(in_features=768, out_features=768, bias=False)
              (value): Linear(in_features=768, out_features=768, bias=False)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_fea

In [7]:
cfg = model.config
tok = processor.tokenizer

# Special-token ids: start/pad are used to create the decoder_input_ids from
# the labels, and eos is the end-of-sequence token.
cfg.decoder_start_token_id = tok.cls_token_id
cfg.pad_token_id = tok.pad_token_id
cfg.eos_token_id = tok.sep_token_id

# Keep the top-level vocab size in sync with the decoder's.
cfg.vocab_size = cfg.decoder.vocab_size

# Beam search parameters.
cfg.num_beams = 4
cfg.max_length = 64
cfg.early_stopping = True
cfg.no_repeat_ngram_size = 3
cfg.length_penalty = 2.0

In [8]:
from datasets import load_metric

# Character-error-rate ("cer") metric object; compute_cer calls its .compute().
# NOTE(review): the cell output echoes this line, which looks like the tail of a
# warning raised here — confirm whether `load_metric` is deprecated in the
# installed `datasets` version.
cer_metric = load_metric("cer")
def compute_cer(pred_ids, label_ids):
    """Decode predictions and labels to text and score them with `cer_metric`.

    Args:
        pred_ids: batch of predicted token ids.
        label_ids: batch of label token ids in which ignored positions are
            marked with -100. The caller's object is left unmodified.

    Returns:
        Whatever `cer_metric.compute` returns for the decoded strings.
    """
    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)

    # Work on a copy so the -100 markers in the caller's labels survive this call.
    labels = label_ids.clone() if hasattr(label_ids, "clone") else label_ids.copy()
    # -100 is not a valid token id; swap it for PAD so the labels can be decoded
    labels[labels == -100] = processor.tokenizer.pad_token_id
    label_str = processor.batch_decode(labels, skip_special_tokens=True)

    cer = cer_metric.compute(predictions=pred_str, references=label_str)

    return cer

  cer_metric = load_metric("cer")


In [9]:
from transformers import AdamW
from tqdm.notebook import tqdm

optimizer = AdamW(model.parameters(), lr=5e-5)

for epoch in range(10):  # ten passes over the training set
    # ---- training pass ----
    model.train()
    train_loss = 0.0
    for batch in tqdm(train_dataloader):
        # move every tensor of the batch onto the target device
        batch = {k: v.to(device) for k, v in batch.items()}

        # forward pass, backprop, parameter update, then clear the gradients
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        train_loss += loss.item()

    # mean loss per batch for this epoch
    print(f"Loss after epoch {epoch}:", train_loss/len(train_dataloader))

    # ---- validation pass ----
    model.eval()
    valid_cer = 0.0
    with torch.no_grad():
        for batch in tqdm(eval_dataloader):
            # generate token ids for the batch, then score them against the labels
            outputs = model.generate(batch["pixel_values"].to(device))
            valid_cer += compute_cer(pred_ids=outputs, label_ids=batch["labels"])

    # mean CER per batch for this epoch
    print("Validation CER:", valid_cer / len(eval_dataloader))

# write the fine-tuned model into the current working directory
model.save_pretrained(".")



  0%|          | 0/5 [00:00<?, ?it/s]

Loss after epoch 0: 9.277596378326416


  0%|          | 0/5 [00:00<?, ?it/s]



Validation CER: 4.5310917687513435


  0%|          | 0/5 [00:00<?, ?it/s]

Loss after epoch 1: 5.823680019378662


  0%|          | 0/5 [00:00<?, ?it/s]

Validation CER: 0.9841285192349023


  0%|          | 0/5 [00:00<?, ?it/s]

Loss after epoch 2: 4.34413890838623


  0%|          | 0/5 [00:00<?, ?it/s]

Validation CER: 3.7586266924564797


  0%|          | 0/5 [00:00<?, ?it/s]

Loss after epoch 3: 4.1922287940979


  0%|          | 0/5 [00:00<?, ?it/s]

Validation CER: 0.9298603051794541


  0%|          | 0/5 [00:00<?, ?it/s]

Loss after epoch 4: 3.708607578277588


  0%|          | 0/5 [00:00<?, ?it/s]

Validation CER: 0.8219922630560929


  0%|          | 0/5 [00:00<?, ?it/s]

Loss after epoch 5: 3.4860182285308836


  0%|          | 0/5 [00:00<?, ?it/s]

Validation CER: 1.0946980442725125


  0%|          | 0/5 [00:00<?, ?it/s]

Loss after epoch 6: 3.5648303031921387


  0%|          | 0/5 [00:00<?, ?it/s]

Validation CER: 0.8208188265635075


  0%|          | 0/5 [00:00<?, ?it/s]

Loss after epoch 7: 3.3525358200073243


  0%|          | 0/5 [00:00<?, ?it/s]

Validation CER: 0.9685858585858587


  0%|          | 0/5 [00:00<?, ?it/s]

Loss after epoch 8: 3.3007928848266603


  0%|          | 0/5 [00:00<?, ?it/s]

Validation CER: 1.2821813883516011


  0%|          | 0/5 [00:00<?, ?it/s]

Loss after epoch 9: 3.079562282562256


  0%|          | 0/5 [00:00<?, ?it/s]

Validation CER: 1.4864646464646465


In [23]:
# Qualitative check on one training sample: compare the decoded label with the
# model's generated text.
sample = train_dataset[1]  # fetch once; each access re-opens and re-processes the image
image = sample["pixel_values"].unsqueeze(0).to(device)
label_ids = sample["labels"].unsqueeze(0).to(device)
pred_ids = model.generate(image)

print(label_ids)
print(pred_ids)

# Replace the -100 loss-ignore markers with PAD while keeping the (1, seq_len)
# batch shape. Boolean-mask indexing would flatten the tensor to 1-D, and
# batch_decode would then return one string per token instead of the sentence.
label_ids = label_ids.masked_fill(label_ids == -100, processor.tokenizer.pad_token_id)
label = processor.batch_decode(label_ids, skip_special_tokens=True)
pred = processor.batch_decode(pred_ids, skip_special_tokens=True)
print("Label:", label)
print("Prediction:", pred)


tensor([[   0,  170,   64,  109,   19,    5, 1823,  418,  479,    2, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100]])
tensor([[   0,    0,    0,  109,    0,  109,  109,    0,    0,   64,    0,  109,
           64,  109,    0,   64,  109,  109,  109,