## TrOCR Fine Tune


##### Install Necessary Libraries

In [1]:
# Install the PyTorch stack into the kernel's environment (quiet mode).
%pip install -q torch torchvision torchaudio
# NOTE(review): `datasets` is imported and the "cer" metric is loaded further
# down, so the line below probably has to be enabled on a fresh environment — confirm.
# %pip install -q datasets jiwer

Note: you may need to restart the kernel to use updated packages.


##### Import Necessary Libraries

In [2]:
import os, sys, itertools
os.environ['TOKENIZERS_PARALLELISM']='false'
# export CUDA_LAUNCH_BLOCKING=1
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

import numpy as np

import pandas as pd
from sklearn.model_selection import train_test_split

import cv2
from PIL import Image

import torch
from torch.utils.data import Dataset

import datasets
from datasets import load_dataset

import transformers
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import VisionEncoderDecoderModel, TrOCRProcessor, default_data_collator
from datasets import load_metric


from preprocess_image import PreprocessImage
from augment_image import AugmentImage

##### Ingest, Preprocess, & Split Dataset (into Training & Testing Datasets)

In [3]:
# Read the semicolon-separated label file, discard the exported index column,
# then remove the rows whose labels are known to be wrong.
df_train = (
    pd.read_csv('train/DataTrain.csv', delimiter=';')
    .drop(columns=['Unnamed: 0'])
    .drop(index=[126, 457, 600])  # delete the wrong labeled data
)
df_train.head()

Unnamed: 0,Vehicleregistrationplate,NameofFile
0,A7814,DataTrain1.png
1,B1074QO,DataTrain2.png
2,B1031QO,DataTrain3.png
3,B187EDA,DataTrain4.png
4,B1089VD,DataTrain5.png


##### Show First Samples in DataFrame

In [4]:
# Display the first 12 rows for a quick look at the labels and file names.
df_train.head(12)

Unnamed: 0,Vehicleregistrationplate,NameofFile
0,A7814,DataTrain1.png
1,B1074QO,DataTrain2.png
2,B1031QO,DataTrain3.png
3,B187EDA,DataTrain4.png
4,B1089VD,DataTrain5.png
5,B1972RBP,DataTrain6.png
6,AB2400WU,DataTrain7.png
7,AB6268YQ,DataTrain8.png
8,A8014VA,DataTrain9.png
9,B1554EJA,DataTrain10.png


##### Prepare the image and label dataset

###### Prepare Label & Images

In [5]:
# Extract the plate labels and the image file names as two parallel NumPy arrays.
label_array, path_train = (
    df_train[column].to_numpy()
    for column in ('Vehicleregistrationplate', 'NameofFile')
)

###### Train/Test Split

In [6]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible.
(
    train_path_array,
    test_path_array,
    train_label_array,
    test_label_array,
) = train_test_split(
    path_train,
    label_array,
    test_size=0.2,
    random_state=42,
)

###### Preprocess Test Images

In [7]:
# Run the project's preprocessing over the held-out images.
# NOTE(review): the exact meaning of padding_ratio / include_original is defined
# in preprocess_image.py, which is not visible here.
test_image = PreprocessImage(
    img_paths=test_path_array,
    output_pixel=384,
    padding_ratio=0.9,
    path_prefix='train',
    include_original=False,
)
# Keep an independent copy so later code cannot alter the preprocessor's own array.
test_img_array = test_image.img_array.copy()

Initial Preprocess: Add Padding:   0%|          | 0/160 [00:00<?, ?it/s]

Initial Preprocess: Add Padding: 100%|██████████| 160/160 [00:00<00:00, 749.63it/s]


###### Preprocess & Augment Train Images

In [8]:
# Run the project's preprocessing over the training images
# (same settings as the test set, except include_original=True).
train_image = PreprocessImage(
    img_paths=train_path_array,
    output_pixel=384,
    padding_ratio=0.9,
    path_prefix='train',
    include_original=True,
)
# train_image_array = train_image.gray_image(masked_img=np.array([]))
# Work on a copy of the preprocessed images and report its shape.
train_img_array = train_image.img_array.copy()
print(train_img_array.shape)

Initial Preprocess: Add Padding: 100%|██████████| 637/637 [00:00<00:00, 759.44it/s]

(637, 384, 384, 3)





In [9]:
# train_label_array = np.concatenate((train_label_array, train_label_array), axis=0)

In [10]:
# Sanity check: the number of labels should match the number of training images printed above.
train_label_array.shape

(637,)

##### Create Dataset Class

In [11]:
class BDC_Dataset(Dataset):
    """Map-style dataset pairing license-plate images with tokenized labels.

    Args:
        img_array: Indexable collection of images. Each image is passed
            through ``cv2.COLOR_BGR2RGB``, so it is expected to be in
            OpenCV's BGR channel order.
        label_array: Indexable collection of plate strings, aligned with
            ``img_array``.
        processor: Object exposing the TrOCR processor interface used here:
            callable on an image (returning ``.pixel_values``) and providing
            ``.tokenizer``.
        max_target_length: Length every label sequence is padded to.

    Each item is a dict with:
        ``pixel_values``: processor output for one image, without the
            leading batch dimension.
        ``labels``: token ids of the plate text, with PAD positions
            replaced by ``-100``.

    Tensors are returned on the CPU. Moving a batch to the model's device is
    left to the training / evaluation loop, so the dataset works on any
    hardware instead of requiring one hard-coded accelerator.
    """

    # Value written over PAD positions so the loss function ignores them.
    IGNORE_INDEX = -100

    def __init__(self, img_array, label_array, processor, max_target_length=12):
        self.img_array = img_array
        self.label_array = label_array
        self.processor = processor
        self.max_target_length = max_target_length

    def __len__(self):
        """Return the number of images in the dataset."""
        return len(self.img_array)

    def __getitem__(self, idx):
        """Return the encoded image and label at position ``idx``."""
        # get image + label
        image = self.img_array[idx]
        text = self.label_array[idx]

        # prepare image (i.e. resize + normalize)
        rgb_image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
        pixel_values = self.processor(rgb_image, return_tensors="pt").pixel_values

        # add labels (input_ids) by encoding the label
        tokenizer = self.processor.tokenizer
        token_ids = tokenizer(
            text, padding="max_length", max_length=self.max_target_length
        ).input_ids

        # important: make sure that PAD tokens are ignored by the loss function
        pad_id = tokenizer.pad_token_id
        labels = [
            token_id if token_id != pad_id else self.IGNORE_INDEX
            for token_id in token_ids
        ]

        return {
            # squeeze(0) removes only the batch dimension added by the processor.
            "pixel_values": pixel_values.squeeze(0),
            "labels": torch.tensor(labels),
        }

##### Basic Values/Constants

In [12]:
# Hub checkpoint the fine-tuning starts from.
MODEL_CKPT = "microsoft/trocr-base-printed"
# Name for the fine-tuned model: the checkpoint's last path component plus a project suffix.
MODEL_NAME = MODEL_CKPT.rsplit("/", 1)[-1] + "_bdc_license_plates_ocrV2"
# Number of training epochs.
NUM_OF_EPOCHS = 2

##### Instantiate Processor & Create Testing Dataset Instance (the training dataset is built inside the training loop)

In [13]:
# Load the processor that ships with the base checkpoint.
processor = TrOCRProcessor.from_pretrained(MODEL_CKPT)

# Wrap the held-out images and labels for evaluation.
test_ds = BDC_Dataset(
    img_array=test_img_array,
    label_array=test_label_array,
    processor=processor,
)

Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


##### Print Length of Testing Dataset

In [14]:
# The training dataset is not built at this point (it is created inside the
# training loop), so only the testing dataset size is reported.
# print(f"The training dataset has {len(train_ds)} samples in it.")
print(f"The testing dataset has {len(test_ds)} samples in it.")

The testing dataset has 160 samples in it.


##### Example of Input Data Shapes

In [15]:
# encoding = train_ds[0]

# for k,v in encoding.items():
#     print(k, " : ", v.shape)

##### Show Example

In [16]:
# image = Image.open(train_ds.root_dir + train_dataset['file_name'][0]).convert("RGB")

# image

##### Show Label for Above Example

In [17]:
# labels = encoding['labels']
# labels[labels == -100] = processor.tokenizer.pad_token_id
# label_str = processor.decode(labels, skip_special_tokens=True)
# print(label_str)

#### Instantiate Model

In [18]:
from torch.utils.data import DataLoader


# Batches of 4 over the test set, in dataset order (no shuffling) for evaluation.
eval_dataloader = DataLoader(dataset=test_ds, batch_size=4, shuffle=False)

In [19]:
# Start from the pretrained TrOCR checkpoint.
model = VisionEncoderDecoderModel.from_pretrained(MODEL_CKPT)

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
# device = 'mps:0'
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Left as the last expression so the notebook displays the model architecture.
model.to(device)

Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-base-printed and are newly initialized: ['encoder.pooler.dense.weight', 'encoder.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


VisionEncoderDecoderModel(
  (encoder): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=False)
              (key): Linear(in_features=768, out_features=768, bias=False)
              (value): Linear(in_features=768, out_features=768, bias=False)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_fea

##### Model Configuration Modifications

In [20]:
# --- Special tokens: take them from the processor's tokenizer ---
model.config.pad_token_id = processor.tokenizer.pad_token_id
model.config.eos_token_id = processor.tokenizer.sep_token_id
model.config.decoder_start_token_id = processor.tokenizer.cls_token_id

# --- Expose the decoder's vocabulary size on the top-level config ---
model.config.vocab_size = model.config.decoder.vocab_size

# --- Generation settings (beam search) ---
model.config.num_beams = 4
model.config.max_length = 12
model.config.length_penalty = 2.0
model.config.no_repeat_ngram_size = 3
model.config.early_stopping = True

##### Define Metrics Evaluation

In [21]:
# Character-level accuracy helper from the project-local `metrics` module.
from metrics import character_accuracy_np as char_acc 

# Character error rate (CER) metric used by compute_metrics.
# NOTE(review): the cell output echoes this line, which looks like a warning;
# `load_metric` appears to be deprecated in `datasets` — confirm before upgrading.
cer_metric = load_metric("cer")

def compute_metrics(pred_ids, label_ids):
    """Return the character error rate (CER) between predictions and labels.

    Args:
        pred_ids: Generated token ids, one row per sample.
        label_ids: Reference token ids, one row per sample, where ``-100``
            marks positions that stand in for PAD tokens.

    Returns:
        The value produced by ``cer_metric.compute`` for the decoded
        prediction / reference strings.

    The caller's ``label_ids`` is left untouched: the ``-100`` markers are
    replaced on a private copy, so the same batch can safely be reused
    (e.g. for a loss computation) after this call.
    """
    pad_token_id = processor.tokenizer.pad_token_id

    # Copy before restoring PAD ids; torch tensors expose .clone(), anything
    # else (NumPy arrays, nested lists) is copied through np.array.
    if hasattr(label_ids, "clone"):
        label_ids = label_ids.clone()
    else:
        label_ids = np.array(label_ids)
    label_ids[label_ids == -100] = pad_token_id

    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.batch_decode(label_ids, skip_special_tokens=True)

    return cer_metric.compute(predictions=pred_str, references=label_str)

  cer_metric = load_metric("cer")


In [22]:
from tqdm import tqdm
def manual_eval(model, processor, images):
    """Generate text for each image, one image at a time.

    Args:
        model: Model exposing ``.device`` and ``.generate(pixel_values)``.
        processor: Processor that is callable on an image (returning
            ``.pixel_values``) and provides ``batch_decode``.
        images: Iterable of images; each one is passed through
            ``cv2.COLOR_BGR2RGB``, so BGR channel order is expected.

    Returns:
        NumPy array with one decoded string per input image, in input order.

    Inputs are placed on ``model.device`` rather than on a module-level
    ``device`` variable, so the function works for any model regardless of
    notebook state. Each image is preprocessed and decoded in a single pass,
    so only one input tensor is alive at a time.
    """
    target_device = model.device
    output = []
    for image in tqdm(images):
        rgb_image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
        pixel_values = processor(rgb_image, return_tensors="pt").pixel_values.to(target_device)
        generated_ids = model.generate(pixel_values)
        # One image per call, so the first decoded string is the prediction.
        output.append(processor.batch_decode(generated_ids, skip_special_tokens=True)[0])
    return np.asarray(output)

In [23]:
from tqdm import tqdm

# transformers.AdamW is deprecated in favor of the PyTorch implementation.
# eps and weight_decay are passed explicitly to mirror the old class's
# defaults (1e-6 and 0.0) instead of PyTorch's (1e-8 and 0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

# Iterate NUM_OF_EPOCHS times; with a hard-coded single epoch the
# augmentation branch below could never run.
for epoch in range(NUM_OF_EPOCHS):  # loop over the dataset multiple times
    # The first epoch trains on the preprocessed originals; every later epoch
    # trains on the output of AugmentImage.transform().
    if epoch != 0:
        augmenter = AugmentImage(image_array=train_img_array, label_array=train_label_array, num_augmentations=1)
        train_image_array, train_lbl_array = augmenter.transform()
    else:
        train_image_array = train_img_array
        train_lbl_array = train_label_array
    train_ds = BDC_Dataset(train_image_array, train_lbl_array, processor=processor)
    train_dataloader = DataLoader(train_ds, batch_size=4, shuffle=True)

    # train
    model.train()
    train_loss = 0.0
    for batch in tqdm(train_dataloader):
        # move the inputs to the model's device
        for k, v in batch.items():
            batch[k] = v.to(device)

        # forward + backward + optimize
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        train_loss += loss.item()

    print(f"Loss after epoch {epoch}:", train_loss/len(train_dataloader))

    # evaluate
    model.eval()
    valid_cer = 0.0
    valid_acc = 0.0
    with torch.no_grad():
        # Character accuracy: split each predicted / reference plate into an
        # array of characters, the format char_acc is called with.
        yhat_test = manual_eval(model=model, processor=processor, images = test_img_array)
        tmp_hat_test = [np.asarray(list(x)) for x in yhat_test]
        yhat_test = np.asarray(tmp_hat_test)
        tmp_label_test = [np.asarray(list(x)) for x in test_label_array]
        ylabel_test = np.asarray(tmp_label_test)
        valid_acc = char_acc(ylabel_test, yhat_test)

        # CER: accumulated per batch; the reported value is the mean of the
        # per-batch CERs.
        for batch in tqdm(eval_dataloader):
            # run batch generation
            outputs = model.generate(batch["pixel_values"].to(device))
            # compute metrics
            cer = compute_metrics(pred_ids=outputs, label_ids=batch["labels"])
            valid_cer += cer

    print("Validation CER:", valid_cer / len(eval_dataloader))
    print("Validation Char_acc:", valid_acc)


 95%|█████████▌| 152/160 [11:26<00:36,  4.52s/it]


KeyboardInterrupt: 

In [None]:
# Save under MODEL_NAME so that the evaluation section, which reloads the
# model with from_pretrained(MODEL_NAME), finds the fine-tuned weights
# (saving to "." wrote them to a different location than the one read back).
model.save_pretrained(MODEL_NAME)

In [None]:
# Inspect the per-character prediction arrays produced by the last evaluation pass.
yhat_test

##### Model Evaluate

In [25]:
from tqdm import tqdm
def manual_eval(model, processor, images):
    """Generate text for each image, one image at a time.

    Args:
        model: Model exposing ``.device`` and ``.generate(pixel_values)``.
        processor: Processor that is callable on an image (returning
            ``.pixel_values``) and provides ``batch_decode``.
        images: Iterable of images; each one is passed through
            ``cv2.COLOR_BGR2RGB``, so BGR channel order is expected.

    Returns:
        NumPy array with one decoded string per input image, in input order.

    Inputs are placed on ``model.device`` instead of a hard-coded ``'cpu'``,
    so the function keeps working if the model is moved to an accelerator.
    Each image is preprocessed and decoded in a single pass, so only one
    input tensor is alive at a time.
    """
    target_device = model.device
    output = []
    for image in tqdm(images):
        rgb_image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
        pixel_values = processor(rgb_image, return_tensors="pt").pixel_values.to(target_device)
        generated_ids = model.generate(pixel_values)
        # One image per call, so the first decoded string is the prediction.
        output.append(processor.batch_decode(generated_ids, skip_special_tokens=True)[0])
    return np.asarray(output)

In [42]:
# Reload the fine-tuned model by name for a standalone evaluation.
bdc_model = VisionEncoderDecoderModel.from_pretrained(
    pretrained_model_name_or_path=MODEL_NAME,
)


In [None]:
from metrics import character_accuracy_np

# Train-set scoring, currently disabled:
# yhat_train = manual_eval(model=bdc_model, processor=processor, images = train_image_array)
# tmp_hat = [np.asarray(list(x)) for x in yhat_train]
# yhat_train = np.asarray(tmp_hat)
# print(f"Train dataset char_acc = {character_accuracy_np(train_label_array, yhat_train)}")

# Predict every test plate with the reloaded model.
yhat_test = manual_eval(images=test_img_array, processor=processor, model=bdc_model)

# Split predictions and references into per-character arrays,
# the format character_accuracy_np is called with.
tmp_hat_test = [np.asarray([*plate]) for plate in yhat_test]
tmp_label_test = [np.asarray([*plate]) for plate in test_label_array]
yhat_test = np.asarray(tmp_hat_test)
ylabel_test = np.asarray(tmp_label_test)

print(f"test dataset char_acc = {character_accuracy_np(ylabel_test, yhat_test)}")

### Notes & Other Takeaways From This Project
****
- The results were pretty good. I was pondering whether to train for 2 or 3 epochs. Ultimately, I trained this model for 2 epochs. If this were a work project (where multiprocessing and other options are available), I would have trained for 3, if not 4, epochs.
****

### Citations

##### For Transformer Checkpoint
- @misc{li2021trocr,
      title={TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models}, 
      author={Minghao Li and Tengchao Lv and Lei Cui and Yijuan Lu and Dinei Florencio and Cha Zhang and Zhoujun Li and Furu Wei},
      year={2021},
      eprint={2109.10282},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}

##### For CER Metric
- @inproceedings{morris2004,
author = {Morris, Andrew and Maier, Viktoria and Green, Phil},
year = {2004},
month = {01},
pages = {},
title = {From WER and RIL to MER and WIL: improved evaluation measures for connected speech recognition.}
}