## TrOCR Fine Tune


##### Install Necessary Libraries

In [1]:
# %pip install -q torch torchvision torchaudio
# %pip install -q datasets jiwer

##### Import Necessary Libraries

In [2]:
import os, sys, itertools
os.environ['TOKENIZERS_PARALLELISM']='false'
# export CUDA_LAUNCH_BLOCKING=1
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

import numpy as np

import pandas as pd
from sklearn.model_selection import train_test_split

import cv2
from PIL import Image

import torch
from torch.utils.data import Dataset

import datasets
from datasets import load_dataset

import transformers
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import VisionEncoderDecoderModel, TrOCRProcessor, default_data_collator
from datasets import load_metric


from preprocess_image import PreprocessImage
from augment_image import AugmentImage

##### Ingest, Preprocess, & Split Dataset (into Training & Testing Datasets)

In [3]:
df_train = pd.read_csv('train/DataTrain.csv', delimiter=';')
df_train = df_train.drop(['Unnamed: 0'], axis=1)
df_train = df_train.drop([126, 457, 600]) # delete the wrong labeled data
df_train.head()

Unnamed: 0,Vehicleregistrationplate,NameofFile
0,A7814,DataTrain1.png
1,B1074QO,DataTrain2.png
2,B1031QO,DataTrain3.png
3,B187EDA,DataTrain4.png
4,B1089VD,DataTrain5.png


##### Show First Samples in DataFrame

In [4]:
df_train.head(12)

Unnamed: 0,Vehicleregistrationplate,NameofFile
0,A7814,DataTrain1.png
1,B1074QO,DataTrain2.png
2,B1031QO,DataTrain3.png
3,B187EDA,DataTrain4.png
4,B1089VD,DataTrain5.png
5,B1972RBP,DataTrain6.png
6,AB2400WU,DataTrain7.png
7,AB6268YQ,DataTrain8.png
8,A8014VA,DataTrain9.png
9,B1554EJA,DataTrain10.png


##### Prepare the image and label dataset

###### Prepare Label & Images

In [5]:
label_array = df_train['Vehicleregistrationplate'].to_numpy()
path_train = df_train['NameofFile'].to_numpy()

###### Train val split

In [6]:
train_path_array, test_path_array, train_label_array, test_label_array = train_test_split(path_train, label_array, test_size=0.2, random_state=42)

###### Preprocess Test Images

In [7]:
test_image = PreprocessImage(img_paths=test_path_array, output_pixel=384, padding_ratio=0.9, path_prefix='train', include_original=False)
test_img_array = test_image.img_array.copy()

Initial Preprocess: Add Padding: 100%|██████████| 160/160 [00:00<00:00, 538.31it/s]


###### Preprocess & Augment Train Images

In [8]:
# train_label_array = np.concatenate((train_label_array, train_label_array), axis=0)

In [9]:
train_label_array.shape

(637,)

In [10]:
MAX_LENGTH = 12
DEVICE = 'mps:0'
# DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

##### Create Dataset Class

In [11]:
class BDC_Dataset(Dataset):
    
    def __init__(self, img_array, label_array, processor, max_target_length=MAX_LENGTH):
        self.img_array = img_array
        self.label_array = label_array
        self.processor = processor
        self.max_target_length = max_target_length
    
    def __len__(self):
        return len(self.img_array)

    def __getitem__(self, idx):
        # get iimage + label
        image = self.img_array[idx]
        label = self.label_array[idx]
        # prepare image (i.e. resize + normalize)
        image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
        pixel_values = self.processor(image, return_tensors="pt").pixel_values.to(DEVICE)
        # add labels (input_ids) by encoding the label
        labels = self.processor.tokenizer(label, padding="max_length", max_length=self.max_target_length).input_ids
        # important: make sure that PAD tokens are ignored by the loss function
        labels = [label if label != self.processor.tokenizer.pad_token_id 
                  else -100 for label in labels]
        
        encoding = {"pixel_values" : pixel_values.squeeze(), "labels" : torch.tensor(labels).to(DEVICE)}
        return encoding

##### Basic Values/Constants

In [12]:
MODEL_CKPT = "microsoft/trocr-base-printed"
MODEL_NAME =  MODEL_CKPT.split("/")[-1] + "_bdc_license_plates_ocrV3"
NUM_OF_EPOCHS = 2
BATCH_SIZE = 2

##### Instantiate Processor, Create Training, & Testing Dataset Instances

In [13]:
processor = TrOCRProcessor.from_pretrained(MODEL_CKPT)


test_ds = BDC_Dataset(test_img_array, test_label_array, processor=processor)

Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


##### Print Length of Training & Testing Datasets

In [14]:
# print(f"The training dataset has {len(train_ds)} samples in it.")
print(f"The testing dataset has {len(test_ds)} samples in it.")

The testing dataset has 160 samples in it.


##### Example of Input Data Shapes

In [15]:
# encoding = train_ds[0]

# for k,v in encoding.items():
#     print(k, " : ", v.shape)

##### Show Example

In [16]:
# image = Image.open(train_ds.root_dir + train_dataset['file_name'][0]).convert("RGB")

# image

##### Show Label for Above Example

In [17]:
# labels = encoding['labels']
# labels[labels == -100] = processor.tokenizer.pad_token_id
# label_str = processor.decode(labels, skip_special_tokens=True)
# print(label_str)

#### Instantiate Model

In [18]:
from torch.utils.data import DataLoader


eval_dataloader = DataLoader(test_ds, batch_size=BATCH_SIZE)

In [19]:
model = VisionEncoderDecoderModel.from_pretrained(MODEL_CKPT)


model.to(DEVICE)

Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-base-printed and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


VisionEncoderDecoderModel(
  (encoder): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=False)
              (key): Linear(in_features=768, out_features=768, bias=False)
              (value): Linear(in_features=768, out_features=768, bias=False)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_fea

##### Model Configuration Modifications

In [20]:
model.config.decoder_start_token_id = processor.tokenizer.cls_token_id
model.config.pad_token_id = processor.tokenizer.pad_token_id

model.config.vocab_size = model.config.decoder.vocab_size

model.config.eos_token_id = processor.tokenizer.sep_token_id
model.config.max_length = MAX_LENGTH
model.config.early_stopping = True
model.config.no_repeat_ngram_size = 3
model.config.length_penalty = 2.0
model.config.num_beams = 4

##### Define Metrics Evaluation

In [21]:
from metrics import character_accuracy_np as char_acc 

cer_metric = load_metric("cer")

def compute_metrics(pred_ids, label_ids):

    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id
    label_str = processor.batch_decode(label_ids, skip_special_tokens=True)
    
    cer = cer_metric.compute(predictions=pred_str, references=label_str)
    
    # tmp_hat_test = [np.asarray(list(x)) for x in pred_str]
    # pred_str = np.asarray(tmp_hat_test)
    # tmp_label_test = [np.asarray(list(x)) for x in label_str]
    # label_str = np.asarray(tmp_label_test)
    # acc = char_acc(label_str, pred_str)

    return cer

  cer_metric = load_metric("cer")


In [22]:
from tqdm import tqdm
def manual_eval(model, processor, images):
    output = []
    px_vals = []
    for image in tqdm(images):
        image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
        pixel_values = processor(image, return_tensors="pt").pixel_values.to(DEVICE)
        px_vals.append(pixel_values)
    for pixel_values in tqdm(px_vals):
        generated_ids = model.generate(pixel_values)
        generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        generated_text = generated_text.replace(" ", "") # Remove spacing
        output.append(generated_text)
    return np.asarray(output)

In [23]:
masked_img = {'bw_image': np.array([10, 41, 23, 15, 16, 7, 28, 81, 73, 83, 59, 89, 100, 112, 148, 152, 214, 250, 292, 309, 385, 410, 434, 419, 446, 490, 464, 505, 566]),
              'gray_image': np.array([521]),
              'segment_number': np.array([2, 6, 7, 8, 9, 15, 16, 28, 29, 39, 37, 33, 42, 23, 21, 20, 80, 81, 82, 83, 74, 71, 54, 69, 59, 77, 79, 89, 99, 96, 85, 104, 105, 106, 114, 115, 116, 117, 119, 124, 125, 128, 130, 138, 139, 148, 149, 152, 153, 154, 156, 158, 159, 165, 167, 177, 180, 182, 197, 201, 202, 203, 204, 214, 216, 220, 223, 227, 235, 239, 243, 247, 248, 250, 251, 252, 258, 262, 265, 271, 273, 279, 288, 292, 294, 297, 299, 310, 307, 313, 315, 319, 320, 321, 323, 324, 330, 331, 332, 333, 335, 343, 345, 348, 350, 357, 360, 363, 364, 369, 370, 377, 381, 383, 384, 385, 389, 396, 398, 399, 402, 416, 419, 422, 426, 428, 429, 434, 440, 442, 450, 458, 459, 460, 461, 462, 464, 466, 469, 473, 478, 479, 486, 490, 505, 506, 521, 528, 529, 531, 539, 550, 551, 553, 557, 563, 566, 568, 579, 589, 585, 592, 594, 597, 595, 599, 604, 606, 608, 629, 628]),
              }
train_image = PreprocessImage(img_paths=train_path_array, output_pixel=384, padding_ratio=0.9, path_prefix='train', include_original=False)
# train_image_array = train_image.gray_image(masked_img=np.array([]))
original = train_image.img_array.copy()
original_label = train_label_array.copy()
print(f'Original image shape= {original.shape}, label= {original_label.shape}')
bw_image = train_image.bw_image(masked_img['bw_image'])
bw_image_label = train_label_array.copy()
bw_image_label = np.delete(bw_image_label, masked_img['bw_image'])
print(f'bw_image image shape= {bw_image.shape}, label= {bw_image_label.shape}')
gray_image = train_image.gray_image(masked_img['gray_image'])
gray_image_label = train_label_array.copy()
gray_image_label = np.delete(gray_image_label, masked_img['gray_image'])
print(f'gray_image image shape= {gray_image.shape}, label= {gray_image_label.shape}')
segment_number = train_image.segment_number_image(masked_img['segment_number'])
segment_number_label = train_label_array.copy()
segment_number_label = np.delete(segment_number_label, masked_img['segment_number'])
print(f'segment_number image shape= {segment_number.shape}, label= {segment_number_label.shape}')


Initial Preprocess: Add Padding: 100%|██████████| 637/637 [00:00<00:00, 777.35it/s]


Original image shape= (637, 384, 384, 3), label= (637,)


Preprocess: Convert to BW & High Contrast Image: 100%|██████████| 637/637 [00:00<00:00, 4252.54it/s]


bw_image image shape= (608, 384, 384, 3), label= (608,)


Preprocess: Convert to Grayscale Image: 100%|██████████| 637/637 [00:00<00:00, 7503.02it/s]


gray_image image shape= (636, 384, 384, 3), label= (636,)


Preprocess: Segment Number Image: 100%|██████████| 637/637 [00:00<00:00, 733.27it/s]

segment_number image shape= (466, 384, 384, 3), label= (466,)





In [24]:
train_image_list = [bw_image, gray_image, segment_number]
train_label_list = [bw_image_label, gray_image_label, segment_number_label]

In [26]:
from transformers import AdamW
from tqdm import tqdm

optimizer = AdamW(model.parameters(), lr=5e-5)

max_epoch = 2

for epoch in range(max_epoch):  # loop over the dataset multiple times
    
    # Generate augmented image data for each epoch
    if epoch>=0:
        augmenter = AugmentImage(image_array=original, label_array=original_label, num_augmentations=1)
        augment_original, augment_original_label = augmenter.transform()
        # new_train_image_list = train_image_list.copy()
        # new_train_label_list = train_label_list.copy()
        # for i in range(len(train_image_list)):
        #     augmenter = AugmentImage(image_array=train_image_list[i], label_array=train_label_list[i], num_augmentations=2+epoch)
        #     tmp_array, tmp_array_label = augmenter.transform()
        #     random_indices = np.random.choice(train_image_list[i].shape[0], size=train_image_list[i].shape[0]//(2*(epoch+2)), replace=False)
        #     new_train_image_list[i] = tmp_array[random_indices]
        #     new_train_label_list[i] = tmp_array_label[random_indices]
        # new_train_image_list.append(augment_original)
        # new_train_label_list.append(augment_original_label)
        # train_image_array = np.concatenate(new_train_image_list, axis=0)
        # train_lbl_array = np.concatenate(new_train_label_list, axis=0)
        # augmenter = AugmentImage(image_array=train_img_array, label_array=train_label_array, num_augmentations=2)
        # train_image_array, train_lbl_array = augmenter.transform()
    # elif epoch < 3:
    #     train_img_array = np.concatenate((train_image_list[epoch], original), axis=0)
    #     train_label_array = np.concatenate((train_label_list[epoch], original_label), axis=0)
    #     augmenter = AugmentImage(image_array=train_img_array, label_array=train_label_array, num_augmentations=2)
    #     train_image_array, train_lbl_array = augmenter.transform()
    # else:
    #     augmenter = AugmentImage(image_array=original, label_array=original_label, num_augmentations=2)
    #     train_image_array, train_lbl_array = augmenter.transform()
    train_ds = BDC_Dataset(augment_original, augment_original_label, processor=processor)
    train_dataloader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
    # train
    model.train()
    train_loss = 0.0
    for idx, batch in enumerate(tqdm(train_dataloader, desc=f'Epoch: {epoch+1}/{max_epoch}')):
        # get the inputs
        for k,v in batch.items():
            batch[k] = v.to(DEVICE)

        # forward + backward + optimize
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        train_loss += loss.item()
        
        if idx % (len(train_dataloader)//10) == 0 and idx != 0:
            print(f"Loss after batch {idx}:", train_loss/idx)

    print(f"Loss after epoch {epoch}:", train_loss/len(train_dataloader))
    
    # evaluate
    model.eval()
    valid_cer = 0.0
    valid_acc = 0.0
    with torch.no_grad():
        yhat_test = manual_eval(model=model, processor=processor, images = test_img_array)
        tmp_hat_test = [np.asarray(list(x)) for x in yhat_test]
        yhat_test = np.asarray(tmp_hat_test)
        tmp_label_test = [np.asarray(list(x)) for x in test_label_array]
        ylabel_test = np.asarray(tmp_label_test)
        valid_acc = char_acc(ylabel_test, yhat_test)

        for batch in tqdm(eval_dataloader):
            # run batch generation
            outputs = model.generate(batch["pixel_values"].to(DEVICE))
            # compute metrics
            cer = compute_metrics(pred_ids=outputs, label_ids=batch["labels"])
            valid_cer += cer 
        

    print("Validation CER:", valid_cer / len(eval_dataloader))
    print("Validation Char_acc:", valid_acc)


Augement Images: 100%|██████████| 637/637 [00:07<00:00, 79.99it/s]
Epoch: 1/2:  10%|█         | 32/319 [00:56<08:25,  1.76s/it]

Loss after batch 31: 2.528225202714243


Epoch: 1/2:  20%|█▉        | 63/319 [01:47<06:55,  1.62s/it]

Loss after batch 62: 2.211847561021005


Epoch: 1/2:  29%|██▉       | 94/319 [02:44<06:20,  1.69s/it]

Loss after batch 93: 2.0924722597163212


Epoch: 1/2:  39%|███▉      | 125/319 [03:39<05:59,  1.85s/it]

Loss after batch 124: 2.0330332575305814


Epoch: 1/2:  49%|████▉     | 156/319 [04:30<04:23,  1.62s/it]

Loss after batch 155: 1.990711002196035


Epoch: 1/2:  59%|█████▊    | 187/319 [05:21<03:44,  1.70s/it]

Loss after batch 186: 1.951682622073799


Epoch: 1/2:  68%|██████▊   | 218/319 [06:12<02:47,  1.66s/it]

Loss after batch 217: 1.9319304008088354


Epoch: 1/2:  78%|███████▊  | 249/319 [07:05<02:02,  1.76s/it]

Loss after batch 248: 1.9268098050548184


Epoch: 1/2:  88%|████████▊ | 280/319 [08:01<01:07,  1.72s/it]

Loss after batch 279: 1.9042944433868572


Epoch: 1/2:  97%|█████████▋| 311/319 [09:00<00:14,  1.80s/it]

Loss after batch 310: 1.8943292779307213


Epoch: 1/2: 100%|██████████| 319/319 [09:14<00:00,  1.74s/it]


Loss after epoch 0: 1.8886803235379879


100%|██████████| 160/160 [00:01<00:00, 106.89it/s]
  input_ids = input_ids.repeat_interleave(expand_size, dim=0)
  sent_lengths_max = sent_lengths.max().item() + 1
100%|██████████| 160/160 [02:59<00:00,  1.12s/it]
  ylabel_test = np.asarray(tmp_label_test)


ValueError: could not broadcast input array from shape (9,) into shape (6,)

In [27]:
yhat_test


array([['B', '1', '8', '5', '8', 'S'],
       ['B', '1', '8', '5', '8', 'S'],
       ['B', '1', '8', '5', '8', 'S'],
       ['B', '1', '8', '5', '8', 'S'],
       ['B', '1', '8', '5', '8', 'S'],
       ['B', '1', '8', '5', '8', 'S'],
       ['B', '1', '8', '5', '8', 'S'],
       ['B', '1', '6', '5', '8', 'S'],
       ['B', '1', '8', '5', '8', 'S'],
       ['B', '1', '8', '5', '8', 'S'],
       ['B', '1', '6', '5', '8', 'S'],
       ['B', '1', '8', '5', '8', 'S'],
       ['B', '1', '8', '5', '8', 'S'],
       ['B', '1', '8', '5', '8', 'S'],
       ['B', '1', '8', '5', '8', 'S'],
       ['B', '1', '8', '5', '8', 'S'],
       ['B', '1', '8', '5', '8', 'S'],
       ['B', '1', '8', '5', '8', 'S'],
       ['B', '1', '8', '5', '8', 'S'],
       ['B', '1', '8', '5', '8', 'S'],
       ['B', '1', '8', '5', '8', 'S'],
       ['B', '1', '8', '5', '8', 'S'],
       ['B', '1', '8', '5', '8', 'S'],
       ['B', '1', '8', '5', '8', 'S'],
       ['B', '1', '8', '5', '8', 'S'],
       ['B', '1', '8', '5

In [26]:
model.save_pretrained(MODEL_NAME)

In [37]:

processor.tokenizer(augment_original_label[1], padding="max_length", max_length=12).input_ids

[0, 387, 1225, 846, 1889, 2, 1, 1, 1, 1, 1, 1]

In [27]:
del model

In [35]:
augment_original_label[2]

'B1472PQH'

##### Model Evaluate

In [26]:
from tqdm import tqdm
def manual_eval(model, processor, images):
    output = []
    px_vals = []
    for image in tqdm(images):
        image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
        pixel_values = processor(image, return_tensors="pt").pixel_values.to(device)
        px_vals.append(pixel_values)
    for pixel_values in tqdm(px_vals):
        generated_ids = model.generate(pixel_values)
        generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        output.append(generated_text)
    return np.asarray(output)

In [27]:
# bdc_model = VisionEncoderDecoderModel.from_pretrained(MODEL_NAME)
# 

In [24]:
from metrics import character_accuracy_np

# yhat_train = manual_eval(model=bdc_model, processor=processor, images = train_image_array)
# tmp_hat = [np.asarray(list(x)) for x in yhat_train]
# yhat_train = np.asarray(tmp_hat)
# print(f"Train dataset char_acc = {character_accuracy_np(train_label_array, yhat_train)}")

yhat_test = manual_eval(model=model, processor=processor, images = test_img_array)
tmp_hat_test = [np.asarray(list(x)) for x in yhat_test]
yhat_test = np.asarray(tmp_hat_test)
tmp_label_test = [np.asarray(list(x)) for x in test_label_array]
ylabel_test = np.asarray(tmp_label_test)
print(f"test dataset char_acc = {character_accuracy_np(ylabel_test, yhat_test)}")

100%|██████████| 160/160 [00:01<00:00, 151.92it/s]
100%|██████████| 160/160 [01:09<00:00,  2.30it/s]

test dataset char_acc = 0.920138888888889



  yhat_test = np.asarray(tmp_hat_test)
  ylabel_test = np.asarray(tmp_label_test)


In [24]:
df_gt = pd.read_csv('ground_truth_manual.csv')
gt = df_gt['Ground Truth'].to_numpy()
gt_test_path_array = df_gt['Name of File'].to_numpy()

gt_test_image = PreprocessImage(img_paths=gt_test_path_array, output_pixel=384, padding_ratio=0.9, path_prefix='test', include_original=False)
gt_test_img_array = gt_test_image.img_array.copy()
yhat_test = manual_eval(model=model, processor=processor, images = gt_test_img_array)
tmp_hat_test = [np.asarray(list(x)) for x in yhat_test]
yhat_test = np.asarray(tmp_hat_test)
tmp_label_test = [np.asarray(list(x)) for x in gt]
ylabel_test = np.asarray(tmp_label_test)
print(f"test dataset compared to GroundTruth char_acc = {char_acc(ylabel_test, yhat_test)}")


Initial Preprocess: Add Padding: 100%|██████████| 100/100 [00:00<00:00, 780.82it/s]
100%|██████████| 100/100 [00:00<00:00, 157.33it/s]
100%|██████████| 100/100 [00:31<00:00,  3.16it/s]

test dataset compared to GroundTruth char_acc = 0.8811111111111112



  yhat_test = np.asarray(tmp_hat_test)
  ylabel_test = np.asarray(tmp_label_test)


In [26]:
del model

### Notes & Other Takeaways From This Project
****
- The results were pretty good. I was pondering whether to train for 2 or 3 epochs. Ultimately, I trained this model for 2 epochs. If this were a work project (where multiprocessing and other options are available), I would have trained for 3, if not 4, epochs.
****

### Citations

##### For Transformer Checkpoint
- @misc{li2021trocr,
      title={TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models}, 
      author={Minghao Li and Tengchao Lv and Lei Cui and Yijuan Lu and Dinei Florencio and Cha Zhang and Zhoujun Li and Furu Wei},
      year={2021},
      eprint={2109.10282},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}

##### For CER Metric
- @inproceedings{morris2004,
author = {Morris, Andrew and Maier, Viktoria and Green, Phil},
year = {2004},
month = {01},
pages = {},
title = {From WER and RIL to MER and WIL: improved evaluation measures for connected speech recognition.}
}