## TrOCR Fine Tune


##### Install Necessary Libraries

In [None]:
# %pip install -q torch torchvision torchaudio
# %pip install -q datasets jiwer

##### Import Necessary Libraries

In [None]:
import os, sys, itertools
os.environ['TOKENIZERS_PARALLELISM']='false'
# export CUDA_LAUNCH_BLOCKING=1
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

import numpy as np

import pandas as pd
from sklearn.model_selection import train_test_split

import cv2
from PIL import Image

import torch
from torch.utils.data import Dataset

import datasets
from datasets import load_dataset

import transformers
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import VisionEncoderDecoderModel, TrOCRProcessor, default_data_collator
from datasets import load_metric


from preprocess_image import PreprocessImage
from augment_image import AugmentImage

##### Ingest, Preprocess, & Split Dataset (into Training & Testing Datasets)

In [None]:
# Read the semicolon-separated training index, drop the leftover index column,
# then drop the rows known to be labeled incorrectly.
df_train = (
    pd.read_csv('train/DataTrain.csv', delimiter=';')
    .drop(['Unnamed: 0'], axis=1)
    .drop([126, 457, 600])  # delete the wrong labeled data
)
df_train.head()

##### Show First Samples in DataFrame

In [None]:
# Display the first 12 rows of the cleaned training DataFrame.
df_train.head(12)

##### Prepare the image and label dataset

###### Prepare Label & Images

In [None]:
# Pull the label column and the image file-name column out as NumPy arrays;
# both keep the DataFrame's row order, so index i refers to the same sample.
label_array = df_train['Vehicleregistrationplate'].to_numpy()
path_train = df_train['NameofFile'].to_numpy()

###### Train/test split

In [None]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible.
(
    train_path_array,
    test_path_array,
    train_label_array,
    test_label_array,
) = train_test_split(
    path_train,
    label_array,
    test_size=0.2,
    random_state=42,
)

###### Preprocess Test Images

In [None]:
# Run the held-out file names through the project's PreprocessImage helper.
# NOTE(review): the exact meaning of output_pixel / padding_ratio / include_original
# is defined in preprocess_image.py — confirm there.
test_image = PreprocessImage(img_paths=test_path_array, output_pixel=384, padding_ratio=0.9, path_prefix='train', include_original=False)
# Keep an independent copy of the helper's image array for the test dataset.
test_img_array = test_image.img_array.copy()

###### Preprocess & Augment Train Images

In [None]:
# Same preprocessing as the test images, but with include_original=True.
# NOTE(review): presumably this keeps the unprocessed image alongside the processed
# one — verify in preprocess_image.py, since it affects how many images come back.
train_image = PreprocessImage(img_paths=train_path_array, output_pixel=384, padding_ratio=0.9, path_prefix='train', include_original=True)
# train_image_array = train_image.gray_image(masked_img=np.array([]))
# Keep an independent copy of the helper's image array and report its shape.
train_img_array = train_image.img_array.copy()
print(train_img_array.shape)

In [None]:
# train_label_array = np.concatenate((train_label_array, train_label_array), axis=0)

In [None]:
# Inspect the label array's shape before augmentation (compare with the image array shape printed above).
train_label_array.shape

In [None]:
# Augment the training images with the project's AugmentImage helper.
# transform() returns the image array and the label array together, and
# train_label_array is rebound to the returned labels.
# NOTE(review): how num_augmentations=5 changes the sample count is defined in augment_image.py — confirm there.
augmenter = AugmentImage(image_array=train_img_array, label_array=train_label_array, num_augmentations=5)
train_image_array, train_label_array = augmenter.transform()
# Sanity check: both arrays should describe the same number of samples.
print(train_image_array.shape)
print(train_label_array.shape)

##### Create Dataset Class

In [None]:
class BDC_Dataset(Dataset):
    """Map-style dataset pairing in-memory images with their text labels.

    Each item is a dict with two tensors:

    * ``pixel_values`` -- the processor's image tensor with the leading
      batch axis removed.
    * ``labels`` -- the tokenized label padded to ``max_target_length``,
      with every PAD position replaced by -100 so the loss ignores it.

    Tensors are left on the CPU unless ``device`` is given. The training
    loop (e.g. the Hugging Face ``Trainer``) is expected to move batches to
    its own device, so hard-coding an accelerator here would break on
    machines without it and would fight the trainer's device settings.

    Args:
        img_array: Indexable collection of images. Each image is converted
            with ``cv2.COLOR_BGR2RGB``, so it is assumed to be a 3-channel
            BGR array -- TODO confirm against the preprocessing helper.
        label_array: Indexable collection of label strings, aligned with
            ``img_array`` by position.
        processor: Object that is callable on a PIL image (returning
            ``.pixel_values``) and that exposes ``.tokenizer``.
        max_target_length: Length the tokenized label is padded to.
        device: Optional torch device (or device string) to move each
            item's tensors to. ``None`` (default) leaves them on the CPU.
    """

    def __init__(self, img_array, label_array, processor, max_target_length=12, device=None):
        self.img_array = img_array
        self.label_array = label_array
        self.processor = processor
        self.max_target_length = max_target_length
        self.device = device

    def __len__(self):
        """Return the number of images (and therefore samples)."""
        return len(self.img_array)

    def __getitem__(self, idx):
        """Build the model inputs for sample ``idx``.

        Returns:
            dict with ``"pixel_values"`` and ``"labels"`` tensors.
        """
        # get image + label
        image = self.img_array[idx]
        text = self.label_array[idx]

        # prepare image (i.e. resize + normalize); the processor receives an RGB PIL image
        rgb_image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
        # squeeze(0) drops only the batch axis added by return_tensors="pt"
        pixel_values = self.processor(rgb_image, return_tensors="pt").pixel_values.squeeze(0)

        # add labels (input_ids) by encoding the label
        token_ids = self.processor.tokenizer(
            text, padding="max_length", max_length=self.max_target_length
        ).input_ids
        # important: make sure that PAD tokens are ignored by the loss function
        pad_id = self.processor.tokenizer.pad_token_id
        labels = torch.tensor([tok if tok != pad_id else -100 for tok in token_ids])

        if self.device is not None:
            pixel_values = pixel_values.to(self.device)
            labels = labels.to(self.device)

        return {"pixel_values": pixel_values, "labels": labels}

##### Basic Values/Constants

In [None]:
# Pretrained checkpoint that is fine-tuned in this notebook.
MODEL_CKPT = "microsoft/trocr-base-printed"
# Output name: the checkpoint's basename (text after the last "/") plus a task suffix.
MODEL_NAME = MODEL_CKPT.rpartition("/")[2] + "_bdc_license_plates_ocr"
# Number of training epochs.
NUM_OF_EPOCHS = 2

##### Instantiate Processor, Create Training, & Testing Dataset Instances

In [None]:
# Load the processor that belongs to the chosen checkpoint.
processor = TrOCRProcessor.from_pretrained(MODEL_CKPT)

# Training set: augmented images with their (augmented) labels.
train_ds = BDC_Dataset(train_image_array, train_label_array, processor=processor)

# Test set: preprocessed hold-out images with their original labels (no augmentation).
test_ds = BDC_Dataset(test_img_array, test_label_array, processor=processor)

##### Print Length of Training & Testing Datasets

In [None]:
# Report how many samples each dataset exposes through __len__.
print(f"The training dataset has {len(train_ds)} samples in it.")
print(f"The testing dataset has {len(test_ds)} samples in it.")

##### Example of Input Data Shapes

In [None]:
# Fetch the first training sample and print the shape of every tensor in it.
encoding = train_ds[0]

for k,v in encoding.items():
    print(k, " : ", v.shape)

##### Show Example

In [None]:
# image = Image.open(train_ds.root_dir + train_dataset['file_name'][0]).convert("RGB")

# image

##### Show Label for Above Example

In [None]:
# Decode the example's label ids back to text.
labels = encoding['labels']
# The -100 positions (loss-ignore markers set by the dataset) are turned back into
# PAD ids so they can be decoded; note this writes into the tensor in place.
labels[labels == -100] = processor.tokenizer.pad_token_id
label_str = processor.decode(labels, skip_special_tokens=True)
print(label_str)

#### Instantiate Model

In [None]:
# Load the pretrained vision encoder-decoder weights for the chosen checkpoint.
model = VisionEncoderDecoderModel.from_pretrained(MODEL_CKPT)

##### Model Configuration Modifications

In [None]:
# Special tokens: decoding starts from the tokenizer's CLS token and pads with its PAD token.
model.config.decoder_start_token_id = processor.tokenizer.cls_token_id
model.config.pad_token_id = processor.tokenizer.pad_token_id

# Mirror the decoder's vocabulary size on the top-level config.
model.config.vocab_size = model.config.decoder.vocab_size

# Generation settings: stop at the tokenizer's SEP token, cap output at 12 tokens
# (the same value as BDC_Dataset's default max_target_length), and use 4-beam search
# with early stopping, a 3-gram repetition block, and a length penalty of 2.0.
model.config.eos_token_id = processor.tokenizer.sep_token_id
model.config.max_length = 12
model.config.early_stopping = True
model.config.no_repeat_ngram_size = 3
model.config.length_penalty = 2.0
model.config.num_beams = 4

##### Define Metrics Evaluation

In [None]:
from metrics import character_accuracy_np as char_acc 

cer_metric = load_metric("cer")

def compute_metrics(pred):
    """Compute character accuracy and CER for one evaluation pass.

    Args:
        pred: Prediction object exposing ``predictions`` (generated token
            ids) and ``label_ids`` (reference token ids, with -100 marking
            ignored positions).

    Returns:
        dict with ``"char_acc"`` (result of the project's
        ``character_accuracy_np``) and ``"cer"`` (character error rate
        from ``cer_metric``).
    """
    pad_id = processor.tokenizer.pad_token_id

    # -100 is not a valid token id, so map it back to PAD before decoding.
    # np.where returns new arrays, leaving the caller's prediction object untouched.
    pred_ids = np.asarray(pred.predictions)
    pred_ids = np.where(pred_ids == -100, pad_id, pred_ids)
    label_ids = np.asarray(pred.label_ids)
    label_ids = np.where(label_ids == -100, pad_id, label_ids)

    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.batch_decode(label_ids, skip_special_tokens=True)

    cer = cer_metric.compute(predictions=pred_str, references=label_str)

    # character_accuracy_np is fed arrays of per-character arrays.
    # NOTE(review): np.asarray over strings of unequal length builds a ragged
    # array — confirm the metric helper copes with that.
    pred_chars = np.asarray([np.asarray(list(text)) for text in pred_str])
    label_chars = np.asarray([np.asarray(list(text)) for text in label_str])

    # Use a distinct local name: assigning to `char_acc` here would make it a
    # local variable and the call itself would raise UnboundLocalError.
    accuracy = char_acc(label_chars, pred_chars)

    return {"char_acc": accuracy, "cer": cer}

##### Define Training Arguments

In [None]:
# Training configuration for the Seq2SeqTrainer.
args = Seq2SeqTrainingArguments(
    # Checkpoints and metrics are written under a directory named after the model.
    output_dir = MODEL_NAME,
    num_train_epochs=NUM_OF_EPOCHS,
    # Evaluate with generate() so metrics are computed on generated sequences.
    predict_with_generate=True,
    # Evaluate and save once per epoch.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_first_step=True,
    # hub_private_repo=True,
    # push_to_hub=True,
    # Accelerators are switched off here: MPS disabled, CUDA disabled, no fp16.
    use_mps_device=False,
    fp16=False,
    no_cuda=True
)

##### Define Trainer

In [None]:
# Build the trainer. The model is passed as-is: the Trainer places it on the
# device selected by `args`, so forcing it onto 'mps:0' here would contradict
# those settings and fail on machines without an MPS backend.
trainer = Seq2SeqTrainer(
    model=model,
    # The image feature extractor is passed as the "tokenizer" argument.
    tokenizer=processor.feature_extractor,
    args=args,
    compute_metrics=compute_metrics,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    data_collator=default_data_collator,
)

In [None]:
# !huggingface-cli login  # authenticate interactively; never store an access token in the notebook (revoke any token that was committed here)

##### Fit/Train Model

In [None]:
# Run fine-tuning; the returned object carries the training metrics used below.
train_results = trainer.train()

##### Save Model & Training Metrics

In [None]:
# Persist the fine-tuned model, then log and save the training metrics and trainer state.
trainer.save_model()
trainer.log_metrics("train", train_results.metrics)
trainer.save_metrics("train", train_results.metrics)
trainer.save_state()

##### Evaluate Model

In [None]:
# Evaluate with the trainer (it was built with test_ds as eval_dataset), then log and save the metrics.
metrics = trainer.evaluate()
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)

##### Push Model to Hub (My Profile!!!)

In [None]:
# kwargs = {
#     "finetuned_from" : model.config._name_or_path,
#     "tasks" : "image-to-text",
#     "tags" : ["image-to-text"],
# }

# if args.push_to_hub:
#     trainer.push_to_hub("All Dunn!!!")
# else:
#     trainer.create_model_card(**kwargs)

##### Manual Model Evaluation

In [None]:
def manual_eval(trainer, processor, dataset):
    """Predict on ``dataset`` with the trainer and decode every prediction to text.

    Args:
        trainer: Object whose ``predict(dataset)`` returns a result with a
            ``predictions`` matrix of token ids (one row per sample).
        processor: Object providing ``batch_decode`` and ``tokenizer.pad_token_id``.
        dataset: Dataset forwarded unchanged to ``trainer.predict``.

    Returns:
        np.ndarray of decoded strings, one per sample, in dataset order.
    """
    output = trainer.predict(dataset)

    # Rows may be padded with -100, which is not a valid token id; swap it for
    # the PAD id so decoding cannot fail on it.
    pred_ids = np.asarray(output.predictions)
    pred_ids = np.where(pred_ids == -100, processor.tokenizer.pad_token_id, pred_ids)

    # Decode the whole matrix at once. Calling batch_decode on a single row
    # treats each token id as its own sequence, so taking [0] would return
    # only the first token of every prediction.
    texts = processor.batch_decode(pred_ids, skip_special_tokens=True)
    return np.asarray(texts)

In [None]:
from metrics import character_accuracy_np as char_acc 

# yhat_train = manual_eval(trainer=trainer, processor=processor, dataset = train_ds)
# print(f"Train dataset char_acc = {char_acc(train_label_array, yhat_train)}")

# Character accuracy on the test set using trainer-based predictions.
# NOTE(review): a later cell converts both predictions and labels into per-character
# arrays before calling character_accuracy_np, whereas this cell passes the raw
# string arrays — confirm which input format the metric expects.
yhat_test = manual_eval(trainer=trainer, processor=processor, dataset = test_ds)
print(f"test dataset char_acc = {char_acc(test_label_array, yhat_test)}")

In [None]:
from tqdm import tqdm
def manual_eval(model, processor, images):
    """Generate text for each image with ``model`` and return the decoded strings.

    Works in two passes, each with its own progress bar: first every image is
    converted (BGR -> RGB -> PIL) and turned into a CPU pixel tensor by the
    processor, then the model generates and decodes one image at a time.

    Args:
        model: Model exposing ``generate(pixel_values)``.
        processor: Callable on a PIL image (returning ``.pixel_values``) that
            also provides ``batch_decode``.
        images: Iterable of images accepted by ``cv2.cvtColor(..., COLOR_BGR2RGB)``.

    Returns:
        np.ndarray with one decoded string per input image, in input order.
    """
    # Pass 1: preprocess every image up front.
    pixel_batches = [
        processor(
            Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)),
            return_tensors="pt",
        ).pixel_values.to('cpu')
        for frame in tqdm(images)
    ]

    # Pass 2: generate and decode; each batch holds a single image, hence [0].
    texts = []
    for batch in tqdm(pixel_batches):
        token_ids = model.generate(batch)
        decoded = processor.batch_decode(token_ids, skip_special_tokens=True)
        texts.append(decoded[0])
    return np.asarray(texts)

In [None]:
# Reload the fine-tuned model from the MODEL_NAME directory (the trainer's output_dir).
bdc_model = VisionEncoderDecoderModel.from_pretrained(MODEL_NAME)


In [None]:
from metrics import character_accuracy_np

# yhat_train = manual_eval(model=bdc_model, processor=processor, images = train_image_array)
# tmp_hat = [np.asarray(list(x)) for x in yhat_train]
# yhat_train = np.asarray(tmp_hat)
# print(f"Train dataset char_acc = {character_accuracy_np(train_label_array, yhat_train)}")

# Predict text for every test image with the reloaded model.
yhat_test = manual_eval(model=bdc_model, processor=processor, images = test_img_array)
# Split each predicted string into an array of single characters.
tmp_hat_test = [np.asarray(list(x)) for x in yhat_test]
yhat_test = np.asarray(tmp_hat_test)
# Do the same for the ground-truth labels so both sides share one format.
tmp_label_test = [np.asarray(list(x)) for x in test_label_array]
ylabel_test = np.asarray(tmp_label_test)
# character_accuracy_np is called as (labels, predictions).
print(f"test dataset char_acc = {character_accuracy_np(ylabel_test, yhat_test)}")

### Notes & Other Takeaways From This Project
****
- The results were pretty good. I was pondering whether to train for 2 or 3 epochs. Ultimately, I trained this model for 2 epochs. If this were a work project (where multiprocessing and other options are available), I would have trained for 3, if not 4, epochs.
****

### Citations

##### For Transformer Checkpoint
- @misc{li2021trocr,
      title={TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models}, 
      author={Minghao Li and Tengchao Lv and Lei Cui and Yijuan Lu and Dinei Florencio and Cha Zhang and Zhoujun Li and Furu Wei},
      year={2021},
      eprint={2109.10282},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}

##### For CER Metric
- @inproceedings{morris2004,
author = {Morris, Andrew and Maier, Viktoria and Green, Phil},
year = {2004},
month = {01},
pages = {},
title = {From WER and RIL to MER and WIL: improved evaluation measures for connected speech recognition.}
}