## TrOCR Fine Tune


##### Install Necessary Libraries

In [1]:
# Uncomment on a fresh environment to install the dependencies into the kernel's environment.
# NOTE(review): the import cell also needs transformers, scikit-learn, pandas, opencv-python
# and pillow, which are not installed here; consider pinning versions for reproducibility.
# %pip install -q torch torchvision torchaudio
# %pip install -q datasets jiwer

##### Import Necessary Libraries

In [2]:
import os, sys, itertools
os.environ['TOKENIZERS_PARALLELISM']='false'
# export CUDA_LAUNCH_BLOCKING=1
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

import numpy as np

import pandas as pd
from sklearn.model_selection import train_test_split

import cv2
from PIL import Image

import torch
from torch.utils.data import Dataset

import datasets
from datasets import load_dataset

import transformers
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import VisionEncoderDecoderModel, TrOCRProcessor, default_data_collator
from datasets import load_metric


from preprocess_image import PreprocessImage
from augment_image import AugmentImage

  from .autonotebook import tqdm as notebook_tqdm


##### Ingest, Preprocess, & Split Dataset (into Training & Testing Datasets)

In [3]:
# Read the semicolon-separated label sheet, discard the leftover index column,
# then remove the three rows (by index label) known to carry wrong labels.
df_train = (
    pd.read_csv('train/DataTrain.csv', sep=';')
    .drop(columns=['Unnamed: 0'])
    .drop(index=[126, 457, 600])
)
df_train.head()

Unnamed: 0,Vehicleregistrationplate,NameofFile
0,A7814,DataTrain1.png
1,B1074QO,DataTrain2.png
2,B1031QO,DataTrain3.png
3,B187EDA,DataTrain4.png
4,B1089VD,DataTrain5.png


##### Show First Samples in DataFrame

In [4]:
# Display the first 12 rows of the cleaned label sheet as a quick visual check.
df_train.head(12)

Unnamed: 0,Vehicleregistrationplate,NameofFile
0,A7814,DataTrain1.png
1,B1074QO,DataTrain2.png
2,B1031QO,DataTrain3.png
3,B187EDA,DataTrain4.png
4,B1089VD,DataTrain5.png
5,B1972RBP,DataTrain6.png
6,AB2400WU,DataTrain7.png
7,AB6268YQ,DataTrain8.png
8,A8014VA,DataTrain9.png
9,B1554EJA,DataTrain10.png


##### Prepare the image and label dataset

###### Prepare Label & Images

In [5]:
# Pull the plate text and the image file name out of the frame as NumPy arrays
# (row order is shared, so index i in one array matches index i in the other).
label_array, path_train = (
    df_train[column].to_numpy()
    for column in ('Vehicleregistrationplate', 'NameofFile')
)

###### Train/Test Split

In [6]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible.
(
    train_path_array,
    test_path_array,
    train_label_array,
    test_label_array,
) = train_test_split(
    path_train,
    label_array,
    test_size=0.2,
    random_state=42,
)

###### Preprocess Test Images

In [7]:
# Load and pad the held-out images with the project's PreprocessImage helper.
# NOTE(review): semantics of padding_ratio / include_original live in preprocess_image.py — confirm there.
test_image = PreprocessImage(
    img_paths=test_path_array,
    path_prefix='train',
    output_pixel=384,
    padding_ratio=0.9,
    include_original=False,
)
# Work on a copy so later cells cannot alter the array held by the helper object.
test_img_array = test_image.img_array.copy()

Initial Preprocess: Add Padding: 100%|██████████| 160/160 [00:00<00:00, 724.86it/s]


###### Preprocess & Augment Train Images

In [8]:
# Same padding pipeline as the test images, but with include_original=True.
train_image = PreprocessImage(
    img_paths=train_path_array,
    path_prefix='train',
    output_pixel=384,
    padding_ratio=0.9,
    include_original=True,
)
# An empty masked_img is passed explicitly to the grayscale step.
# NOTE(review): presumably this makes gray_image fall back to the padded images — confirm in preprocess_image.py.
train_image_array = train_image.gray_image(masked_img=np.array([]))
print(train_image_array.shape)

Initial Preprocess: Add Padding: 100%|██████████| 637/637 [00:00<00:00, 725.65it/s]
Preprocess: Convert to Grayscale Image: 100%|██████████| 637/637 [00:00<00:00, 3057.75it/s]


(1274, 384, 384, 3)


In [9]:
# The grayscale step returned twice as many images as there are training paths
# (637 paths -> 1274 images), so the labels are repeated once to match.
# NOTE(review): this assumes the image array is laid out as two consecutive
# blocks in the same sample order — confirm against PreprocessImage.gray_image.
#
# The guard makes the cell idempotent: re-running it (or running it after the
# labels already match the images) no longer doubles the labels a second time.
if len(train_label_array) < len(train_image_array):
    train_label_array = np.concatenate((train_label_array, train_label_array), axis=0)

assert len(train_label_array) == len(train_image_array), (
    f"labels ({len(train_label_array)}) and images ({len(train_image_array)}) are misaligned"
)

In [10]:
# Sanity check: the label count should equal the number of training images printed above.
train_label_array.shape

(1274,)

In [11]:
# Expand the training set with the project's AugmentImage helper; transform()
# hands back the enlarged image array together with the matching labels.
augmenter = AugmentImage(
    image_array=train_image_array,
    label_array=train_label_array,
    num_augmentations=3,
)
train_image_array, train_label_array = augmenter.transform()
print(train_image_array.shape, train_label_array.shape, sep="\n")

  warn_deprecated(msg, stacklevel=3)
Augement Images: 100%|██████████| 1274/1274 [00:38<00:00, 33.45it/s]


(5096, 384, 384, 3)
(5096,)


##### Create Dataset Class

In [12]:
class BDC_Dataset(Dataset):
    """Map-style dataset that pairs in-memory plate images with their text labels.

    Args:
        img_array: Indexable collection of images. Each item is converted with
            ``cv2.COLOR_BGR2RGB`` (assumes BGR channel order as produced by
            OpenCV — TODO confirm against the preprocessing helper).
        label_array: Indexable collection of label strings, aligned with
            ``img_array`` (item ``i`` labels image ``i``).
        processor: Object that is callable on a PIL image (returning
            ``pixel_values``) and exposes a ``tokenizer`` attribute, e.g. a
            ``TrOCRProcessor``.
        max_target_length: Fixed number of label tokens; shorter labels are
            padded and longer ones truncated to this length.
        device: Optional device to move the returned tensors to. The default
            (``None``) leaves them on the CPU so that the DataLoader/Trainer
            decides on placement.

    Raises:
        ValueError: If ``img_array`` and ``label_array`` differ in length.
    """

    def __init__(self, img_array, label_array, processor, max_target_length=12, device=None):
        if len(img_array) != len(label_array):
            raise ValueError(
                f"img_array has {len(img_array)} items but label_array has {len(label_array)}"
            )
        self.img_array = img_array
        self.label_array = label_array
        self.processor = processor
        self.max_target_length = max_target_length
        self.device = device

    def __len__(self):
        """Return the number of image/label pairs."""
        return len(self.img_array)

    def __getitem__(self, idx):
        """Return ``{"pixel_values": tensor, "labels": tensor}`` for sample ``idx``."""
        # get image + label
        image = self.img_array[idx]
        text = self.label_array[idx]

        # prepare image (i.e. resize + normalize)
        image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
        # drop only the leading batch axis added by return_tensors="pt"
        pixel_values = self.processor(image, return_tensors="pt").pixel_values.squeeze(0)

        # add labels (input_ids) by encoding the label; truncation keeps every
        # sample at exactly max_target_length so batches can be stacked
        token_ids = self.processor.tokenizer(
            text,
            padding="max_length",
            max_length=self.max_target_length,
            truncation=True,
        ).input_ids

        # important: make sure that PAD tokens are ignored by the loss function
        pad_id = self.processor.tokenizer.pad_token_id
        labels = torch.tensor([tok if tok != pad_id else -100 for tok in token_ids])

        # Only move tensors when a device was requested explicitly; a hard-coded
        # accelerator here would fail on machines that do not have it.
        if self.device is not None:
            pixel_values = pixel_values.to(self.device)
            labels = labels.to(self.device)

        return {"pixel_values": pixel_values, "labels": labels}

##### Basic Values/Constants

In [13]:
# Pretrained checkpoint on the Hugging Face Hub that fine-tuning starts from.
MODEL_CKPT = "microsoft/trocr-base-printed"
# Output directory / repo name: the checkpoint's short name plus a project suffix.
MODEL_NAME = f'{MODEL_CKPT.rsplit("/", 1)[-1]}_bdc_license_plates_ocr'
# Number of passes over the training set.
NUM_OF_EPOCHS = 2

##### Instantiate Processor, Create Training, & Testing Dataset Instances

In [14]:
# One processor (image preprocessing + tokenizer) is shared by both datasets.
processor = TrOCRProcessor.from_pretrained(MODEL_CKPT)

# Augmented training samples.
train_ds = BDC_Dataset(
    train_image_array,
    train_label_array,
    processor=processor,
)

# Held-out samples.
test_ds = BDC_Dataset(
    test_img_array,
    test_label_array,
    processor=processor,
)

Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


##### Print Length of Training & Testing Datasets

In [15]:
# Report how many samples each split contains (one line per split).
print(
    f"The training dataset has {len(train_ds)} samples in it.",
    f"The testing dataset has {len(test_ds)} samples in it.",
    sep="\n",
)

The training dataset has 5096 samples in it.
The testing dataset has 160 samples in it.


##### Example of Input Data Shapes

In [16]:
# Fetch the first training sample; `encoding` is reused by the label-decoding cell.
encoding = train_ds[0]

# Print the shape of every tensor in the sample.
for name, tensor in encoding.items():
    print(name, " : ", tensor.shape)

pixel_values  :  torch.Size([3, 384, 384])
labels  :  torch.Size([12])


##### Show Example

In [16]:
# NOTE(review): stale snippet kept for reference only. BDC_Dataset stores an
# in-memory `img_array` (it has no `root_dir`), and no `train_dataset` variable
# is defined in the visible cells, so this would fail if uncommented.
# image = Image.open(train_ds.root_dir + train_dataset['file_name'][0]).convert("RGB")

# image

##### Show Label for Above Example

In [17]:
# Turn the ignore-index (-100) positions back into PAD ids so the tokenizer can
# decode them; the assignment edits the tensor stored in `encoding` in place.
labels = encoding['labels']
labels[labels.eq(-100)] = processor.tokenizer.pad_token_id

# Decode to text, dropping special tokens, and show it.
label_str = processor.decode(labels, skip_special_tokens=True)
print(label_str)

B1260TZT


#### Instantiate Model

In [18]:
# Load the pretrained encoder-decoder weights from the checkpoint named in MODEL_CKPT.
model = VisionEncoderDecoderModel.from_pretrained(MODEL_CKPT)

Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-base-printed and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


##### Model Configuration Modifications

In [19]:
# --- special tokens, taken from the processor's tokenizer ---
model.config.pad_token_id = processor.tokenizer.pad_token_id
model.config.eos_token_id = processor.tokenizer.sep_token_id
model.config.decoder_start_token_id = processor.tokenizer.cls_token_id

# --- mirror the decoder's vocabulary size on the top-level config ---
model.config.vocab_size = model.config.decoder.vocab_size

# --- generation settings (beam search) ---
model.config.num_beams = 4
model.config.max_length = 12
model.config.length_penalty = 2.0
model.config.no_repeat_ngram_size = 3
model.config.early_stopping = True

##### Define Metrics Evaluation

In [20]:
cer_metric = load_metric("cer")


def _pad_to_width(ids, width, pad_id):
    """Right-pad a 2-D id array with ``pad_id`` until it has ``width`` columns."""
    missing = width - ids.shape[1]
    if missing <= 0:
        return ids
    return np.pad(ids, ((0, 0), (0, missing)), constant_values=pad_id)


def compute_metrics(pred):
    """Compute evaluation metrics for a batch of generated sequences.

    Args:
        pred: Evaluation output exposing ``label_ids`` and ``predictions`` as
            2-D integer arrays of token ids (one row per sample).

    Returns:
        dict with
          * ``"char_acc"`` - mean position-wise agreement between label and
            predicted token ids (padding positions included), and
          * ``"cer"`` - character error rate of the decoded strings.
    """
    pad_id = processor.tokenizer.pad_token_id

    # np.where builds new arrays, so the caller's label_ids / predictions are
    # left untouched. -100 is the loss ignore-index used for label padding; it
    # is mapped back to PAD on both sides so padded positions compare equal and
    # so the tokenizer never sees a negative id when decoding.
    label_ids = np.asarray(pred.label_ids)
    pred_ids = np.asarray(pred.predictions)
    label_ids = np.where(label_ids == -100, pad_id, label_ids)
    pred_ids = np.where(pred_ids == -100, pad_id, pred_ids)

    # Generated sequences need not have the same width as the labels; align
    # both to the wider one before comparing element-wise.
    width = max(label_ids.shape[1], pred_ids.shape[1])
    aligned_labels = _pad_to_width(label_ids, width, pad_id)
    aligned_preds = _pad_to_width(pred_ids, width, pad_id)
    char_acc = np.mean(aligned_labels == aligned_preds)

    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.batch_decode(label_ids, skip_special_tokens=True)

    cer = cer_metric.compute(predictions=pred_str, references=label_str)

    return {"char_acc": char_acc, "cer": cer}

  cer_metric = load_metric("cer")


##### Define Training Arguments

In [25]:
args = Seq2SeqTrainingArguments(
    # where checkpoints are written
    output_dir=MODEL_NAME,
    # schedule: evaluate and checkpoint once per epoch
    num_train_epochs=NUM_OF_EPOCHS,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_first_step=True,
    # batching
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # run generate() during evaluation so the metrics see decoded sequences
    predict_with_generate=True,
    # Hugging Face Hub upload (private repository)
    push_to_hub=True,
    hub_private_repo=True,
    # device selection
    # NOTE(review): both accelerator flags are switched off here — confirm the intended device.
    no_cuda=True,
    use_mps_device=False,
    # fp16=False,
)

##### Define Trainer

In [26]:
# The model is passed as-is: the Trainer moves it (and every batch) to the
# device selected through `args`, so no accelerator is hard-coded here. A literal
# device string such as 'mps:0' fails on machines without that backend and can
# disagree with the device flags in the training arguments.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    data_collator=default_data_collator,
    compute_metrics=compute_metrics,
    # The image preprocessor is handed over in the `tokenizer` slot.
    # NOTE(review): presumably so that it is saved next to the model checkpoints — confirm.
    tokenizer=processor.feature_extractor,
)

/Users/mdaniyalk/Documents/github/work/BDC-2023/trocr-base-printed_bdc_license_plates_ocr is already a clone of https://huggingface.co/mdaniyalk/trocr-base-printed_bdc_license_plates_ocr. Make sure you pull the latest changes with `repo.git_pull()`.


In [23]:
# Authenticate with the Hugging Face Hub before training with push_to_hub=True.
# SECURITY: never paste an access token into the notebook; read it from the
# environment (or run `huggingface-cli login` interactively in a terminal).
# NOTE(review): a literal token used to be written on this line — revoke it on the Hub.
# !huggingface-cli login --token $HF_TOKEN

##### Fit/Train Model

In [27]:
# Run fine-tuning; `train_results.metrics` is read by the save cell that follows.
train_results = trainer.train()



                                       
  0%|          | 0/320 [01:01<?, ?it/s]           

{'loss': 9.9955, 'learning_rate': 4.996075353218211e-05, 'epoch': 0.0}




KeyboardInterrupt: 

##### Save Model & Training Metrics

In [66]:
# Persist the fine-tuned weights first.
trainer.save_model()

# Print the training metrics, then write them to disk.
train_metrics = train_results.metrics
trainer.log_metrics("train", train_metrics)
trainer.save_metrics("train", train_metrics)

# Finally store the trainer state.
trainer.save_state()

Saving model checkpoint to trocr-base-printed_license_plates_ocr
Configuration saved in trocr-base-printed_license_plates_ocr/config.json
Model weights saved in trocr-base-printed_license_plates_ocr/pytorch_model.bin
Feature extractor saved in trocr-base-printed_license_plates_ocr/preprocessor_config.json
Saving model checkpoint to trocr-base-printed_license_plates_ocr
Configuration saved in trocr-base-printed_license_plates_ocr/config.json
Model weights saved in trocr-base-printed_license_plates_ocr/pytorch_model.bin
Feature extractor saved in trocr-base-printed_license_plates_ocr/preprocessor_config.json
Several commits (2) will be pushed upstream.
The progress bars may be unreliable.


Upload file pytorch_model.bin:   0%|          | 32.0k/1.24G [00:00<?, ?B/s]

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/DunnBC22/trocr-base-printed_license_plates_ocr
   2ea0ec3..f8584da  main -> main



***** train metrics *****
  epoch                    =                 2.0
  train_loss               =              0.3359
  train_runtime            = 8 days, 23:01:49.71
  train_samples_per_second =               0.041
  train_steps_per_second   =               0.005


##### Evaluate Model

In [67]:
# Evaluate on the dataset given to the Trainer as eval_dataset.
metrics = trainer.evaluate()

# Print the evaluation metrics, then write them to disk (same order as before).
for report in (trainer.log_metrics, trainer.save_metrics):
    report("eval", metrics)

***** Running Evaluation *****
  Num examples = 4000
  Batch size = 8


  0%|          | 0/500 [00:00<?, ?it/s]

***** eval metrics *****
  epoch                   =         2.0
  eval_cer                =      0.0368
  eval_loss               =      0.1581
  eval_runtime            = 15:19:38.93
  eval_samples_per_second =       0.072
  eval_steps_per_second   =       0.009


##### Push Model to Hub (My Profile!!!)

In [68]:
# Metadata for the generated model card.
kwargs = {
    "finetuned_from" : model.config._name_or_path,
    "tasks" : "image-to-text",
    "tags" : ["image-to-text"],
}

# The metadata is forwarded on BOTH paths. Previously it was only used when
# push_to_hub was disabled, so the card uploaded to the Hub was missing the
# base-model, task and tag information.
if args.push_to_hub:
    trainer.push_to_hub("All Dunn!!!", **kwargs)
else:
    trainer.create_model_card(**kwargs)

Saving model checkpoint to trocr-base-printed_license_plates_ocr
Configuration saved in trocr-base-printed_license_plates_ocr/config.json
Model weights saved in trocr-base-printed_license_plates_ocr/pytorch_model.bin
Feature extractor saved in trocr-base-printed_license_plates_ocr/preprocessor_config.json


Upload file .DS_Store: 100%|##########| 6.00k/6.00k [00:00<?, ?B/s]

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/DunnBC22/trocr-base-printed_license_plates_ocr
   f8584da..3b058e3  main -> main



### Notes & Other Takeaways From This Project
****
- The results were pretty good. I was pondering whether to train for 2 or 3 epochs. Ultimately, I trained this model for 2 epochs. If this were a work project (where multiprocessing and other options are available), I would have trained for 3, if not 4, epochs.
****

### Citations

##### For Transformer Checkpoint
- @misc{li2021trocr,
      title={TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models}, 
      author={Minghao Li and Tengchao Lv and Lei Cui and Yijuan Lu and Dinei Florencio and Cha Zhang and Zhoujun Li and Furu Wei},
      year={2021},
      eprint={2109.10282},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}

##### For CER Metric
- @inproceedings{morris2004,
author = {Morris, Andrew and Maier, Viktoria and Green, Phil},
year = {2004},
month = {01},
pages = {},
title = {From WER and RIL to MER and WIL: improved evaluation measures for connected speech recognition.}
}