<a href="https://colab.research.google.com/github/lyra29/NRCOCRTr/blob/main/TrOCR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
# Attach Google Drive so the dataset, cache and saved model live outside the Colab VM.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
!pip install transformers datasets sentencepiece timm




In [None]:
import os

# Location in Google Drive where the TrOCR weights are cached between sessions.
cache_dir = "/content/drive/MyDrive/trocr_cache"

# TRANSFORMERS_CACHE is deprecated in recent transformers releases; HF_HUB_CACHE is
# the variable huggingface_hub reads for the same purpose. Setting both keeps the
# cache on Drive regardless of the installed version.
for cache_env_var in ("TRANSFORMERS_CACHE", "HF_HUB_CACHE"):
    os.environ[cache_env_var] = cache_dir

print("Cache directory:", cache_dir)


Cache directory: /content/drive/MyDrive/trocr_cache


In [None]:
import pandas as pd

# Peek at the label file: first rows, plus the Python type and value of the first label.
labels_csv = "/content/drive/MyDrive/trocr_cache/myanmar_digits/labels.csv"
df = pd.read_csv(labels_csv, encoding="utf-8")
print(df.head())

first_label = df.loc[0, "text"]
print(type(first_label), first_label)


        image text
0  img1_0.jpg    ၀
1  img1_1.jpg    ၁
2  img1_2.jpg    ၂
3  img1_3.jpg    ၃
4  img1_4.jpg    ၄
<class 'str'> ၀


Load the labels CSV and import the dataset base class

In [None]:
import os
import pandas as pd
from PIL import Image
from torch.utils.data import Dataset

# Root folder of the digit dataset on Drive; the label file sits directly inside it.
DATASET_DIR = "/content/drive/MyDrive/trocr_cache/myanmar_digits"
CSV_PATH = os.path.join(DATASET_DIR, "labels.csv")

# One row per sample: image file name and its text label.
df = pd.read_csv(CSV_PATH, encoding="utf-8")

sample_count = len(df)
print("Total samples:", sample_count)
print(df.head())


Total samples: 240
        image text
0  img1_0.jpg    ၀
1  img1_1.jpg    ၁
2  img1_2.jpg    ၂
3  img1_3.jpg    ၃
4  img1_4.jpg    ၄


Dataset class for TrOCR

In [None]:
from transformers import TrOCRProcessor

class MyanmarDigitDataset(Dataset):
    """Map-style dataset that turns (image file, label) rows into TrOCR inputs.

    Args:
        dataframe: Table with an ``image`` column (file name) and a ``text``
            column (label), one row per sample.
        dataset_dir: Directory containing the ``images`` sub-folder.
        processor: Processor called with ``images=`` and ``text=``; its
            ``tokenizer.pad_token_id`` is used to find padding in the labels.

    Each item is a dict of tensors with the leading batch dimension removed.
    Padding positions in ``labels`` are replaced by ``-100`` so the
    cross-entropy loss ignores them; without this the loss is dominated by
    padding tokens and looks misleadingly small.
    """

    # Target value skipped by PyTorch's cross-entropy loss (its default ignore_index).
    IGNORE_INDEX = -100

    def __init__(self, dataframe, dataset_dir, processor):
        self.df = dataframe
        self.dataset_dir = dataset_dir
        self.processor = processor

    def __len__(self):
        """Return the number of labelled samples."""
        return len(self.df)

    @staticmethod
    def _mask_padding(labels, pad_token_id, ignore_index=-100):
        """Return a copy of ``labels`` with every ``pad_token_id`` set to ``ignore_index``."""
        masked = labels.clone()
        masked[masked == pad_token_id] = ignore_index
        return masked

    def __getitem__(self, idx):
        """Load sample ``idx`` and return its processed tensors as a dict."""
        row = self.df.iloc[idx]

        img_path = os.path.join(self.dataset_dir, "images", row["image"])

        # convert() loads the pixel data into a new image, so the file handle
        # can be closed as soon as the block exits.
        with Image.open(img_path) as raw_image:
            image = raw_image.convert("RGB")
        text = str(row["text"])

        encoding = self.processor(
            images=image,
            text=text,
            padding="max_length",
            return_tensors="pt"
        )

        # The processor returns batched tensors; drop the batch dimension of 1.
        item = {k: v.squeeze(0) for k, v in encoding.items()}

        # Exclude padding from the loss.
        pad_token_id = getattr(
            getattr(self.processor, "tokenizer", None), "pad_token_id", None
        )
        if "labels" in item and pad_token_id is not None:
            item["labels"] = self._mask_padding(
                item["labels"], pad_token_id, self.IGNORE_INDEX
            )

        return item


Load the model and processor

In [None]:
from transformers import VisionEncoderDecoderModel, AutoTokenizer

MODEL_NAME = "microsoft/trocr-base-handwritten"

# Both objects come from the same checkpoint and share the Drive cache folder.
pretrained_kwargs = {"cache_dir": cache_dir}
processor = TrOCRProcessor.from_pretrained(MODEL_NAME, **pretrained_kwargs)
model = VisionEncoderDecoderModel.from_pretrained(MODEL_NAME, **pretrained_kwargs)

# Add the ten Myanmar digits to the processor's tokenizer as extra tokens.
tokenizer = processor.tokenizer
myanmar_digits = [
    "၀", "၁", "၂", "၃", "၄",
    "၅", "၆", "၇", "၈", "၉",
]
tokenizer.add_tokens(myanmar_digits)

# Resize the decoder embedding table to the enlarged vocabulary; as the last
# expression of the cell, the resized embedding module is displayed.
model.decoder.resize_token_embeddings(len(tokenizer))


preprocessor_config.json:   0%|          | 0.00/224 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-base-handwritten and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


TrOCRScaledWordEmbedding(50275, 1024, padding_idx=1)

Recreate processor with updated tokenizer

In [None]:
# Rebuild the processor around the tokenizer that now contains the Myanmar digits.
# `image_processor` replaces the deprecated `feature_extractor` argument/property,
# which emits a FutureWarning and is scheduled for removal.
processor = TrOCRProcessor(
    image_processor=processor.image_processor,  # reuse the original image preprocessing
    tokenizer=tokenizer,                         # updated tokenizer with Myanmar digits
)




Fix decoder start token

In [None]:
# Copy the special-token ids from the tokenizer into the model config:
# the padding id, and the id placed at the start of every decoder sequence.
model.config.pad_token_id = tokenizer.pad_token_id
model.config.decoder_start_token_id = tokenizer.cls_token_id

Freeze the vision encoder: TrOCR was designed for longer text rather than single digits, so fine-tuning the whole encoder on this small digit dataset can cause overfitting.

In [None]:
# Exclude every vision-encoder weight from gradient updates to limit overfitting.
for encoder_weight in model.encoder.parameters():
    encoder_weight.requires_grad_(False)


Create dataset

In [None]:
# Training set: every row of the labels table, read from DATASET_DIR/images.
dataset = MyanmarDigitDataset(
    dataframe=df,
    dataset_dir=DATASET_DIR,
    processor=processor,
)


In [None]:
import os

# Sanity check of the folder layout: top-level entries, then the first five image files.
digits_root = "/content/drive/MyDrive/trocr_cache/myanmar_digits"
print(os.listdir(digits_root))

image_files = os.listdir("/content/drive/MyDrive/trocr_cache/myanmar_digits/images")
print(image_files[:5])


['labels.csv', 'images']
['img35_3.jpg', 'img35_2.jpg', 'img35_1.jpg', 'img35_0.jpg', 'img31_9.jpg']


Fine-tune with the Trainer API

In [None]:
from transformers import TrainingArguments, Trainer

import torch  # needed here to check whether a CUDA GPU is available

training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/trocr_cache/trocr_myanmar_digits",
    per_device_train_batch_size=4,   # small batch for a small dataset
    num_train_epochs=3,
    learning_rate=3e-5,              # lower = more stable

    logging_steps=10,
    # The recorded run takes 180 steps in total, so this interval is never
    # reached; the final model is saved explicitly in a later cell.
    save_steps=10000,
    save_total_limit=1,

    # fp16 mixed precision is only supported on a CUDA GPU; hard-coding True
    # raises an error on a CPU runtime.
    fp16=torch.cuda.is_available(),
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
)

# Last expression of the cell: displays the TrainOutput summary.
trainer.train()


Step,Training Loss
10,0.0069
20,0.0061
30,0.0056
40,0.0053
50,0.0051
60,0.0049
70,0.0048
80,0.0049
90,0.0046
100,0.0048


TrainOutput(global_step=180, training_loss=0.00499701104644272, metrics={'train_runtime': 113.1073, 'train_samples_per_second': 6.366, 'train_steps_per_second': 1.591, 'total_flos': 5.3876533751709696e+17, 'train_loss': 0.00499701104644272, 'epoch': 3.0})

In [None]:
# Test immediately
from PIL import Image
import torch

# Quick check right after training, using one image from the separate test folder.
img_path = "/content/drive/MyDrive/trocr_cache/myanmar_digits_test/test3.jpg"

# Run on the GPU when one is present, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()

image = Image.open(img_path).convert("RGB")
inputs = processor(images=image, return_tensors="pt")
pixel_values = inputs.pixel_values.to(device)

# max_length=2 caps the whole sequence at two tokens; num_beams=1 disables beam search.
with torch.no_grad():
    generated_ids = model.generate(pixel_values, max_length=2,num_beams=1)

decoded = processor.batch_decode(generated_ids, skip_special_tokens=True)
text = decoded[0]
print(text)

၃


Save the model

In [None]:
# Destination folder on Drive for the fine-tuned artifacts.
save_dir = "/content/drive/MyDrive/trocr_cache/myanmar_digits_model_final"

# Write the model weights first, then the processor (tokenizer and image
# preprocessing config), into the same folder.
for artifact in (model, processor):
    artifact.save_pretrained(save_dir)

print("Model successfully saved to:", save_dir)


Model successfully saved to: /content/drive/MyDrive/trocr_cache/myanmar_digits_model_final


Load the saved model from Google Drive

In [None]:
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
import torch
save_dir = "/content/drive/MyDrive/trocr_cache/myanmar_digits_model_final"

# local_files_only=True reads from the saved folder without contacting the Hub.
model = VisionEncoderDecoderModel.from_pretrained(save_dir, local_files_only=True)
processor = TrOCRProcessor.from_pretrained(save_dir, local_files_only=True)

# Same device selection as the inference cells.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Assigning the result stops the cell from echoing the full module repr as output.
model = model.to(device)


VisionEncoderDecoderModel(
  (encoder): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=False)
              (key): Linear(in_features=768, out_features=768, bias=False)
              (value): Linear(in_features=768, out_features=768, bias=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=True)
            (i

Test the saved model

In [None]:
from PIL import Image
import torch

# Image to classify. NOTE(review): this file sits in the dataset's images folder,
# so it is presumably a training sample; the cell then checks the save/load
# round trip rather than generalization.
img_path = "/content/drive/MyDrive/trocr_cache/myanmar_digits/images/img9_4.jpg"

# Put the reloaded model on the available device and switch to inference mode.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()

# Preprocess the image into the pixel tensor the encoder expects.
image = Image.open(img_path).convert("RGB")
inputs = processor(images=image, return_tensors="pt")
pixel_values = inputs.pixel_values.to(device)

# Decode without beam search; the whole sequence is capped at two tokens.
with torch.no_grad():
    generated_ids = model.generate(pixel_values, max_length=2, num_beams=1)

# Convert the token ids back to text, dropping special tokens.
decoded = processor.batch_decode(generated_ids, skip_special_tokens=True)
text = decoded[0]
print("Predicted Text:", text)


Predicted Text: ၄
