<a href="https://colab.research.google.com/github/mayourbukhari/AI-and-ML-Resources/blob/main/Mohsin_Bukhari_OCR_Code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install dependencies
!pip install transformers datasets torch torchvision opencv-python pillow evaluate jiwer kagglehub

import torch
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from datasets import load_dataset, concatenate_datasets
from torch.utils.data import DataLoader
import cv2
import numpy as np
from PIL import Image
import evaluate
import kagglehub
import os




In [None]:
# 1. Setup processor and model
# The processor and the model are both loaded from the same checkpoint name.
TROCR_CHECKPOINT = "microsoft/trocr-large-handwritten"
processor = TrOCRProcessor.from_pretrained(TROCR_CHECKPOINT)
model = VisionEncoderDecoderModel.from_pretrained(TROCR_CHECKPOINT)

Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 16,
  "num_channels": 3,
  "num_hidden_layers": 24,
  "patch_size": 16,
  "pooler_act": "tanh",
  "pooler_output_size": 1024,
  "qkv_bias": false,
  "torch_dtype": "float32",
  "transformers_version": "4.51.3"
}

Config of the decoder: <class 'transformers.models.trocr.modeling_trocr.TrOCRForCausalLM'> is overwritten by shared decoder config: TrOCRConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_cross_attention": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "cross_attention_hidden_size": 1024,
  "d_

In [None]:
# 2. Load datasets
# Fetch the Imgur5K sample through kagglehub; the returned value is the local
# path that is walked for image files further down.
IMGUR_HANDLE = "andrianang/imgur5k-sample"
print("Downloading Imgur5K dataset...")
imgur_path = kagglehub.dataset_download(IMGUR_HANDLE)
print("Path to Imgur5K files:", imgur_path)

# Load IAM dataset (Hugging Face or manual upload)
try:
    iam_dataset = load_dataset("handwriting-iam", split="train")  # Adjust split if needed
except Exception as err:
    # `except Exception` instead of a bare `except:` so that a kernel interrupt
    # (KeyboardInterrupt) still propagates, and the real cause is shown.
    print("IAM not found on Hugging Face. Upload manually.")
    print(f"load_dataset failed with {type(err).__name__}: {err}")
    # Aliased so the module cannot be confused with (or clobbered by)
    # ordinary variables named `files`.
    from google.colab import files as colab_files
    uploaded = colab_files.upload()  # Upload IAM dataset files
    # NOTE(review): "path_to_iam" is a placeholder and must be replaced with the
    # location of the uploaded data. The loaded dataset presumably needs
    # "image_path" and "text" columns to match the rest of the notebook -- confirm.
    iam_dataset = load_dataset("path_to_iam", split="train")  # Adjust path

# Load Imgur5K dataset
# Note: Imgur5K-sample may require custom loading due to its structure
# Assuming it contains images and annotations (adjust based on actual dataset)
from datasets import Dataset, Image as DatasetImage

IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg')

imgur_images = []
imgur_texts = []
# Example: Adjust based on actual file structure in imgur_path
for root, dirnames, filenames in os.walk(imgur_path):
    # Sorting (in place for directories) makes the traversal order, and therefore
    # the row order of the dataset, independent of the filesystem.
    dirnames.sort()
    for filename in sorted(filenames):
        # Lower-case before matching so '.JPG' / '.PNG' files are not skipped.
        if filename.lower().endswith(IMAGE_EXTENSIONS):
            imgur_images.append(os.path.join(root, filename))
            # TODO: placeholder label. Every image currently gets the same text;
            # load the real annotations (CSV/JSON) before training on this data.
            imgur_texts.append("sample_text")  # Replace with actual text extraction

print(f"Found {len(imgur_images)} Imgur5K image files")

# The "image_path" column is cast to the datasets Image feature, so rows are
# decoded to images when accessed rather than staying plain path strings.
imgur_dataset = Dataset.from_dict({
    "image_path": imgur_images,
    "text": imgur_texts
}).cast_column("image_path", DatasetImage())

# Combine datasets
# Shuffle with a fixed seed before slicing: the concatenation keeps all IAM rows
# first and all Imgur5K rows last, so contiguous slices would otherwise put only
# the tail dataset into the validation/test splits.
dataset = concatenate_datasets([iam_dataset, imgur_dataset]).shuffle(seed=42)

# Split dataset: 80% train / 10% val / 10% test, boundaries computed once.
n_total = len(dataset)
train_end = int(0.8 * n_total)
val_end = int(0.9 * n_total)
train_dataset = dataset.select(range(train_end))  # 80% train
val_dataset = dataset.select(range(train_end, val_end))  # 10% val
test_dataset = dataset.select(range(val_end, n_total))  # 10% test

Downloading Imgur5K dataset...
Path to Imgur5K files: /kaggle/input/imgur5k-sample
IAM not found on Hugging Face. Upload manually.


In [None]:

# 3. Preprocessing
def preprocess_data(examples):
    """Turn a batch of raw examples into model inputs.

    Args:
        examples: batch dict with an "image_path" list (file paths or
            already-decoded PIL images) and a "text" list of target strings.

    Returns:
        dict with "pixel_values" (output of the TrOCR processor for the batch)
        and "labels" (token ids, padded/truncated to 128 per example).
    """
    images = []
    for item in examples["image_path"]:
        # The column may hold decoded PIL images (after a cast to the datasets
        # Image feature) or plain paths; Image.open only works on the latter.
        if isinstance(item, Image.Image):
            gray = item.convert("L")  # Grayscale
        else:
            # Context manager closes the file handle once the pixels are copied.
            with Image.open(item) as opened:
                gray = opened.convert("L")  # Grayscale
        gray = gray.resize((384, 384))  # Resize for TrOCR
        blurred = cv2.GaussianBlur(np.array(gray), (3, 3), 0)  # Noise reduction
        # The encoder is configured for 3 input channels, so the single-channel
        # result is expanded back to RGB before it reaches the processor.
        images.append(Image.fromarray(blurred).convert("RGB"))
    pixel_values = processor(images, return_tensors="pt").pixel_values
    # truncation=True keeps every label row at exactly max_length ids; without it
    # texts longer than 128 tokens would produce ragged rows.
    labels = processor.tokenizer(
        examples["text"],
        padding="max_length",
        max_length=128,
        truncation=True,
    ).input_ids
    return {"pixel_values": pixel_values, "labels": labels}

# Apply preprocessing
def _prepare_split(split):
    """Run preprocess_data over one split and expose the result as torch tensors."""
    processed = split.map(
        preprocess_data,
        batched=True,
        batch_size=16,  # default is 1000 images per call, which is very memory-hungry here
        remove_columns=["image_path", "text"],
    )
    # Without a torch format the rows come back as nested Python lists, and the
    # default DataLoader collate does not produce stacked batch tensors.
    return processed.with_format("torch")


train_dataset = _prepare_split(train_dataset)
val_dataset = _prepare_split(val_dataset)
test_dataset = _prepare_split(test_dataset)

# DataLoader
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)  # Batch size for T4
val_dataloader = DataLoader(val_dataset, batch_size=4)
test_dataloader = DataLoader(test_dataset, batch_size=4)

In [None]:
# 4. Fine-tuning setup
# Pick the GPU when one is visible, otherwise fall back to the CPU, then move
# the model there and create the optimizer and the loss scaler.
use_gpu = torch.cuda.is_available()
device = torch.device("cuda") if use_gpu else torch.device("cpu")
model.to(device)
optimizer = torch.optim.AdamW(params=model.parameters(), lr=5e-5)
scaler = torch.cuda.amp.GradScaler()  # Mixed precision

In [None]:

# 5. Training loop
NUM_EPOCHS = 3  # 3 epochs for 1-hour limit; increase to 10 if time allows
pad_token_id = processor.tokenizer.pad_token_id
use_amp = device.type == "cuda"  # autocast is only enabled when running on the GPU

# Fill in the special-token ids only when the loaded config leaves them unset;
# values already present in the checkpoint config are kept as they are.
if model.config.pad_token_id is None:
    model.config.pad_token_id = pad_token_id
if model.config.decoder_start_token_id is None:
    model.config.decoder_start_token_id = processor.tokenizer.cls_token_id

model.train()
for epoch in range(NUM_EPOCHS):
    running_loss = 0.0
    num_batches = 0
    for batch in train_dataloader:
        pixel_values = batch["pixel_values"].to(device)
        labels = batch["labels"].to(device)
        # Padding positions are set to -100 so the loss ignores them; this is
        # done on the device copy only, the dataset rows are left untouched.
        labels = labels.masked_fill(labels == pad_token_id, -100)

        optimizer.zero_grad()
        with torch.autocast(device_type=device.type, enabled=use_amp):
            outputs = model(pixel_values=pixel_values, labels=labels)
            loss = outputs.loss
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        running_loss += loss.item()
        num_batches += 1

    # Report the mean loss over the epoch rather than the last batch's loss, and
    # avoid a NameError when the dataloader yields nothing.
    if num_batches:
        print(f"Epoch {epoch+1}, Loss: {running_loss / num_batches}")
    else:
        print(f"Epoch {epoch+1}: train_dataloader produced no batches")

In [None]:
# 6. Evaluation
# Generate text for every test batch, decode predictions and reference labels,
# then score them with character and word error rate.
cer_metric = evaluate.load("cer")
wer_metric = evaluate.load("wer")
model.eval()
predictions, references = [], []
skipped_empty = 0
for batch in test_dataloader:
    pixel_values = batch["pixel_values"].to(device)
    with torch.no_grad():
        outputs = model.generate(pixel_values)
    pred_texts = processor.batch_decode(outputs, skip_special_tokens=True)
    ref_texts = processor.batch_decode(batch["labels"], skip_special_tokens=True)
    for pred_text, ref_text in zip(pred_texts, ref_texts):
        # The jiwer-backed metrics reject empty reference strings, so such pairs
        # are left out and counted instead of aborting the whole evaluation.
        if ref_text.strip():
            predictions.append(pred_text)
            references.append(ref_text)
        else:
            skipped_empty += 1

if skipped_empty:
    print(f"Skipped {skipped_empty} examples with empty reference text")

if references:
    cer_score = cer_metric.compute(predictions=predictions, references=references)
    wer_score = wer_metric.compute(predictions=predictions, references=references)
    print(f"CER: {cer_score*100:.2f}%, WER: {wer_score*100:.2f}%")
else:
    print("No non-empty references to evaluate; CER/WER not computed.")

In [None]:
# 7. Save model
# Write the model and the processor into one directory, then additionally dump
# the raw state dict to a standalone .pth file.
output_dir = "fine_tuned_trocr"
for artifact in (model, processor):
    artifact.save_pretrained(output_dir)
state_dict = model.state_dict()
torch.save(state_dict, "fine_tuned_trocr.pth")

In [None]:

# 8. Download model
# Colab-only cell: it relies on the google.colab helper module and on the `zip`
# shell command being available in the runtime.
from google.colab import files
# Archive the fine_tuned_trocr directory recursively into fine_tuned_trocr.zip.
!zip -r fine_tuned_trocr.zip fine_tuned_trocr
# Hand the archive to the Colab download helper.
files.download("fine_tuned_trocr.zip")



Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 16,
  "num_channels": 3,
  "num_hidden_layers": 24,
  "patch_size": 16,
  "pooler_act": "tanh",
  "pooler_output_size": 1024,
  "qkv_bias": false,
  "torch_dtype": "float32",
  "transformers_version": "4.51.3"
}

Config of the decoder: <class 'transformers.models.trocr.modeling_trocr.TrOCRForCausalLM'> is overwritten by shared decoder config: TrOCRConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_cross_attention": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "cross_attention_hidden_size": 1024,
  "d_

Downloading Imgur5K dataset...


ValueError: Invalid dataset handle: https://www.kaggle.com/datasets/andrianang/imgur5k-sample