In [1]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import torchvision.transforms as transforms

from PIL import Image
from tqdm.notebook import tqdm
from datetime import datetime
from datasets import load_dataset
from torch.cuda.amp import GradScaler, autocast

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

dataset = load_dataset("Teklia/IAM-line")["train"]
os.makedirs("temp_images", exist_ok=True)

data, vocab, max_len = [], set(), 0
for idx, example in tqdm(enumerate(dataset), total=len(dataset)):
    image = example["image"]
    label = example["text"]
    image_path = os.path.join("temp_images", f"{idx}.png")
    image.save(image_path)
    vocab.update(label)
    max_len = max(max_len, len(label))
    data.append([image_path, label])


Using device: cuda


  0%|          | 0/6482 [00:00<?, ?it/s]

In [2]:
class ModelConfigs:
    def __init__(self):
        timestamp = datetime.strftime(datetime.now(), "%Y%m%d%H%M")
        self.model_path = os.path.join("Models/CRNN", timestamp)
        os.makedirs(self.model_path, exist_ok=True)
        self.vocab = "".join(sorted(vocab))
        self.height = 64
        self.max_width = 2048
        self.max_text_length = max_len
        self.batch_size = 4
        self.learning_rate = 0.0003
        self.train_epochs = 50
        self.device = device

configs = ModelConfigs()


In [3]:
class ResizeKeepAspect:
    def __init__(self, height, max_width=None):
        self.height = height
        self.max_width = max_width

    def __call__(self, img):
        w, h = img.size
        new_w = min(int(w * (self.height / h)), self.max_width)
        return img.resize((new_w, self.height), Image.BILINEAR)

augmentation = transforms.RandomApply([
    transforms.RandomRotation(5),
    transforms.RandomPerspective(0.2, p=1.0),
    transforms.ColorJitter(0.3, 0.3)
], p=0.5)

transform = transforms.Compose([
    transforms.Grayscale(1),
    augmentation,
    ResizeKeepAspect(configs.height, configs.max_width),
    transforms.ToTensor(),
])

class IAMDataset(Dataset):
    def __init__(self, data, vocab, transform=None):
        self.data = data
        self.vocab = vocab
        self.transform = transform
        self.char_to_idx = {ch: i for i, ch in enumerate(vocab)}

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        image_path, text = self.data[idx]
        image = Image.open(image_path).convert("L")
        if self.transform:
            image = self.transform(image)
        label = torch.tensor([self.char_to_idx[ch] for ch in text], dtype=torch.long)
        return image, label, len(label)

def collate_fn(batch):
    images, labels, lengths = zip(*batch)
    max_w = max(img.shape[2] for img in images)
    images = [F.pad(img, (0, max_w - img.shape[2])) for img in images]
    images = torch.stack(images)
    max_len = max(lengths)
    padded_labels = torch.full((len(labels), max_len), -1, dtype=torch.long)
    for i, label in enumerate(labels):
        padded_labels[i, :len(label)] = label
    return images, padded_labels, torch.tensor(lengths)


In [4]:
split = int(0.9 * len(data))
train_loader = DataLoader(IAMDataset(data[:split], configs.vocab, transform), batch_size=configs.batch_size, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(IAMDataset(data[split:], configs.vocab, transform), batch_size=configs.batch_size, shuffle=False, collate_fn=collate_fn)

class CRNN(nn.Module):
    def __init__(self, vocab_size, hidden=512, dropout=0.3):
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(1, 64, 3, padding=1), nn.BatchNorm2d(64), nn.ReLU(), nn.Dropout(dropout),
            nn.Conv2d(64, 64, 3, padding=1), nn.BatchNorm2d(64), nn.ReLU(), nn.Dropout(dropout),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(64, 128, 3, padding=1), nn.BatchNorm2d(128), nn.ReLU(), nn.Dropout(dropout),
            nn.Conv2d(128, 128, 3, padding=1), nn.BatchNorm2d(128), nn.ReLU(), nn.Dropout(dropout),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(128, 256, 3, padding=1), nn.BatchNorm2d(256), nn.ReLU(), nn.Dropout(dropout),
            nn.Conv2d(256, 256, 3, padding=1), nn.BatchNorm2d(256), nn.ReLU(), nn.Dropout(dropout),
        )
        self.lstm = nn.LSTM(input_size=256 * (configs.height // 4), hidden_size=hidden,
                            num_layers=2, bidirectional=True, dropout=dropout, batch_first=True)
        self.fc = nn.Linear(hidden * 2, vocab_size + 1)

    def forward(self, x):
        x = self.conv(x)
        b, c, h, w = x.size()
        x = x.permute(0, 3, 1, 2).reshape(b, w, c * h)
        x, _ = self.lstm(x)
        return self.fc(x).permute(1, 0, 2)

model = CRNN(len(configs.vocab)).to(configs.device)
criterion = nn.CTCLoss(blank=len(configs.vocab))
optimizer = optim.Adam(model.parameters(), lr=configs.learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=3, verbose=True)
scaler = GradScaler()


  scaler = GradScaler()


In [5]:
def train(model, train_loader, val_loader, configs):
    best_val = float("inf")
    for epoch in range(1, configs.train_epochs + 1):
        model.train()
        total_loss = 0
        for images, labels, label_lens in tqdm(train_loader, desc=f"Epoch {epoch}"):
            images, labels = images.to(configs.device), labels.to(configs.device)
            label_lens = label_lens.to(configs.device)
            optimizer.zero_grad(set_to_none=True)
            with autocast():
                output = model(images)
                input_lens = torch.full((output.size(1),), output.size(0), dtype=torch.long).to(configs.device)
                log_probs = F.log_softmax(output, dim=2)
                loss = criterion(log_probs, labels, input_lens, label_lens)
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            total_loss += loss.item()
        print(f"[Epoch {epoch}] Train Loss: {total_loss / len(train_loader):.4f}")

        # Validation
        model.eval()
        with torch.no_grad():
            val_loss = 0
            for images, labels, label_lens in val_loader:
                images, labels = images.to(configs.device), labels.to(configs.device)
                label_lens = label_lens.to(configs.device)
                output = model(images)
                input_lens = torch.full((output.size(1),), output.size(0), dtype=torch.long).to(configs.device)
                log_probs = F.log_softmax(output, dim=2)
                loss = criterion(log_probs, labels, input_lens, label_lens)
                val_loss += loss.item()
        val_loss /= len(val_loader)
        print(f"[Epoch {epoch}] Val Loss: {val_loss:.4f}")
        scheduler.step(val_loss)
        if val_loss < best_val:
            best_val = val_loss
            torch.save(model.state_dict(), os.path.join(configs.model_path, "best_model.pth"))
            print("✅ Best model saved.")

train(model, train_loader, val_loader, configs)


Epoch 1:   0%|          | 0/1459 [00:00<?, ?it/s]

  with autocast():


[Epoch 1] Train Loss: 3.0983
[Epoch 1] Val Loss: 2.3975
✅ Best model saved.


Epoch 2:   0%|          | 0/1459 [00:00<?, ?it/s]

[Epoch 2] Train Loss: 1.6307
[Epoch 2] Val Loss: 1.2358
✅ Best model saved.


Epoch 3:   0%|          | 0/1459 [00:00<?, ?it/s]

[Epoch 3] Train Loss: 0.9913
[Epoch 3] Val Loss: 0.9313
✅ Best model saved.


Epoch 4:   0%|          | 0/1459 [00:00<?, ?it/s]

[Epoch 4] Train Loss: 0.7814
[Epoch 4] Val Loss: 0.9026
✅ Best model saved.


Epoch 5:   0%|          | 0/1459 [00:00<?, ?it/s]

[Epoch 5] Train Loss: 0.6780
[Epoch 5] Val Loss: 0.8951
✅ Best model saved.


Epoch 6:   0%|          | 0/1459 [00:00<?, ?it/s]

[Epoch 6] Train Loss: 0.5992
[Epoch 6] Val Loss: 0.6864
✅ Best model saved.


Epoch 7:   0%|          | 0/1459 [00:00<?, ?it/s]

[Epoch 7] Train Loss: 0.5387
[Epoch 7] Val Loss: 0.7655


Epoch 8:   0%|          | 0/1459 [00:00<?, ?it/s]

[Epoch 8] Train Loss: 0.5112
[Epoch 8] Val Loss: 0.7410


Epoch 9:   0%|          | 0/1459 [00:00<?, ?it/s]

[Epoch 9] Train Loss: 0.4705
[Epoch 9] Val Loss: 0.6552
✅ Best model saved.


Epoch 10:   0%|          | 0/1459 [00:00<?, ?it/s]

[Epoch 10] Train Loss: 0.4391
[Epoch 10] Val Loss: 0.6203
✅ Best model saved.


Epoch 11:   0%|          | 0/1459 [00:00<?, ?it/s]

[Epoch 11] Train Loss: 0.4122
[Epoch 11] Val Loss: 0.6345


Epoch 12:   0%|          | 0/1459 [00:00<?, ?it/s]

[Epoch 12] Train Loss: 0.3941
[Epoch 12] Val Loss: 0.5992
✅ Best model saved.


Epoch 13:   0%|          | 0/1459 [00:00<?, ?it/s]

[Epoch 13] Train Loss: 0.3696
[Epoch 13] Val Loss: 0.5343
✅ Best model saved.


Epoch 14:   0%|          | 0/1459 [00:00<?, ?it/s]

[Epoch 14] Train Loss: 0.3559
[Epoch 14] Val Loss: 0.5572


Epoch 15:   0%|          | 0/1459 [00:00<?, ?it/s]

[Epoch 15] Train Loss: 0.3496
[Epoch 15] Val Loss: 0.4830
✅ Best model saved.


Epoch 16:   0%|          | 0/1459 [00:00<?, ?it/s]

[Epoch 16] Train Loss: 0.3244
[Epoch 16] Val Loss: 0.4800
✅ Best model saved.


Epoch 17:   0%|          | 0/1459 [00:00<?, ?it/s]

[Epoch 17] Train Loss: 0.3102
[Epoch 17] Val Loss: 0.4907


Epoch 18:   0%|          | 0/1459 [00:00<?, ?it/s]

[Epoch 18] Train Loss: 0.2939
[Epoch 18] Val Loss: 0.5281


Epoch 19:   0%|          | 0/1459 [00:00<?, ?it/s]

[Epoch 19] Train Loss: 0.2861
[Epoch 19] Val Loss: 0.4569
✅ Best model saved.


Epoch 20:   0%|          | 0/1459 [00:00<?, ?it/s]

[Epoch 20] Train Loss: 0.2748
[Epoch 20] Val Loss: 0.4884


Epoch 21:   0%|          | 0/1459 [00:00<?, ?it/s]

[Epoch 21] Train Loss: 0.2700
[Epoch 21] Val Loss: 0.4720


Epoch 22:   0%|          | 0/1459 [00:00<?, ?it/s]

[Epoch 22] Train Loss: 0.2591
[Epoch 22] Val Loss: 0.5144


Epoch 23:   0%|          | 0/1459 [00:00<?, ?it/s]

[Epoch 23] Train Loss: 0.2496
[Epoch 23] Val Loss: 0.4934


Epoch 24:   0%|          | 0/1459 [00:00<?, ?it/s]

[Epoch 24] Train Loss: 0.2155
[Epoch 24] Val Loss: 0.4265
✅ Best model saved.


Epoch 25:   0%|          | 0/1459 [00:00<?, ?it/s]

[Epoch 25] Train Loss: 0.1931
[Epoch 25] Val Loss: 0.4366


Epoch 26:   0%|          | 0/1459 [00:00<?, ?it/s]

[Epoch 26] Train Loss: 0.1951
[Epoch 26] Val Loss: 0.4711


Epoch 27:   0%|          | 0/1459 [00:00<?, ?it/s]

[Epoch 27] Train Loss: 0.1882
[Epoch 27] Val Loss: 0.4559


Epoch 28:   0%|          | 0/1459 [00:00<?, ?it/s]

[Epoch 28] Train Loss: 0.1806
[Epoch 28] Val Loss: 0.4701


Epoch 29:   0%|          | 0/1459 [00:00<?, ?it/s]

[Epoch 29] Train Loss: 0.1751
[Epoch 29] Val Loss: 0.4320


Epoch 30:   0%|          | 0/1459 [00:00<?, ?it/s]

[Epoch 30] Train Loss: 0.1753
[Epoch 30] Val Loss: 0.4450


Epoch 31:   0%|          | 0/1459 [00:00<?, ?it/s]

[Epoch 31] Train Loss: 0.1828
[Epoch 31] Val Loss: 0.4438


Epoch 32:   0%|          | 0/1459 [00:00<?, ?it/s]

[Epoch 32] Train Loss: 0.1776
[Epoch 32] Val Loss: 0.4426


Epoch 33:   0%|          | 0/1459 [00:00<?, ?it/s]

[Epoch 33] Train Loss: 0.1797
[Epoch 33] Val Loss: 0.4638


Epoch 34:   0%|          | 0/1459 [00:00<?, ?it/s]

[Epoch 34] Train Loss: 0.1797
[Epoch 34] Val Loss: 0.4330


Epoch 35:   0%|          | 0/1459 [00:00<?, ?it/s]

[Epoch 35] Train Loss: 0.1796
[Epoch 35] Val Loss: 0.4357


Epoch 36:   0%|          | 0/1459 [00:00<?, ?it/s]

[Epoch 36] Train Loss: 0.1764
[Epoch 36] Val Loss: 0.4451


Epoch 37:   0%|          | 0/1459 [00:00<?, ?it/s]

[Epoch 37] Train Loss: 0.1775
[Epoch 37] Val Loss: 0.4478


Epoch 38:   0%|          | 0/1459 [00:00<?, ?it/s]

[Epoch 38] Train Loss: 0.1815
[Epoch 38] Val Loss: 0.4633


Epoch 39:   0%|          | 0/1459 [00:00<?, ?it/s]

[Epoch 39] Train Loss: 0.1829
[Epoch 39] Val Loss: 0.4530


Epoch 40:   0%|          | 0/1459 [00:00<?, ?it/s]

[Epoch 40] Train Loss: 0.1763
[Epoch 40] Val Loss: 0.4288


Epoch 41:   0%|          | 0/1459 [00:00<?, ?it/s]

[Epoch 41] Train Loss: 0.1724
[Epoch 41] Val Loss: 0.4327


Epoch 42:   0%|          | 0/1459 [00:00<?, ?it/s]

[Epoch 42] Train Loss: 0.1741
[Epoch 42] Val Loss: 0.4283


Epoch 43:   0%|          | 0/1459 [00:00<?, ?it/s]

[Epoch 43] Train Loss: 0.1779
[Epoch 43] Val Loss: 0.4543


Epoch 44:   0%|          | 0/1459 [00:00<?, ?it/s]

[Epoch 44] Train Loss: 0.1731
[Epoch 44] Val Loss: 0.4386


Epoch 45:   0%|          | 0/1459 [00:00<?, ?it/s]

[Epoch 45] Train Loss: 0.1736
[Epoch 45] Val Loss: 0.4659


Epoch 46:   0%|          | 0/1459 [00:00<?, ?it/s]

[Epoch 46] Train Loss: 0.1809
[Epoch 46] Val Loss: 0.4452


Epoch 47:   0%|          | 0/1459 [00:00<?, ?it/s]

[Epoch 47] Train Loss: 0.1789
[Epoch 47] Val Loss: 0.4337


Epoch 48:   0%|          | 0/1459 [00:00<?, ?it/s]

[Epoch 48] Train Loss: 0.1783
[Epoch 48] Val Loss: 0.4230
✅ Best model saved.


Epoch 49:   0%|          | 0/1459 [00:00<?, ?it/s]

[Epoch 49] Train Loss: 0.1808
[Epoch 49] Val Loss: 0.4573


Epoch 50:   0%|          | 0/1459 [00:00<?, ?it/s]

[Epoch 50] Train Loss: 0.1776
[Epoch 50] Val Loss: 0.4368


In [6]:
def predict_image(image_path, model, configs, transform):
    model.eval()
    idx_to_char = {i: ch for i, ch in enumerate(configs.vocab)}
    blank_idx = len(configs.vocab)

    image = Image.open(image_path).convert("L")
    image = transform(image)
    image = image.unsqueeze(0).to(configs.device)  # (1, 1, H, W)

    with torch.no_grad():
        output = model(image)
        log_probs = F.log_softmax(output, dim=2)
        pred = torch.argmax(log_probs, dim=2)  # (T, 1)
        pred = pred.squeeze(1).tolist()

        decoded = []
        prev_token = None
        for t in pred:
            if t != blank_idx and t != prev_token:
                decoded.append(idx_to_char[t])
            prev_token = t
        return "".join(decoded)


In [12]:
# Rebuild and load model
loaded_model = CRNN(len(configs.vocab)).to(configs.device)
loaded_model.load_state_dict(torch.load(os.path.join(configs.model_path, "best_model.pth"), map_location=configs.device))
print("✅ Model loaded.")

# Predict an image
test_image_path = r"C:\Users\mh738\Downloads\images (2).png"  # Or path to your own image
prediction = predict_image(test_image_path, loaded_model, configs, transform)
print(f"📝 Predicted Text: {prediction}")


✅ Model loaded.
📝 Predicted Text: Family


In [29]:
# TorchScript export
example_input = torch.randn(1, 1, configs.height, 512).to(configs.device)
traced_model = torch.jit.trace(loaded_model, example_input)
torchscript_path = os.path.join(configs.model_path, "model_scripted.pt")
traced_model.save(torchscript_path)
print(f"TorchScript model saved at: {torchscript_path}")


TorchScript model saved at: Models/CRNN\202506090621\model_scripted.pt


In [35]:
# ONNX export
onnx_path = os.path.join(configs.model_path, "model.onnx")
torch.onnx.export(
    loaded_model, 
    example_input, 
    onnx_path,
    input_names=["input"], 
    output_names=["output"], 
    dynamic_axes={"input": {0: "batch_size", 3: "width"}},
    opset_version=11
)
print(f"ONNX model saved at: {onnx_path}")


ONNX model saved at: Models/CRNN\202506090621\model.onnx


In [42]:
import torch
from PIL import Image
import torchvision.transforms as transforms

# Load TorchScript model
model = torch.jit.load(r"Models/CRNN\202506090621\model_scripted.pt")

model.eval()

# Define the transform same as used in training
transform = transforms.Compose([
    transforms.Grayscale(1),
    transforms.Resize((64, 512)),  # Or ResizeKeepAspect if needed
    transforms.ToTensor()
])

def predict_image(image_path):
    image = Image.open(image_path).convert("L")
    image = transform(image).unsqueeze(0)
    with torch.no_grad():
        output = model(image)
    pred = output.argmax(dim=2).squeeze().tolist()
    return pred  # decode this to characters using your vocab


In [13]:
import tkinter as tk
from tkinter import filedialog, messagebox
from PIL import Image, ImageTk
import torch
import torchvision.transforms as transforms

# Load TorchScript model
model_path = r"Models/CRNN/202506090621/model_scripted.pt"
device = "cuda" if torch.cuda.is_available() else "cpu"

try:
    model = torch.jit.load(model_path, map_location=device)
    model.eval()
    print("✅ Model loaded")
except Exception as e:
    print("❌ Error loading model:", e)
    exit()

# Set vocabulary (same used during training)
vocab = " !\"#&'()*+,-./0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
idx_to_char = {i: ch for i, ch in enumerate(vocab)}
blank_idx = len(vocab)

# Transform
transform = transforms.Compose([
    transforms.Grayscale(1),
    transforms.Resize((64, 512)),
    transforms.ToTensor()
])

# Predict
def predict_image(path):
    image = Image.open(path).convert("L")
    image = transform(image).unsqueeze(0).to(device)
    with torch.no_grad():
        output = model(image)
        log_probs = torch.nn.functional.log_softmax(output, dim=2)
        pred = torch.argmax(log_probs, dim=2).squeeze().tolist()

        decoded = []
        prev = -1
        for i in pred:
            if i != blank_idx and i != prev:
                decoded.append(idx_to_char.get(i, ""))
            prev = i
        return "".join(decoded)

# GUI
class OCRApp:
    def __init__(self, root):
        self.root = root
        self.root.title("Handwriting OCR App")
        self.root.geometry("800x700")
        self.root.configure(bg="white")

        self.image_path = None
        self.predicted_text = ""

        self.title = tk.Label(root, text="🖋️ Handwriting OCR", font=("Arial", 28, "bold"), bg="white")
        self.title.pack(pady=20)

        self.canvas = tk.Label(root, bg="white")
        self.canvas.pack()

        self.upload_btn = tk.Button(root, text="📁 Upload Image", command=self.upload, font=("Arial", 14), width=20)
        self.upload_btn.pack(pady=10)

        self.predict_btn = tk.Button(root, text="🔍 Predict Text", command=self.predict, font=("Arial", 14), bg="#ccffcc", width=20)
        self.predict_btn.pack(pady=5)

        self.result = tk.Label(root, text="", font=("Courier New", 18), wraplength=700, justify="center", bg="white", fg="#222")
        self.result.pack(pady=20)

        self.export_btn = tk.Button(root, text="📤 Export to File", command=self.export_to_file, font=("Arial", 12), bg="#e0e0ff", width=18)
        self.export_btn.pack(pady=5)

        self.copy_btn = tk.Button(root, text="📋 Copy to Clipboard", command=self.copy_to_clipboard, font=("Arial", 12), bg="#ffd9b3", width=18)
        self.copy_btn.pack(pady=5)

    def upload(self):
        path = filedialog.askopenfilename(filetypes=[("Image files", "*.png *.jpg *.jpeg")])
        if path:
            self.image_path = path
            img = Image.open(path)
            img = img.resize((400, 100))
            self.tk_img = ImageTk.PhotoImage(img)
            self.canvas.configure(image=self.tk_img)
            self.result.config(text="")
            self.predicted_text = ""

    def predict(self):
        if not self.image_path:
            self.result.config(text="❌ Please upload an image first.")
            return

        try:
            prediction = predict_image(self.image_path)
            self.predicted_text = prediction
            self.result.config(text=f"📝 Predicted Text:\n\n{prediction}")
        except Exception as e:
            print("❌ Prediction Error:", e)
            self.result.config(text="❌ An error occurred. Check the console.")

    def export_to_file(self):
        if not self.predicted_text:
            messagebox.showinfo("Info", "No text to export.")
            return
        file_path = filedialog.asksaveasfilename(defaultextension=".txt", filetypes=[("Text Files", "*.txt")])
        if file_path:
            with open(file_path, "w", encoding="utf-8") as f:
                f.write(self.predicted_text)
            messagebox.showinfo("Success", "Text exported successfully!")

    def copy_to_clipboard(self):
        if not self.predicted_text:
            messagebox.showinfo("Info", "No text to copy.")
            return
        self.root.clipboard_clear()
        self.root.clipboard_append(self.predicted_text)
        self.root.update()
        messagebox.showinfo("Copied", "Text copied to clipboard!")

# Launch
if __name__ == "__main__":
    root = tk.Tk()
    app = OCRApp(root)
    root.mainloop()


✅ Model loaded
