In [4]:
import os
import random
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image, ImageDraw, ImageFont
import nltk
from nltk.corpus import words
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split

# Download nltk words
nltk.download("words")

# Constants
IMG_WIDTH, IMG_HEIGHT = 256, 64
FONT_SIZE = 32
BLANK_CHAR = "_"
MAX_WORD_LEN = 30
DATASET_SIZE = 10000
SEED = 42  # Same value as the random_state passed to train_test_split below
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed every random number generator this notebook relies on (word sampling,
# weight initialisation, DataLoader shuffling) so a fresh run is repeatable.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Helper function: Generate image with text
# The font path assumes a Linux system with the Liberation fonts installed.
_FONT_PATH = "/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf"
_MIN_FONT_SIZE = 8  # Smallest size tried when shrinking a long word to fit the canvas
_FONT_CACHE = {}  # (path, size) -> loaded font, so each font is read from disk once


def _load_font(font_path, size):
    """Return the TrueType font at `size`, loading it from disk only on first use."""
    key = (font_path, size)
    if key not in _FONT_CACHE:
        _FONT_CACHE[key] = ImageFont.truetype(font_path, size)
    return _FONT_CACHE[key]


def generate_image_with_text(text, font_path=_FONT_PATH):
    """Render `text` in black, centred on a white IMG_WIDTH x IMG_HEIGHT canvas.

    The text is drawn at FONT_SIZE when it fits horizontally; otherwise the
    font size is reduced one point at a time (down to _MIN_FONT_SIZE) until the
    word fits, so long words are not clipped at the image border.

    Args:
        text: The string to render.
        font_path: Path of the TrueType font file to use.

    Returns:
        The rendered greyscale ("L" mode) image converted with np.array.
    """
    img = Image.new("L", (IMG_WIDTH, IMG_HEIGHT), color=255)  # Plain white image
    draw = ImageDraw.Draw(img)

    # Always try FONT_SIZE first, even if it is below the configured minimum.
    smallest = min(FONT_SIZE, _MIN_FONT_SIZE)
    for size in range(FONT_SIZE, smallest - 1, -1):
        font = _load_font(font_path, size)
        left, top, right, bottom = draw.textbbox((0, 0), text, font=font)
        if right - left <= IMG_WIDTH:
            break

    # textbbox reports where the ink lands when drawing at (0, 0); its left/top
    # are generally non-zero, so subtract them to centre the ink itself rather
    # than the drawing origin.
    x = (IMG_WIDTH - (right - left)) // 2 - left
    y = (IMG_HEIGHT - (bottom - top)) // 2 - top
    draw.text((x, y), text, font=font, fill=0)  # Black text
    return np.array(img)

# Task 1: Dataset
def create_dataset():
    """Build an image/label dataset from randomly sampled corpus words.

    DATASET_SIZE words of at most MAX_WORD_LEN characters are drawn without
    replacement from the nltk word list. Each word is rendered with
    generate_image_with_text and its label is the word right-padded with
    BLANK_CHAR to MAX_WORD_LEN characters.

    Returns:
        (images, labels): images is an array reshaped to
        (-1, 1, IMG_HEIGHT, IMG_WIDTH) and divided by 255.0; labels is the
        list of padded words in the same order.
    """
    candidates = [w for w in words.words() if len(w) <= MAX_WORD_LEN]
    chosen = random.sample(candidates, DATASET_SIZE)

    rendered = [generate_image_with_text(w) for w in chosen]
    padded = [w.ljust(MAX_WORD_LEN, BLANK_CHAR) for w in chosen]  # Pad with blank char

    stacked = np.array(rendered).reshape(-1, 1, IMG_HEIGHT, IMG_WIDTH)
    return stacked / 255.0, padded  # Pixel values normalised
# Helper: Build a dataset whose words are sampled uniformly across starting letters (a-z)
def create_uniform_dataset():
    """Build an image/label dataset balanced by the first letter of each word.

    Corpus words of at most MAX_WORD_LEN characters are bucketed by their
    lower-cased first letter ('a'..'z'). DATASET_SIZE // 26 words are sampled
    from every bucket; a bucket with fewer words than that contributes all of
    its words. Each selected word is rendered with generate_image_with_text
    and labelled with the word right-padded with BLANK_CHAR to MAX_WORD_LEN.

    Returns:
        (images, labels): images is an array reshaped to
        (-1, 1, IMG_HEIGHT, IMG_WIDTH) and divided by 255.0; labels is the
        list of padded words in the same order.
    """
    # One bucket per starting letter, 'a' through 'z'
    buckets = {chr(code): [] for code in range(ord("a"), ord("z") + 1)}
    for candidate in words.words():
        if len(candidate) > MAX_WORD_LEN or not candidate[0].isalpha():
            continue  # Too long, or does not start with a letter
        bucket = buckets.get(candidate[0].lower())
        if bucket is not None:
            bucket.append(candidate)

    # Same quota for every starting letter
    quota = DATASET_SIZE // len(buckets)
    chosen = []
    for bucket in buckets.values():
        if len(bucket) < quota:
            chosen.extend(bucket)  # Not enough words: take them all
        else:
            chosen.extend(random.sample(bucket, quota))

    # Render every word and pad its label
    rendered = [generate_image_with_text(w) for w in chosen]
    padded = [w.ljust(MAX_WORD_LEN, BLANK_CHAR) for w in chosen]

    stacked = np.array(rendered).reshape(-1, 1, IMG_HEIGHT, IMG_WIDTH)
    return stacked / 255.0, padded  # Pixel values normalised

# Generate the uniformly distributed dataset
images, labels = create_uniform_dataset()

# images, labels = create_dataset()
train_imgs, val_imgs, train_labels, val_labels = train_test_split(images, labels, test_size=0.2, random_state=42)

# Helper: Create char-to-index and index-to-char mappings
# The characters are sorted so that every run assigns the same class index to
# the same character; iterating a bare set of strings gives an order that can
# change from one interpreter session to the next. BLANK_CHAR is added
# explicitly so looking it up in char_to_idx can never fail.
all_chars = sorted(set("".join(labels)) | {BLANK_CHAR})  # Unique characters, stable order
char_to_idx = {c: i for i, c in enumerate(all_chars)}
idx_to_char = {i: c for c, i in char_to_idx.items()}
num_classes = len(all_chars)

# Dataset Class for PyTorch
class OCRDataset(Dataset):
    """Torch dataset pairing images with character-index encoded labels.

    Images are stored as one float32 tensor. Every label string is converted
    character by character through the module-level char_to_idx mapping and
    all labels are stored together as one long tensor.
    """

    def __init__(self, images, labels):
        self.images = torch.tensor(images, dtype=torch.float32)

        encoded = []
        for text in labels:
            encoded.append([char_to_idx[ch] for ch in text])
        self.labels = torch.tensor(encoded, dtype=torch.long)

    def __len__(self):
        """Number of samples, taken from the image tensor."""
        return len(self.images)

    def __getitem__(self, idx):
        """Return the (image, encoded label) pair stored at position idx."""
        sample = self.images[idx]
        target = self.labels[idx]
        return sample, target

# Wrap both splits as datasets, then batch them; only the training split is shuffled.
train_dataset = OCRDataset(images=train_imgs, labels=train_labels)
val_dataset = OCRDataset(images=val_imgs, labels=val_labels)
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=32, shuffle=False)

# Task 2: Model Architecture
class CRNN(nn.Module):
    """CNN encoder followed by a bidirectional LSTM decoder.

    The encoder is two conv / ReLU / batch-norm / 2x2 max-pool stages whose
    output is flattened and projected to a 128-dimensional vector. That vector
    is repeated MAX_WORD_LEN times to form the LSTM input sequence, and a
    linear layer maps every LSTM output step to num_classes scores.

    The projection expects 64 * 16 * 64 flattened features, which corresponds
    to a 1 x 64 x 256 input after the two 2x2 poolings.
    """

    def __init__(self, num_classes):
        super().__init__()
        # Encoder (CNN); shape comments are for a 1 x 64 x 256 input
        encoder_layers = [
            nn.Conv2d(1, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.BatchNorm2d(32),
            nn.MaxPool2d(2, 2),  # -> (32, 32, 128)
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.BatchNorm2d(64),
            nn.MaxPool2d(2, 2),  # -> (64, 16, 64)
        ]
        self.cnn = nn.Sequential(*encoder_layers)
        self.fc = nn.Linear(64 * 16 * 64, 128)

        # Decoder (RNN)
        self.rnn = nn.LSTM(
            input_size=128,
            hidden_size=128,
            num_layers=2,
            batch_first=True,
            bidirectional=True,
        )
        self.output_layer = nn.Linear(256, num_classes)  # 128 * 2 directions

    def forward(self, x):
        """Return scores of shape (batch_size, MAX_WORD_LEN, num_classes)."""
        features = self.cnn(x)
        embedding = self.fc(features.view(x.size(0), -1))  # Flatten, then project
        # The same embedding is fed to the LSTM at every time step
        sequence = embedding.unsqueeze(1).repeat(1, MAX_WORD_LEN, 1)
        hidden, _ = self.rnn(sequence)
        return self.output_layer(hidden)

# Instantiate the network and move its parameters to the selected device.
model = CRNN(num_classes=num_classes).to(DEVICE)

# Task 3: Training
def train_model(model, train_loader, val_loader, num_epochs=10):
    """Train `model` with Adam (lr=0.001) and cross-entropy, reporting after every epoch.

    After each epoch the function prints the average training and validation
    loss, the fraction of correctly predicted label positions on the
    validation set, the result of evaluate_random_baseline on the same loader,
    and up to five sample predictions from the first validation batch.

    The accuracy counts every position of the label tensor, so any padding
    positions contained in the labels are included in it.

    Relies on module-level names: DEVICE, MAX_WORD_LEN, BLANK_CHAR,
    char_to_idx, idx_to_char and evaluate_random_baseline. The last one is not
    defined in this cell, so it must already exist when this function is called.

    Args:
        model: Network returning scores of shape (batch, time, classes).
        train_loader: Iterable of (images, labels) training batches.
        val_loader: Iterable of (images, labels) validation batches.
        num_epochs: Number of passes over train_loader.

    Returns:
        None. Results are printed.
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    def sequence_loss(outputs, labels):
        # Every time step holds the same number of samples, so one mean over all
        # (sample, step) pairs equals the average of the per-step mean losses.
        scores = outputs[:, :MAX_WORD_LEN, :]
        targets = labels[:, :MAX_WORD_LEN]
        return criterion(scores.reshape(-1, scores.size(-1)), targets.reshape(-1))

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0
        for imgs, labels in train_loader:
            imgs, labels = imgs.to(DEVICE), labels.to(DEVICE)

            optimizer.zero_grad()
            loss = sequence_loss(model(imgs), labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        val_loss, correct_chars, total_chars = 0, 0, 0
        model.eval()
        with torch.no_grad():
            for imgs, labels in val_loader:
                imgs, labels = imgs.to(DEVICE), labels.to(DEVICE)
                outputs = model(imgs)
                val_loss += sequence_loss(outputs, labels).item()

                preds = torch.argmax(outputs, dim=2)
                correct_chars += (preds == labels).sum().item()
                total_chars += labels.numel()

        avg_correct_chars = correct_chars / total_chars

        random_baseline_avg_correct = evaluate_random_baseline(val_loader, char_to_idx)

        # Print Model Performance and Random Baseline Comparison
        print(f"Epoch {epoch + 1}/{num_epochs} | Model Avg Correct Chars: {avg_correct_chars:.4f} | "
          f"Random Baseline Avg Correct Chars: {random_baseline_avg_correct:.4f}")
        print(f"Epoch {epoch + 1}/{num_epochs} | Train Loss: {train_loss / len(train_loader):.4f} | "
              f"Val Loss: {val_loss / len(val_loader):.4f} | Avg Correct Chars: {avg_correct_chars:.4f}")

        # Display a few predictions (inference only, so no autograd graph is needed)
        sample_imgs, sample_labels = next(iter(val_loader))
        sample_imgs, sample_labels = sample_imgs.to(DEVICE), sample_labels.to(DEVICE)
        with torch.no_grad():
            outputs = model(sample_imgs)
        preds = torch.argmax(outputs, dim=2).cpu().numpy()
        # The first batch may hold fewer than five samples
        for i in range(min(5, len(preds))):
            pred_text = "".join(idx_to_char[idx] for idx in preds[i] if idx != char_to_idx[BLANK_CHAR])
            true_text = "".join(idx_to_char[idx] for idx in sample_labels[i].cpu().numpy())
            print(f"Prediction: {pred_text} | Ground Truth: {true_text}")

# Train the baseline model for ten epochs.
train_model(model=model, train_loader=train_loader, val_loader=val_loader, num_epochs=10)


[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


Epoch 1/10 | Train Loss: 1.1461 | Val Loss: 0.9585 | Avg Correct Chars: 0.7209
Prediction: nooooooiiiiii | Ground Truth: malcontentment________________
Prediction: eeaiiii | Ground Truth: weeviled______________________
Prediction: eeeeiiii | Ground Truth: enterozoa_____________________
Prediction: eeee | Ground Truth: xylitol_______________________
Prediction: eeeeiiii | Ground Truth: quindecad_____________________
Epoch 2/10 | Train Loss: 0.9572 | Val Loss: 0.9515 | Avg Correct Chars: 0.7190
Prediction: eeooooooooiie | Ground Truth: malcontentment________________
Prediction: eeeeee | Ground Truth: weeviled______________________
Prediction: eeeeeee | Ground Truth: enterozoa_____________________
Prediction: eeeee | Ground Truth: xylitol_______________________
Prediction: eeeoeeee | Ground Truth: quindecad_____________________
Epoch 3/10 | Train Loss: 0.9379 | Val Loss: 0.9355 | Avg Correct Chars: 0.7222
Prediction: oeerooooiiiiie | Ground Truth: malcontentment________________
Prediction

In [1]:

def evaluate_random_baseline(val_loader, char_to_idx):
    """Measure how often uniformly random character guesses match the labels.

    For every batch a random class index in [0, len(char_to_idx)) is drawn for
    each label position and compared with the true label.

    Args:
        val_loader: Iterable yielding (images, labels) batches; only the
            labels are used.
        char_to_idx: Character-to-index mapping; only its size is used.

    Returns:
        Fraction of label positions guessed correctly, or 0.0 when the loader
        yields no labels. The value is also printed.
    """
    total_correct = 0
    total_chars = 0
    num_symbols = len(char_to_idx)

    for _, labels in val_loader:
        # Match the labels' own shape and device instead of assuming
        # (batch_size, MAX_WORD_LEN) on the CPU.
        random_preds = torch.randint(num_symbols, labels.shape, device=labels.device)

        total_correct += (random_preds == labels).sum().item()
        total_chars += labels.numel()

    # Avoid dividing by zero when the loader is empty
    avg_correct_chars = total_correct / total_chars if total_chars else 0.0
    print(f"Random Baseline | Avg Correct Chars: {avg_correct_chars:.4f}")
    return avg_correct_chars


[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


**Experimenting with Hyperparameters**

In [2]:
# With Dropout and Normalisation

class CRNN(nn.Module):
    """CNN encoder plus bidirectional LSTM decoder with dropout regularisation.

    The encoder is two conv / ReLU / batch-norm / 2x2 max-pool / channel-dropout
    stages whose output is flattened and projected to a 128-dimensional vector.
    That vector is repeated MAX_WORD_LEN times to form the LSTM input sequence,
    and a linear layer maps every LSTM output step to num_classes scores.

    The projection expects 64 * 16 * 64 flattened features, which corresponds
    to a 1 x 64 x 256 input after the two 2x2 poolings.

    Args:
        num_classes: Number of output classes per time step.
        hidden_size: Hidden size of each LSTM direction.
        num_layers: Number of stacked LSTM layers.
        dropout_rate: Dropout probability used in the encoder and, when
            num_layers > 1, between LSTM layers.
    """

    def __init__(self, num_classes, hidden_size=128, num_layers=2, dropout_rate=0.3):
        super().__init__()
        # Encoder (CNN); shape comments are for a 1 x 64 x 256 input
        self.cnn = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.BatchNorm2d(32),
            nn.MaxPool2d(2, 2),  # (32, 32, 128)
            nn.Dropout2d(p=dropout_rate),

            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.BatchNorm2d(64),
            nn.MaxPool2d(2, 2),  # (64, 16, 64)
            nn.Dropout2d(p=dropout_rate),
        )
        self.fc = nn.Linear(64 * 16 * 64, 128)

        # Decoder (RNN)
        # nn.LSTM applies dropout only between stacked layers, so with a single
        # layer it has no effect and PyTorch emits a warning; pass 0 in that case.
        self.rnn = nn.LSTM(
            input_size=128,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
            dropout=dropout_rate if num_layers > 1 else 0.0,
        )
        self.output_layer = nn.Linear(2 * hidden_size, num_classes)  # 2 directions * hidden_size

    def forward(self, x):
        """Return scores of shape (batch_size, MAX_WORD_LEN, num_classes)."""
        batch_size = x.size(0)
        x = self.cnn(x)
        x = x.view(batch_size, -1)  # Flatten
        x = self.fc(x)
        x = x.unsqueeze(1).repeat(1, MAX_WORD_LEN, 1)  # Repeat for sequence length

        x, _ = self.rnn(x)
        x = self.output_layer(x)  # (batch_size, MAX_WORD_LEN, num_classes)
        return x


In [3]:
def train_model(model, train_loader, val_loader, num_epochs=10, learning_rate=0.001, weight_decay=1e-5):
    """Train `model` with Adam and cross-entropy, reporting after every epoch.

    After each epoch the function prints the average training and validation
    loss, the fraction of correctly predicted label positions on the
    validation set, and up to five sample predictions from the first
    validation batch.

    The accuracy counts every position of the label tensor, so any padding
    positions contained in the labels are included in it.

    Relies on module-level names: DEVICE, MAX_WORD_LEN, BLANK_CHAR,
    char_to_idx and idx_to_char.

    Args:
        model: Network returning scores of shape (batch, time, classes).
        train_loader: Iterable of (images, labels) training batches.
        val_loader: Iterable of (images, labels) validation batches.
        num_epochs: Number of passes over train_loader.
        learning_rate: Adam learning rate.
        weight_decay: Adam weight decay (L2 penalty).

    Returns:
        None. Results are printed.
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

    def sequence_loss(outputs, labels):
        # Every time step holds the same number of samples, so one mean over all
        # (sample, step) pairs equals the average of the per-step mean losses.
        scores = outputs[:, :MAX_WORD_LEN, :]
        targets = labels[:, :MAX_WORD_LEN]
        return criterion(scores.reshape(-1, scores.size(-1)), targets.reshape(-1))

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0
        for imgs, labels in train_loader:
            imgs, labels = imgs.to(DEVICE), labels.to(DEVICE)

            optimizer.zero_grad()
            loss = sequence_loss(model(imgs), labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        val_loss, correct_chars, total_chars = 0, 0, 0
        model.eval()
        with torch.no_grad():
            for imgs, labels in val_loader:
                imgs, labels = imgs.to(DEVICE), labels.to(DEVICE)
                outputs = model(imgs)
                val_loss += sequence_loss(outputs, labels).item()

                preds = torch.argmax(outputs, dim=2)
                correct_chars += (preds == labels).sum().item()
                total_chars += labels.numel()

        avg_correct_chars = correct_chars / total_chars
        print(f"Epoch {epoch + 1}/{num_epochs} | Train Loss: {train_loss / len(train_loader):.4f} | "
              f"Val Loss: {val_loss / len(val_loader):.4f} | Avg Correct Chars: {avg_correct_chars:.4f}")

        # Display a few predictions (inference only, so no autograd graph is needed)
        sample_imgs, sample_labels = next(iter(val_loader))
        sample_imgs, sample_labels = sample_imgs.to(DEVICE), sample_labels.to(DEVICE)
        with torch.no_grad():
            outputs = model(sample_imgs)
        preds = torch.argmax(outputs, dim=2).cpu().numpy()
        # The first batch may hold fewer than five samples
        for i in range(min(5, len(preds))):
            pred_text = "".join(idx_to_char[idx] for idx in preds[i] if idx != char_to_idx[BLANK_CHAR])
            true_text = "".join(idx_to_char[idx] for idx in sample_labels[i].cpu().numpy())
            print(f"Prediction: {pred_text} | Ground Truth: {true_text}")


In [5]:
# Hyperparameter experiments.
# Every run starts from a freshly initialised model so that its result is not
# influenced by weights trained during a previous experiment. Each entry changes
# one setting and leaves the rest at the CRNN / train_model defaults.
# Entry layout: (banner printed before training or None, CRNN kwargs, train_model kwargs)
experiments = [
    # Experiment with Learning Rate:
    (None, {}, {"learning_rate": 0.01}),  # Higher LR
    ('Now with a lower Learning Rate', {}, {"learning_rate": 0.0001}),  # Lower LR
    # Experiment with Dropout:
    ('Now with a Higher Dropout', {"hidden_size": 128, "num_layers": 2, "dropout_rate": 0.5}, {}),  # Higher dropout
    # Experiment with LSTM Hidden Size:
    ('Now with changing LSTM Hidden Size', {"hidden_size": 256, "num_layers": 2, "dropout_rate": 0.3}, {}),
    # Experiment with Weight Decay:
    ('Now with stronger Regularisation', {}, {"weight_decay": 1e-4}),  # Stronger regularization
]

for banner, model_kwargs, train_kwargs in experiments:
    if banner is not None:
        print(banner)
    model = CRNN(num_classes, **model_kwargs).to(DEVICE)
    train_model(model, train_loader, val_loader, num_epochs=10, **train_kwargs)

Epoch 1/10 | Train Loss: 1.1471 | Val Loss: 1.1058 | Avg Correct Chars: 0.7078
Prediction: poooo | Ground Truth: myodynamic____________________
Prediction: poooo | Ground Truth: wresting______________________
Prediction: poooo | Ground Truth: emydosaurian__________________
Prediction: poooo | Ground Truth: xarque________________________
Prediction: poooo | Ground Truth: querimoniousness______________
Epoch 2/10 | Train Loss: 1.0693 | Val Loss: 1.0452 | Avg Correct Chars: 0.7114
Prediction: ueneoo | Ground Truth: myodynamic____________________
Prediction: uentoo | Ground Truth: wresting______________________
Prediction: ueneoo | Ground Truth: emydosaurian__________________
Prediction: uetooo | Ground Truth: xarque________________________
Prediction: ueneoo | Ground Truth: querimoniousness______________
Epoch 3/10 | Train Loss: 1.0198 | Val Loss: 0.9490 | Avg Correct Chars: 0.7234
Prediction: nanoooiiiii | Ground Truth: myodynamic____________________
Prediction: lantiii | Ground Truth: w