In [1]:
import wandb
wandb.login(key = "eb9574fa5b11da36782604ea27df8bf1989ddefd")

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmegh_m[0m ([33mmegh_m-iit-madras[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [2]:
!pip install jiwer

Collecting jiwer
  Downloading jiwer-3.1.0-py3-none-any.whl.metadata (2.6 kB)
Collecting rapidfuzz>=3.9.7 (from jiwer)
  Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading jiwer-3.1.0-py3-none-any.whl (22 kB)
Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m26.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: rapidfuzz, jiwer
Successfully installed jiwer-3.1.0 rapidfuzz-3.13.0


In [2]:
import os
os.environ["WANDB_SILENT"] = "false"
os.environ["WANDB_START_METHOD"] = "thread"
os.environ["WANDB_API_KEY"] = "eb9574fa5b11da36782604ea27df8bf1989ddefd"

In [1]:
import numpy as np
import torch
import torch.nn as nn
import random
import torch.nn.functional as F

# Utils

In [2]:
class CharEmbed(nn.Module):
    def __init__(self, input_dim, embed_dim):
        super(CharEmbed, self).__init__()
        self.embed = nn.Embedding(input_dim, embed_dim)
    
    def forward(self, input_seq):
        return self.embed(input_seq)

class EncoderRNN(nn.Module):
    def __init__(self, input_dim, embed_dim, hidden_dim, n_layers=1, 
                 cell_type='GRU', dropout=0.0, bidirectional=False):
        super(EncoderRNN, self).__init__()
        self.embed = nn.Embedding(input_dim, embed_dim)
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.cell_type = cell_type
        self.bidirectional = bidirectional #to allow forward and backward time step data processing
        # Cell type options GRU, LSTM & vanilla RNN
        if cell_type == 'GRU':
            self.rnn = nn.GRU(embed_dim, hidden_dim, n_layers, dropout=dropout if n_layers > 1 else 0, bidirectional=bidirectional)
        elif cell_type == 'LSTM':
            self.rnn = nn.LSTM(embed_dim, hidden_dim, n_layers, dropout=dropout if n_layers > 1 else 0, bidirectional=bidirectional)
        else: 
            self.rnn = nn.RNN(embed_dim, hidden_dim, n_layers, dropout=dropout if n_layers > 1 else 0, bidirectional=bidirectional)
    
    def forward(self, input_seq, input_lengths, hidden=None):
        # Sort sequences by length
        input_lengths, sort_idx = torch.sort(input_lengths, descending=True)
        input_seq = input_seq[:, sort_idx]  # (seq_len, batch_size, ...)
        
        # Convert to embeddings
        embedded = self.embed(input_seq)
        
        # Pack with enforce_sorted=False
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, 
            input_lengths.cpu(), 
            enforce_sorted=False
        )
        
        # Forward pass
        outputs, hidden = self.rnn(packed, hidden)
        
        # Unpack padding
        outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs)
        
        # Restore original order
        _, unsort_idx = torch.sort(sort_idx)
        outputs = outputs[:, unsort_idx]
        
        # Handle LSTM hidden/cell states
        if isinstance(hidden, tuple):
            hidden = (
                hidden[0][:, unsort_idx],  # Hidden state
                hidden[1][:, unsort_idx]   # Cell state
            )
        else:  # For GRU/RNN
            hidden = hidden[:, unsort_idx]
        
        return outputs, hidden

class DecoderRNN(nn.Module): #Basically similar to the encoder, will have a softmax to predict next char
    def __init__(self, output_dim, embed_dim, hidden_dim, vocab, n_layers=1, cell_type='GRU', dropout=0.0, go_idx=1, stop_idx=2):
        super(DecoderRNN, self).__init__()
        self.embed = nn.Embedding(output_dim, embed_dim)
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.n_layers = n_layers
        self.cell_type = cell_type
        self.go_idx = go_idx
        self.stop_idx = stop_idx
        self.vocab = vocab
        if cell_type == 'GRU':
            self.rnn = nn.GRU(embed_dim, hidden_dim, n_layers, dropout=dropout if n_layers > 1 else 0)
        elif cell_type == 'LSTM':
            self.rnn = nn.LSTM(embed_dim, hidden_dim, n_layers, dropout=dropout if n_layers > 1 else 0)
        else:
            self.rnn = nn.RNN(embed_dim, hidden_dim, n_layers, dropout=dropout if n_layers > 1 else 0)
        
        self.out = nn.Linear(hidden_dim, output_dim)
        self.softmax = nn.LogSoftmax(dim=1)
        
    def forward(self, input, hidden):
        # Get embedding of current input character
        embedded = self.embed(input).unsqueeze(0)
        
        # Forward pass through decoder
        output, hidden = self.rnn(embedded, hidden)
        
        # Predict next character probabilities
        output = self.softmax(self.out(output.squeeze(0)))
        
        return output, hidden


In [3]:
class Seq2Seq(nn.Module): #Flexible enough to use different encoders other than the ones we define
    def __init__(self, input_dim, output_dim, embed_dim, hidden_dim, num_layers, cell_type, dropout, device,vocab, go_idx=1, stop_idx=2):
        #super().__init__()
        super(Seq2Seq, self).__init__()
        self.device = device
        self.go_idx = go_idx
        self.stop_idx = stop_idx
        self.vocab = vocab
            # Internal encoder creation
        self.encoder = EncoderRNN(
            input_dim=input_dim,
            embed_dim=embed_dim,
            hidden_dim=hidden_dim,
            n_layers=num_layers,
            cell_type=cell_type,
            dropout=dropout
        )
        
        # Internal decoder creation
        self.decoder = DecoderRNN(
            output_dim=output_dim,
            embed_dim=embed_dim,
            hidden_dim=hidden_dim,
            n_layers=num_layers,
            cell_type=cell_type,
            dropout=dropout,
            vocab = vocab,
            go_idx = go_idx,
            stop_idx = stop_idx
        )
        self.device = device 
    def forward(self, src, src_len, trg, teacher_forcing_ratio=0.5):
        batch_size = src.shape[1]
        trg_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        
        # Tensor to store decoder outputs
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)
        
        # Last hidden state of the encoder
        encoder_outputs, hidden = self.encoder(src, src_len)
        
        # First input to the decoder is the <go> token
        input = trg[0,:]
        
        for t in range(1, trg_len):
            output, hidden = self.decoder(input, hidden)
            outputs[t] = output
            
            # To Decide if we're going to use teacher forcing or not as needed
            teacher_force = random.random() < teacher_forcing_ratio
            
            # Get the highest predicted token from our predictions
            top1 = output.argmax(1)
            
            # If we use teacher forcing, we have to use actual next token as next input
            # If not, use predicted token
            input = trg[t] if teacher_force else top1
        
        return outputs
        
    def beam_search(self, src, src_len, beam_width=5, max_len=50):
        """Batch-friendly beam search implementation"""
        self.eval()
        batch_size = src.size(1)
        
        # Initialize beams with GO token
        beams = torch.full((batch_size * beam_width, max_len), 
                          self.decoder.stop_idx, 
                          device=self.device)
        beams[:, 0] = self.decoder.go_idx
        beam_scores = torch.zeros(batch_size * beam_width, device=self.device)
        
        # Encode source sequence
        encoder_outputs, hidden = self.encoder(src, src_len)
        
        # Expand hidden states
        if isinstance(hidden, tuple):  # LSTM
            hidden = (
                hidden[0].repeat(1, beam_width, 1),
                hidden[1].repeat(1, beam_width, 1)
            )
        else:  # GRU/RNN
            hidden = hidden.repeat(1, beam_width, 1)
        
        for step in range(max_len-1):
            decoder_input = beams[:, step]
            output, hidden = self.decoder.forward(decoder_input, hidden)
            
            log_probs = F.log_softmax(output, dim=1)
            topk_log_probs, topk_indices = torch.topk(log_probs, beam_width, dim=1)
            
            # Reshape scores
            if step == 0:
                # First step: (batch, beam) -> (batch, beam*beam)
                expanded_scores = topk_log_probs.view(batch_size, -1)
            else:
                # Subsequent steps: (batch, beam, beam) -> (batch, beam*beam)
                expanded_scores = beam_scores.view(batch_size, beam_width, 1) + \
                                 topk_log_probs.view(batch_size, beam_width, beam_width)
                expanded_scores = expanded_scores.view(batch_size, -1)
            
            # Select top candidates
            top_scores, top_indices = torch.topk(expanded_scores, beam_width, dim=1)
            
            # Calculate beam/token origins
            beam_indices = top_indices // beam_width
            token_indices = top_indices % beam_width
            
            # Update beams with CORRECT indices
            beams = beams.view(batch_size, beam_width, -1)
            beams = torch.cat([
                beams[torch.arange(batch_size)[:, None], beam_indices],
                token_indices.unsqueeze(-1)  # Correct index usage
            ], dim=-1)
            beams = beams.view(batch_size * beam_width, -1)
            
            # Update scores and hidden states
            beam_scores = top_scores.view(-1)
            if isinstance(hidden, tuple):
                hidden = (
                    hidden[0][:, beam_indices.view(-1), :].contiguous(),
                    hidden[1][:, beam_indices.view(-1), :].contiguous()
                )
            else:
                hidden = hidden[:, beam_indices.view(-1), :].contiguous()
            
            # Early stopping check
            current_tokens = beams[:, step+1]
            if (current_tokens == self.decoder.stop_idx).all():
                break
    
        return self._process_beams(beams.view(batch_size, beam_width, -1))

    def _process_beams(self, beams_tensor):
        """
        Converts beam search output tensor into cleaned token sequences.
        
        Args:
            beams_tensor: Tensor of shape (batch_size, beam_width, max_len)
            
        Returns:
            List[List[List[str]]]: For each batch item, a list of beam sequences.
        """
        batch_size, beam_width, max_len = beams_tensor.size()
        processed_beams = []
        
        for batch_idx in range(batch_size):
            batch_sequences = []
            for beam_idx in range(beam_width):
                # Extract token indices for this beam
                indices = beams_tensor[batch_idx, beam_idx].tolist()
                
                # Remove <go> (go_idx) at the start if present
                if indices[0] == self.decoder.go_idx:
                    indices = indices[1:]  # Remove first element
                
                # Truncate at first <stop> (stop_idx)
                try:
                    stop_pos = indices.index(self.decoder.stop_idx)
                    indices = indices[:stop_pos]  # Exclude <stop>
                except ValueError:
                    pass  # No <stop> found, use all tokens
                
                # Remove padding (assuming pad_idx = 0)
                cleaned_indices = [idx for idx in indices if idx not in [0, self.decoder.go_idx, self.decoder.stop_idx]]
                
                # Convert indices to tokens
                tokens = [self.decoder.vocab.idx2char[idx] for idx in cleaned_indices]
                batch_sequences.append(tokens)
            
            processed_beams.append(batch_sequences)
        
        return processed_beams



# Dataset Loading & Preprocessing

In [4]:
import os
import tarfile
import requests
import pandas as pd
from io import BytesIO
from collections import defaultdict
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import wandb

# Dataset Configuration
DATASET_URL = "https://storage.googleapis.com/gresearch/dakshina/dakshina_dataset_v1.0.tar"
DATA_DIR = "./dakshina_dataset"
HI_LEXICON_DIR = os.path.join(DATA_DIR,"dakshina_dataset_v1.0", "hi", "lexicons") #For Hindi (Chosen Language)

def download_and_extract_dataset(): #Scripted Dataset Download
    if not os.path.exists(DATA_DIR):
        print("Downloading dataset...")
        response = requests.get(DATASET_URL)
        file = tarfile.open(fileobj=BytesIO(response.content))
        file.extractall(DATA_DIR)
        print("Dataset extracted successfully")

class TransliterationVocabulary: #Build Character Vocab and add go,stop, padding and unknown tokens
    def __init__(self):
        self.char2idx = defaultdict(lambda: len(self.char2idx))
        self.idx2char = {}
        self.special_tokens = ['<pad>', '<go>', '<stop>', '<unk>']
        
        # Initialize special tokens
        for token in self.special_tokens:
            self.char2idx[token]
        
        self.idx2char = {v: k for k, v in self.char2idx.items()}
    
    def add_word(self, word):
        #print(word) #for debugging
        for char in word:
            self.char2idx[char]
        self.idx2char = {v: k for k, v in self.char2idx.items()}

class TransliterationDataset(Dataset): #Dataset loader for Hindi
    def __init__(self, split='train'):
        self.split = split
        self.data = self._load_data()
        self.src_vocab = TransliterationVocabulary()
        self.trg_vocab = TransliterationVocabulary()
        
        # Build vocabularies
        for src,trg in self.data:
            self.src_vocab.add_word(src)
            self.trg_vocab.add_word(trg)
    
    def _load_data(self):
        """Load data from TSV files and filter non-string entries"""
        file_map = {
            'train': 'hi.translit.sampled.train.tsv',
            'dev': 'hi.translit.sampled.dev.tsv',
            'test': 'hi.translit.sampled.test.tsv'
        }
        
        df = pd.read_csv(
            os.path.join(HI_LEXICON_DIR, file_map[self.split]),
            sep='\t', 
            header=None,
            names=['devanagari', 'latin', 'count'],
            dtype={'latin': str, 'devanagari': str, 'count':int}  # Force string type
        )
        
        # Filter out non-string entries and empty strings
        valid_entries = [
            (latin, devanagari) 
            for latin, devanagari in zip(df['latin'], df['devanagari'])
            if (isinstance(latin, str) and 
                isinstance(devanagari, str) and
                len(latin) > 0 and 
                len(devanagari) > 0)
        ]
        
        return valid_entries

    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        src, trg = self.data[idx]
        return (
            [self.src_vocab.char2idx['<go>']] + 
            [self.src_vocab.char2idx[c] for c in src if c not in ['<go>','<stop>','<pad>','<unk>']] +
            [self.src_vocab.char2idx['<stop>']],
            [self.trg_vocab.char2idx['<go>']] + 
            [self.trg_vocab.char2idx[c] for c in trg if c not in ['<go>','<stop>','<pad>','<unk>']] +
            [self.trg_vocab.char2idx['<stop>']]
        )

def collate_fn(batch): #Padding and Masking
    src_batch, trg_batch = zip(*batch)
    
    src_lens = torch.tensor([len(x) for x in src_batch])
    trg_lens = torch.tensor([len(x) for x in trg_batch])
    
    src_pad = pad_sequence(
        [torch.tensor(x) for x in src_batch],
        padding_value=0  # <pad> token index
    )
    
    trg_pad = pad_sequence(
        [torch.tensor(x) for x in trg_batch],
        padding_value=0  # <pad> token index
    )
    
    return src_pad, trg_pad, src_lens, trg_lens

def get_dataloaders(batch_size=64):
    """Create train, dev, test dataloaders"""
    download_and_extract_dataset()
    
    train_dataset = TransliterationDataset('train')
    dev_dataset = TransliterationDataset('dev')
    test_dataset = TransliterationDataset('test')
    
    return (
        DataLoader(train_dataset, batch_size=batch_size, 
                  shuffle=True, collate_fn=collate_fn),
        DataLoader(dev_dataset, batch_size=batch_size, 
                 collate_fn=collate_fn),
        DataLoader(test_dataset, batch_size=batch_size,
                 collate_fn=collate_fn),
        train_dataset.src_vocab,
        train_dataset.trg_vocab
    )


In [42]:

df = pd.read_csv(
            os.path.join(HI_LEXICON_DIR,'hi.translit.sampled.train.tsv'),
            sep='\t',  #specifying seperator
            header=None,
            names=['devanagari','latin','syllables']
        )
list(zip(df['latin'], df['devanagari']))

44204

In [7]:
import torch
from jiwer import cer, wer
from jiwer import visualize_alignment

class TransliterationMetrics:
    @staticmethod
    def preprocess_sequence(indices, vocab, remove_special=True):
        """Convert index tensor to cleaned character sequence"""
        chars = []
        for idx in indices:
            char = vocab.idx2char[idx]
            if remove_special and char in ['<go>', '<stop>', '<pad>','<unk>']:
                continue
            chars.append(char)
        return ''.join(chars)

def evaluate_cer(model, loader, device, beam_width=5):
    """Calculate Character Error Rate"""
    model.eval()
    total_cer = 0.0
    total = 0
    
    with torch.no_grad():
        for src, trg, src_lens, trg_lens in loader:
            src = src.to(device)
            
            # Get beam search predictions
            beam_outputs = model.beam_search(src, src_lens, beam_width)
            
            # Process batch
            for i in range(src.size(1)):
                # Get target sequence
                target_indices = trg[1:trg_lens[i]-1, i].cpu().tolist()
                target_str = TransliterationMetrics.preprocess_sequence(target_indices, model.decoder.vocab)
                
                # Get top prediction
                pred_indices = beam_outputs[i][0]
                pred_str = TransliterationMetrics.preprocess_sequence(pred_indices, model.decoder.vocab)
                
                # Calculate CER
                if target_str:  # Handle empty targets
                    total_cer += cer(target_str, pred_str)
                    total += 1
                else:
                    total_cer += 1.0  # Penalize completely wrong predictions
                    total += 1
                
    return total_cer / total if total > 0 else 0

def evaluate_wer(model, loader, device, beam_width=5):
    """Calculate Word Error Rate (for reference)"""
    model.eval()
    total_wer = 0.0
    total = 0
    
    with torch.no_grad():
        for src, trg, src_lens, trg_lens in loader:
            src = src.to(device)
            
            beam_outputs = model.beam_search(src, src_lens, beam_width)
            
            for i in range(src.size(1)):
                target_indices = trg[1:trg_lens[i]-1, i].cpu().tolist()
                target_str = TransliterationMetrics.preprocess_sequence(target_indices, model.decoder.vocab)
                
                pred_indices = beam_outputs[i][0]
                pred_str = TransliterationMetrics.preprocess_sequence(pred_indices, model.decoder.vocab)
                
                if target_str:
                    total_wer += wer(target_str, pred_str)
                    total += 1
                else:
                    total_wer += 1.0
                    total += 1
                
    return total_wer / total if total > 0 else 0

ModuleNotFoundError: No module named 'jiwer'

# Train Sweep

In [8]:
sweep_config = {
    'method': 'bayes',
    'metric': {'name': 'val_char_err', 'goal': 'minimize'},
    'parameters': {
        'embedding_size': {'values': [64, 128, 256]},
        'hidden_size': {'values': [128, 256, 512]},
        'num_layers': {'values': [1, 2, 3]},
        'cell_type': {'values': ['LSTM', 'GRU', 'RNN']},
        'dropout': {'values': [0.2, 0.3]},
        'learning_rate': {'values': [0.001, 0.0005, 0.0001]},
        'batch_size': {'values': [32, 64, 128]},
        'bidirectional':{'values':[True, False]}
    }
}

In [5]:
def train(config=None):
    with wandb.init(project="DA6401_A3",settings=wandb.Settings(start_method="thread",_disable_stats=True), config = config) as run:
        config = run.config
        
        # Get dataloaders and vocabularies
        train_loader, dev_loader, _, src_vocab, trg_vocab = get_dataloaders(
            batch_size=config.batch_size
        )
        go_idx = trg_vocab.char2idx['<go>']
        stop_idx = trg_vocab.char2idx['stop']
        # Initialize model
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        
        model = Seq2Seq(
            input_dim=len(src_vocab.char2idx),
            output_dim=len(trg_vocab.char2idx),
            embed_dim=config.embedding_size,
            hidden_dim=config.hidden_size,
            num_layers=config.num_layers,
            cell_type=config.cell_type,
            dropout=config.dropout,
            device=device,
            go_idx = go_idx,
            stop_idx = stop_idx,
            vocab = trg_vocab
        ).to(device)
        
        # Training setup
        optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
        criterion = torch.nn.CrossEntropyLoss(ignore_index=0)
        
        # Training loop
        for epoch in range(15):  # Fixed epoch count for sweep
            model.train()
            total_loss = 0
            
            for src, trg, src_lens, trg_lens in train_loader:
                src = src.to(device)
                trg = trg.to(device)
                
                optimizer.zero_grad()
                output = model(src, src_lens, trg)
                
                # Calculate loss
                output_dim = output.shape[-1]
                output = output[1:].view(-1, output_dim)
                trg = trg[1:].view(-1)
                
                loss = criterion(output, trg)
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
                optimizer.step()
                
                total_loss += loss.item()
            file_path = os.path.join(wandb.run.dir, "model.pth")
            torch.save(model.state_dict(), file_path)
            wandb.save('model.pth')
            # Validation
            #val_cer = evaluate_cer(model, dev_loader, device)
            #val_wer = evaluate_wer(model, dev_loader, device)
            val_acc = evaluate(model, dev_loader, device)
            wandb.log({
                'epoch': epoch,
                'train_loss': total_loss/len(train_loader),
                'val_acc': val_acc
            })

def evaluate(model, loader, device):
    model.eval()
    correct = 0
    total = 0
    
    with torch.no_grad():
        for src, trg, src_lens, trg_lens in loader:
            src = src.to(device)
            trg = trg.to(device)
            
            outputs = model(src, src_lens, trg, 0)  # No teacher forcing
            outputs = outputs.argmax(dim=-1)
            
            # Calculate accuracy
            mask = (trg != 0)
            correct += ((outputs == trg) * mask).sum().item()
            total += mask.sum().item()
    
    return correct / total

In [9]:
def train_with_beam(config=None):
    with wandb.init(project="DA6401_A3",settings=wandb.Settings(start_method="thread",_disable_stats=True), config = config) as run:
        config = run.config
        
        # Get dataloaders and vocabularies
        train_loader, dev_loader, _, src_vocab, trg_vocab = get_dataloaders(
            batch_size=config.batch_size
        )
        go_idx = trg_vocab.char2idx['<go>']
        stop_idx = trg_vocab.char2idx['stop']
        # Initialize model
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        
        model = Seq2Seq(
            input_dim=len(src_vocab.char2idx),
            output_dim=len(trg_vocab.char2idx),
            embed_dim=config.embedding_size,
            hidden_dim=config.hidden_size,
            num_layers=config.num_layers,
            cell_type=config.cell_type,
            dropout=config.dropout,
            device=device,
            go_idx = go_idx,
            stop_idx = stop_idx,
            vocab = trg_vocab
        ).to(device)
        
        # Training setup
        optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
        criterion = torch.nn.CrossEntropyLoss(ignore_index=0)
        
        # Training loop
        for epoch in range(15):  # Fixed epoch count for sweep
            model.train()
            total_loss = 0
            
            for src, trg, src_lens, trg_lens in train_loader:
                src = src.to(device)
                trg = trg.to(device)
                
                optimizer.zero_grad()
                output = model(src, src_lens, trg)
                
                # Calculate loss
                output_dim = output.shape[-1]
                output = output[1:].view(-1, output_dim)
                trg = trg[1:].view(-1)
                
                loss = criterion(output, trg)
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
                optimizer.step()
                
                total_loss += loss.item()
            file_path = os.path.join(wandb.run.dir, "model.pth")
            torch.save(model.state_dict(), file_path)
            wandb.save('model.pth')
            # Validation
            val_cer = evaluate_cer(model, dev_loader, device)
            val_wer = evaluate_wer(model, dev_loader, device)
            #val_acc = evaluate(model, dev_loader, device)
            wandb.log({
                'epoch': epoch,
                'train_loss': total_loss/len(train_loader),
                'val_char_err': val_cer,
                'val_word_err': val_wer
            })

In [6]:
train_loader, dev_loader, test_loader, src_vocab, trg_vocab = get_dataloaders()
print(f"Source vocab size: {len(src_vocab.char2idx)}")
print(f"Target vocab size: {len(trg_vocab.char2idx)}")
print(f"Training batches: {len(train_loader)}")

Downloading dataset...
Dataset extracted successfully
Source vocab size: 30
Target vocab size: 67
Training batches: 691


In [56]:
TransliterationDataset('test').data[2][1]

'ं'

In [9]:
src_vocab.char2idx

defaultdict(<function __main__.TransliterationVocabulary.__init__.<locals>.<lambda>()>,
            {'<pad>': 0,
             '<go>': 1,
             '<stop>': 2,
             '<unk>': 3,
             'a': 4,
             'n': 5,
             'k': 6,
             'g': 7,
             'i': 8,
             't': 9,
             'u': 10,
             'c': 11,
             'l': 12,
             'e': 13,
             'r': 14,
             's': 15,
             'h': 16,
             'd': 17,
             'b': 18,
             'y': 19,
             'o': 20,
             'j': 21,
             'z': 22,
             'm': 23,
             'v': 24,
             'w': 25,
             'p': 26,
             'f': 27,
             'x': 28,
             'q': 29})

In [8]:
src_vocab.idx2char

{0: '<pad>',
 1: '<go>',
 2: '<stop>',
 3: '<unk>',
 4: 'a',
 5: 'n',
 6: 'k',
 7: 'g',
 8: 'i',
 9: 't',
 10: 'u',
 11: 'c',
 12: 'l',
 13: 'e',
 14: 'r',
 15: 's',
 16: 'h',
 17: 'd',
 18: 'b',
 19: 'y',
 20: 'o',
 21: 'j',
 22: 'z',
 23: 'm',
 24: 'v',
 25: 'w',
 26: 'p',
 27: 'f',
 28: 'x',
 29: 'q'}

In [11]:
'''# Initialize sweep
sweep_id = wandb.sweep(
    sweep=sweep_config,  # Your sweep configuration dictionary
    project="DA6401_A3",
    entity="megh_m-iit-madras"
)

# Run sweep agents
wandb.agent(sweep_id, function=train, count = 10)'''

Create sweep with ID: ifzck3iv
Sweep URL: https://wandb.ai/megh_m-iit-madras/DA6401_A3/sweeps/ifzck3iv


[34m[1mwandb[0m: Agent Starting Run: ya29hggg with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_size: 64
[34m[1mwandb[0m: 	hidden_size: 512
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 3


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_loss,█▄▃▃▂▂▂▂▂▁▁▁▁▁▁
val_acc,█▇▆▃▄▄▅▃▃▃▁▂▁▂▃

0,1
epoch,14.0
train_loss,0.28027
val_acc,0.13921


[34m[1mwandb[0m: Agent Starting Run: v3nstyub with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embedding_size: 128
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 1


0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_loss,█▅▄▃▃▃▂▂▂▂▁▁▁▁▁
val_acc,▅▁▃▆█▇█▆▇▆▅▆▄▆▇

0,1
epoch,14.0
train_loss,0.93551
val_acc,0.14853


[34m[1mwandb[0m: Agent Starting Run: ux42uzil with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embedding_size: 256
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	num_layers: 2


0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_loss,█▄▃▃▂▂▂▂▂▁▁▁▁▁▁
val_acc,█▇▃▆▂▄▅▃▃▁▄▃▂▃▃

0,1
epoch,14.0
train_loss,0.52017
val_acc,0.13814


[34m[1mwandb[0m: Agent Starting Run: 4ps4y0zs with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_size: 64
[34m[1mwandb[0m: 	hidden_size: 512
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 2


0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_loss,█▅▄▃▃▂▂▂▂▂▁▁▁▁▁
val_acc,▁▆▅▅▃▁▅█▅▃▇▄▂▁█

0,1
epoch,14.0
train_loss,0.74028
val_acc,0.14253


[34m[1mwandb[0m: Agent Starting Run: tjf10scf with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_size: 256
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	num_layers: 3


0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_loss,█▄▃▃▂▂▂▂▂▁▁▁▁▁▁
val_acc,█▇▅▄▆▄▂▂▂▁▃▃▂▁▂

0,1
epoch,14.0
train_loss,0.31967
val_acc,0.13318


[34m[1mwandb[0m: Agent Starting Run: qzgkeeka with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embedding_size: 128
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 3


0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_loss,█▆▅▅▄▄▃▃▃▂▂▂▁▁▁
val_acc,█▁▃▄▂▆▅▆▇█▅▆▆▆▅

0,1
epoch,14.0
train_loss,2.02461
val_acc,0.1183


[34m[1mwandb[0m: Agent Starting Run: 26y2nwgk with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embedding_size: 256
[34m[1mwandb[0m: 	hidden_size: 512
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 1


0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_loss,█▅▄▃▃▂▂▂▂▁▁▁▁▁▁
val_acc,▃▃▂█▆▄▃▂▂▂▃▁▁▁▃

0,1
epoch,14.0
train_loss,0.49272
val_acc,0.13987


[34m[1mwandb[0m: Agent Starting Run: 5mj291g6 with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embedding_size: 128
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 1


0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_loss,█▇▆▅▄▃▃▃▂▂▂▂▁▁▁
val_acc,▃▁▄▂▅▃▄▄▅▃█▆▆▆█

0,1
epoch,14.0
train_loss,1.39874
val_acc,0.13246


[34m[1mwandb[0m: Agent Starting Run: 12a1zp53 with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_size: 256
[34m[1mwandb[0m: 	hidden_size: 512
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	num_layers: 2


0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_loss,█▆▅▄▃▃▂▂▂▂▂▁▁▁▁
val_acc,▁▅▄▇█▅▇▇█▆█▇▄█▇

0,1
epoch,14.0
train_loss,0.72946
val_acc,0.13313


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: uf81l0g4 with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embedding_size: 64
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 3


0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_loss,█▆▅▄▄▄▃▃▂▂▂▂▁▁▁
val_acc,▃▁▂▃▄▁█▇▅▆▆▇█▇▂

0,1
epoch,14.0
train_loss,1.2443
val_acc,0.12701


In [13]:
def show_alignment_example(model, sample):
    src, trg = sample
    pred = model.beam_search(src.unsqueeze(1), [len(src)], beam_width=1)[0][0]
    
    target_str = TransliterationMetrics.preprocess_sequence(trg[1:-1], model.decoder.vocab)
    pred_str = TransliterationMetrics.preprocess_sequence(pred, model.decoder.vocab)
    
    print("CER:", cer(target_str, pred_str))
    print("Target:", target_str)
    print("Predicted:", pred_str)
    visualize_alignment(target_str, pred_str)

In [14]:
# For analysis
'''sample = next(iter(dev_loader))
show_alignment_example(model, sample[0][0], sample[1][0])'''

NameError: name 'model' is not defined

In [7]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
best_model = Seq2Seq(
            input_dim=len(src_vocab.char2idx),
            output_dim=len(trg_vocab.char2idx)+1,
            embed_dim=128,
            hidden_dim=256,
            num_layers=1,
            cell_type='LSTM',
            dropout=0.2,
            device=device,
            go_idx = trg_vocab.char2idx['<go>'],
            stop_idx = trg_vocab.char2idx['<stop>'],
            vocab = trg_vocab
        ).to(device)
best_model.load_state_dict(torch.load('/kaggle/input/model_no_beam/pytorch/default/1/model_no-beam.pth', map_location = torch.device('cpu'), weights_only = True))
best_model.eval()
evaluate(best_model, test_loader, device)

0.13594314079422382

In [24]:
with torch.no_grad():
        for src, trg, src_lens, trg_lens in test_loader:
            src = src.to(device)
            trg = trg.to(device)
            
            outputs = best_model(src, src_lens, trg, 0)  # No teacher forcing
            #outputs = outputs.argmax(dim=-1)

outputs.size()

torch.Size([3, 22, 15])

In [9]:
results = []
with torch.no_grad():
    for src, trg, src_lens, trg_lens in test_loader:
        src = src.to(device)
        
        # Forward pass through encoder
        encoder_outputs, hidden = best_model.encoder(src, src_lens)
        
        # Greedy decoding
        batch_size = src.size(1)
        decoder_input = torch.full((1, batch_size), best_model.decoder.go_idx, device=device, dtype=torch.long)
        
        predictions = torch.zeros(50, batch_size, device=device, dtype=torch.long)

        for t in range(50):
            decoder_output, hidden = best_model.decoder(decoder_input.squeeze(0), hidden)
            topi = decoder_output.argmax(1)
            predictions[t] = topi
            decoder_input = topi.unsqueeze(0)

        # Process batch
        for i in range(batch_size):
            # Get source sequence
            src_indices = src[:,i].cpu().numpy()
            src_str = ''.join([src_vocab.idx2char[idx] for idx in src_indices if idx not in [0,1,2,3]])
            
            # Get prediction
            pred_indices = predictions[:,i].cpu().numpy()
            pred_str = ''.join([trg_vocab.idx2char[idx] for idx in pred_indices if idx not in [0,1,2,3]])
            
            results.append({
                'Source': src_str,
                'Prediction': pred_str,
                'Target': ''.join([trg_vocab.idx2char[idx] for idx in trg[:,i].cpu().numpy() if idx not in [0,1,2,3]])
            })

pd.DataFrame(results)

Unnamed: 0,Source,Prediction,Target
0,ank,अंक,अंक
1,anka,अंका,अंक
2,ankgi,अंगीक,अंकगण
3,anaktn,अंकतन,अंकिं
4,ankutn,अंकुटन,अंकिं
...,...,...,...
4497,utzbgnc,उज्टबंग,ैिजरचगंल
4498,utvuancayaab,उत्वााबां,ैिईंलुीुा
4499,utvuancayab,उत्वांबा,ैिईंलुीुा
4500,utvirv,उत्ववरी,ैिडरवूड


In [57]:
src_vocab.char2idx['a']

4

In [67]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
with torch.no_grad():
    for src, trg, src_lens, trg_lens in test_loader:
        src = src.to(device)
        trg = trg.to(device)
        for i in range(1):
            # Get source sequence
            src_indices = src[:,i].cpu().numpy()
            src_str = ''.join([src_vocab.idx2char[idx] for idx in src_indices if idx not in [0, 1, 2, 3]])
            print(src_str) 

ank
aiiuav
ayyaavg
azg
goj
tlbrl
rkaiiug
diuaoaiak
dzauana
rwalmr
tvaka
kayl
xasg
ktti
jzrlgv
car
cazaqgtn
juanblahatn
jughaig
juudoakal
eahdag
eaankaltn
edbag
epaltn
itnca
bdznr
ialaycane
ireaeg
balla
bgmwa
buaiana
nakvazgwtn
nad
nglyaabu
oanjuaanc
oalaaburrn
oaiuwaklah
ogbgwa
oalailttorl
olantn
fazia
ftni
yabaia
yanyg
ygccrvi
yanbacr
yuayutti
yuaglt
hawtlkaa
hancang
hdyalkodl
habr
waina
laae
lrria
ltwazv
zazkalia
zrikal
mrlvgtn
mgbwaawrgn
mrltngka
vulaytn
vulglah
vanviugia
valkalrgn
vawan
vugrzb
vrnbua
viltkrv
uahazamal
udooal


In [68]:
src_str

'udooal'

In [77]:
src, trg = TransliterationDataset('test').data[2]
for c in src:
    print(c)

a
n
k
i
t


In [78]:
[src_vocab.char2idx['<go>']] + [src_vocab.char2idx[c] for c in src if c not in ['<go>','<stop>','<pad>','<unk>']] +[src_vocab.char2idx['<stop>']]


[1, 4, 5, 6, 8, 9, 2]

# With Attention