In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

def create_vocab(text):
    return set().union(*map(set, text)) | {'<pad>', '<sos>', '<eos>'}


def load_data(path):
    df = pd.read_csv(path, header = None, names = ['latin','devanagari'])
    return df['latin'], df['devanagari']

latin_train, devanagari_train = load_data('/kaggle/input/aksharantar-sampled2/aksharantar_sampled/hin/hin_train.csv')
latin_valid, devanagari_valid = load_data('/kaggle/input/aksharantar-sampled2/aksharantar_sampled/hin/hin_valid.csv')
latin_test, devanagari_test = load_data('/kaggle/input/aksharantar-sampled2/aksharantar_sampled/hin/hin_test.csv')

#print(latin_train, devanagari_train, end='\n')

latin_vocab = create_vocab(latin_train)
print(latin_vocab)

devanagari_vocab = create_vocab(devanagari_train)
#print(devanagari_vocab)

latin_token2idx = {token:idx for idx, token in enumerate(sorted(latin_vocab))}
#print(latin_token2idx)

devanagari_token2idx = {token:idx for idx, token in enumerate(sorted(devanagari_vocab))}
#print(devanagari_token2idx)

{'z', 'w', 'd', 'p', 'j', '<eos>', 'b', 'c', 'l', 'q', 'k', 'e', 'x', 'u', 'h', 'f', 'a', 'g', 'y', '<pad>', 'n', 'm', 's', 'r', '<sos>', 't', 'v', 'o', 'i'}


In [2]:
import torch
from torch.utils.data import Dataset, DataLoader


class AksharantarDataset(Dataset):
    def __init__(self, latin_words, devanagari_words, latin_token2idx, devanagari_token2idx):
        self.latin_words = latin_words
        self.devanagari_words = devanagari_words
        self.latin_token2idx = latin_token2idx
        self.devanagari_token2idx = devanagari_token2idx

        self.unk_idx = max(devanagari_token2idx.values(), default=-1) + 1

    def __len__(self):
        return len(self.latin_words)

    def __getitem__(self, idx):
        latin_indices = [self.latin_token2idx.get(char, self.unk_idx) for char in self.latin_words.iloc[idx]]
        devanagari_indices = [self.devanagari_token2idx.get(char, self.unk_idx) for char in self.devanagari_words.iloc[idx]]

        devanagari_indices = list(map(self.devanagari_token2idx.get, ['<sos>'])) + devanagari_indices + list(map(self.devanagari_token2idx.get, ['<eos>']))

        return torch.tensor(latin_indices, dtype=torch.long), torch.tensor(devanagari_indices, dtype=torch.long)



def collate_fn(batch):
    # Unpack batch and pad sequences
    latin_padded = pad_sequence([torch.tensor(latin_seq) for latin_seq, _ in batch], batch_first=True, padding_value=latin_token2idx['<pad>'])
    devanagari_padded = pad_sequence([torch.tensor(devanagari_seq) for _, devanagari_seq in batch], batch_first=True, padding_value=devanagari_token2idx['<pad>'])
    return latin_padded, devanagari_padded

    
train_dataset = AksharantarDataset(latin_train, devanagari_train, latin_token2idx, devanagari_token2idx)
train_loader = DataLoader(train_dataset, batch_size = 32, collate_fn = collate_fn, shuffle=True)

valid_dataset = AksharantarDataset(latin_valid, devanagari_valid, latin_token2idx, devanagari_token2idx)
valid_loader = DataLoader(valid_dataset, batch_size = 32, collate_fn = collate_fn, shuffle=True)

test_dataset = AksharantarDataset(latin_test, devanagari_test, latin_token2idx, devanagari_token2idx)
test_loader = DataLoader(test_dataset, batch_size=1, collate_fn = collate_fn, shuffle=False)

print(train_dataset[1234])

(tensor([21, 25,  3,  5, 10,  3, 14,  3, 16]), tensor([ 2, 50, 66, 47, 23, 54, 45, 37,  0]))


In [3]:
class Encoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, num_layers, rnn_cell='lstm', dropout=0.5):
        super().__init__()
        
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn_cell = rnn_cell.lower()
        
        rnn_class = getattr(nn, self.rnn_cell.upper())
        self.rnn = rnn_class(embedding_size, hidden_size, num_layers, batch_first=True, dropout=dropout)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        embedded = self.dropout(self.embedding(x))
        outputs, hidden = self.rnn(embedded)
        return hidden

In [4]:
class Decoder(nn.Module):
    def __init__(self, output_size, embedding_size, hidden_size, num_layers, rnn_cell='lstm', dropout=0.5):
        super().__init__()

        self.embedding = nn.Embedding(output_size, embedding_size)
        self.dropout = nn.Dropout(dropout)
        self.output_size = output_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        rnn_cell = rnn_cell.lower()
        rnn_class = getattr(nn, rnn_cell.upper())
        self.rnn = rnn_class(embedding_size, hidden_size, num_layers, batch_first=True, dropout=dropout)
            
        self.fc = nn.Linear(hidden_size, output_size)
            
    def forward(self, x, hidden=None):
        if hidden is None:
            hidden = self.init_hidden(x.size(0))
            
        embedded = self.dropout(self.embedding(x.unsqueeze(1)))
        output, hidden = self.rnn(embedded, hidden)
        prediction = self.fc(self.dropout(output.squeeze(1)))
        return prediction, hidden
    
    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        if isinstance(self.rnn, nn.LSTM):
            return (weight.new(self.num_layers, batch_size, self.hidden_size).zero_(),
                    weight.new(self.num_layers, batch_size, self.hidden_size).zero_())
        else:
            return weight.new(self.num_layers, batch_size, self.hidden_size).zero_()


In [5]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        
    def forward(self, source, target, teaching_force_ratio=0.5):
        batch_size, target_len = target.size()
        target_vocab_size = self.decoder.output_size
        
        outputs = torch.zeros(batch_size, target_len, target_vocab_size, device=source.device)
        
        encoder_hidden = self.encoder(source)
        decoder_input = target[:, 0]
        
        for t in range(1, target_len):
            decoder_output, encoder_hidden = self.decoder(decoder_input, encoder_hidden)
            outputs[:, t] = decoder_output
            teacher_force = (t / target_len) < teaching_force_ratio
            decoder_input = target[:, t] if teacher_force else decoder_output.argmax(dim=1)
            
        return outputs


In [6]:
input_dim = 256
output_dim = 256
enc_emb_dim = 64
dec_emb_dim = 64
hidden_dim = 512
enc_layers = 2
dec_layers = 2
enc_rnn_cell = 'lstm'
dec_rnn_cell = 'lstm'

encoder = Encoder(input_dim, enc_emb_dim, hidden_dim, enc_layers, enc_rnn_cell)
decoder = Decoder(output_dim, dec_emb_dim, hidden_dim, dec_layers, dec_rnn_cell)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

model = Seq2Seq(encoder, decoder).to(device)
print(model)

cuda
Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(256, 64)
    (rnn): LSTM(64, 512, num_layers=2, batch_first=True, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(256, 64)
    (dropout): Dropout(p=0.5, inplace=False)
    (rnn): LSTM(64, 512, num_layers=2, batch_first=True, dropout=0.5)
    (fc): Linear(in_features=512, out_features=256, bias=True)
  )
)


In [7]:
def categorical_accuracy(preds, y, ignore_index):
    mask = y != ignore_index
    correct = preds.argmax(dim=1).eq(y)
    correct_masked = correct[mask]
    return correct_masked.sum().float() / mask.sum().float()


def train(model, iterator, optimizer, criterion, clip, device, ignore_index):
    model.train()
    epoch_loss = 0
    epoch_accuracy = 0
    
    for source, target in iterator:
        source = source.to(device)
        target = target.to(device)
        
        optimizer.zero_grad()
        output = model(source, target)
        
        output_dim = output.shape[-1]
        output = output[:,1:].reshape(-1, output_dim)
        target = target[:, 1:].reshape(-1)
        
        loss = criterion(output, target)
        accuracy = categorical_accuracy(output, target, ignore_index)
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_accuracy += accuracy.item()
        
    return epoch_loss/len(iterator), epoch_accuracy/len(iterator)

def evaluate(model, iterator, criterion, device, ignore_index):
    model.eval()
    epoch_loss = 0
    epoch_accuracy = 0
    
    with torch.no_grad():
        for source, target in iterator:
            source = source.to(device)
            target = target.to(device)
            
            output = model(source, target, 0)
            output_dim = output.shape[-1]
            output = output[:, 1:].reshape(-1, output_dim)
            target = target[:, 1:].reshape(-1)
            
            loss = criterion(output, target)
            accuracy = categorical_accuracy(output, target, ignore_index)
            
            epoch_loss += loss.item()
            epoch_accuracy += accuracy.item()
            
    return epoch_loss/len(iterator), epoch_accuracy/len(iterator)

In [8]:
from torch.nn.utils.rnn import pad_sequence
num_epoch = 5
clip = 1
optimizer = torch.optim.Adam(model.parameters())
ignore_index = devanagari_token2idx['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index = ignore_index).to(device)

for epoch in range(num_epoch):
    train_loss, train_accuracy = train(model, train_loader, optimizer, criterion, clip, device, ignore_index)
    valid_loss, valid_accuracy = evaluate(model, valid_loader, criterion, device, ignore_index)
    
    print(f'Epoch Number:{epoch+1}')
    print(f'Train Loss: {train_loss:.2f} | Train Accuracy: {train_accuracy:.2f} | Validation Loss: {valid_loss:.2f} | Validation Accuracy: {valid_accuracy:.2f}')
    #print(f'Validation Loss: {valid_loss:.2f} | Validation Accuracy: {valid_accuracy:.2f}')

  latin_padded = pad_sequence([torch.tensor(latin_seq) for latin_seq, _ in batch], batch_first=True, padding_value=latin_token2idx['<pad>'])
  devanagari_padded = pad_sequence([torch.tensor(devanagari_seq) for _, devanagari_seq in batch], batch_first=True, padding_value=devanagari_token2idx['<pad>'])


Epoch Number:1
Train Loss: 2.54 | Train Accuracy: 0.32 | Validation Loss: 2.32 | Validation Accuracy: 0.45
Epoch Number:2
Train Loss: 1.29 | Train Accuracy: 0.62 | Validation Loss: 1.72 | Validation Accuracy: 0.63
Epoch Number:3
Train Loss: 0.89 | Train Accuracy: 0.72 | Validation Loss: 1.68 | Validation Accuracy: 0.66
Epoch Number:4
Train Loss: 0.75 | Train Accuracy: 0.76 | Validation Loss: 1.65 | Validation Accuracy: 0.69
Epoch Number:5
Train Loss: 0.66 | Train Accuracy: 0.79 | Validation Loss: 1.60 | Validation Accuracy: 0.70


In [11]:
'''
import numpy as np

#def decode_indices(indices, idx2token):
#    return ''.join([idx2token[idx] for idx in indices if idx not in (devanagari_token2idx['<pad>'], devanagari_token2idx['<sos>'], devanagari_token2idx['<eos>'])])
def decode_indices(indices, idx2token):
    return ''.join([idx2token.get(idx, '<unk>') for idx in indices if idx not in (devanagari_token2idx['<pad>'], devanagari_token2idx['<sos>'], devanagari_token2idx['<eos>'])])


def predict (model, iterator, device):
    model.eval()
    predictions = []
    with torch.no_grad():
        for source, target in iterator:
            source = source.to(device)
            target = target.to(device)
            output = model(source, target, 0)
            output = output.argmax(2)
            source = source.cpu().numpy()
            output = output.cpu().numpy()
            predictions.append((source, output))
    return predictions

latin_idx2token = {idx: char for char, idx in latin_token2idx.items()}
devanagari_idx2token = {idx: char for char, idx in devanagari_token2idx.items()}'''

import numpy as np

def decode_indices(indices, idx2token):
    return ''.join([idx2token.get(idx, '<unk>') for idx in indices if idx not in {devanagari_token2idx['<pad>'], devanagari_token2idx['<sos>'], devanagari_token2idx['<eos>']}])

def predict(model, iterator, device):
    model.eval()
    predictions = []
    
    with torch.no_grad():
        for source, target in iterator:
            source, target = source.to(device), target.to(device)
            output = model(source, target, 0)
            output = output.argmax(2)
            source, output = source.cpu().numpy(), output.cpu().numpy()
            predictions.append((source, output))
    
    return predictions

latin_idx2token = {idx: char for char, idx in latin_token2idx.items()}
devanagari_idx2token = {idx: char for char, idx in devanagari_token2idx.items()}


In [12]:
test_predictions = predict(model, test_loader, device)

# Debug prints
print("Number of test predictions:", len(test_predictions))

for source_indices, output_indices in test_predictions:
    for i in range(source_indices.shape[0]):
        input_text = decode_indices(source_indices[i], latin_idx2token)
        predicted_text = decode_indices(output_indices[i], devanagari_idx2token)
        print(f'Input word: {input_text} | predicted word: {predicted_text}')

  latin_padded = pad_sequence([torch.tensor(latin_seq) for latin_seq, _ in batch], batch_first=True, padding_value=latin_token2idx['<pad>'])
  devanagari_padded = pad_sequence([torch.tensor(devanagari_seq) for _, devanagari_seq in batch], batch_first=True, padding_value=devanagari_token2idx['<pad>'])


Number of test predictions: 4096
Input word: thermax | predicted word: रेमक
Input word: sikhaaega | predicted word: शिखाएगा
Input word: learn | predicted word: र्न
Input word: twitters | predicted word: ट्विटरर
Input word: tirunelveli | predicted word: तिरुनेलववलल
Input word: independence | predicted word: इंडेपेंडें
Input word: speshiyon | predicted word: स्पेशियों
Input word: shurooh | predicted word: ूरूह
Input word: kolhapur | predicted word: कोलहापुर
Input word: ajhar | predicted word: रझ
Input word: karaar | predicted word: रार
Input word: anka | predicted word: कान
Input word: wpd | predicted word: प्द
Input word: haashie | predicted word: ाशि
Input word: glendale | predicted word: ग्लेंडल
Input word: udhed | predicted word: धेद
Input word: ekthi | predicted word: तीथी
Input word: idea | predicted word: िडे
Input word: ambikapur | predicted word: अम्बिकापुर
Input word: makerere | predicted word: मेकरर
Input word: saboodaane | predicted word: सबुदाने
Input word: foohadta | predic