In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
'''
def create_vocab(text):
    vocab = set(char for word in text for char in word)
    vocab.add('<pad>')
    vocab.add('<sos>') #start of sequence
    vocab.add('<eos>') #end of sequence
    return vocab'''
def create_vocab(text):
    return set().union(*map(set, text)) | {'<pad>', '<sos>', '<eos>'}


def load_data(path):
    df = pd.read_csv(path, header = None, names = ['latin','devanagari'])
    return df['latin'], df['devanagari']

latin_train, devanagari_train = load_data('/kaggle/input/aksharantar-sampled2/aksharantar_sampled/hin/hin_train.csv')
latin_valid, devanagari_valid = load_data('/kaggle/input/aksharantar-sampled2/aksharantar_sampled/hin/hin_valid.csv')
latin_test, devanagari_test = load_data('/kaggle/input/aksharantar-sampled2/aksharantar_sampled/hin/hin_test.csv')

#print(latin_train, devanagari_train, end='\n')

latin_vocab = create_vocab(latin_train)
print(latin_vocab)

devanagari_vocab = create_vocab(devanagari_train)
#print(devanagari_vocab)

latin_token2idx = {token:idx for idx, token in enumerate(sorted(latin_vocab))}
#print(latin_token2idx)

devanagari_token2idx = {token:idx for idx, token in enumerate(sorted(devanagari_vocab))}
#print(devanagari_token2idx)

{'s', 'p', 'r', 'n', 'e', 'a', 'v', 'i', 'z', 'u', 'm', '<pad>', 'b', 'q', 'y', 'f', 'w', 'd', 'l', '<eos>', 'c', 't', 'o', 'g', '<sos>', 'k', 'j', 'x', 'h'}


In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
'''
class AksharantarDataset(Dataset):
    def __init__(self, latin_words, devanagari_words, latin_token2idx, devanagari_token2idx):
        self.latin_words = latin_words
        self.devanagari_words = devanagari_words
        self.latin_token2idx = latin_token2idx
        self.devanagari_token2idx = devanagari_token2idx

        # Determine the index for the '<unk>' token
        self.unk_idx = max(devanagari_token2idx.values()) + 1

    def __len__(self):
        return len(self.latin_words)

    def __getitem__(self, idx):
        latin_word = self.latin_words.iloc[idx]
        devanagari_word = self.devanagari_words.iloc[idx]

        # Convert Latin word to indices
        latin_indices = [latin_token2idx[char] for char in latin_word]

        # Convert Devanagari word to indices
        devanagari_indices = []
        for char in devanagari_word:
            # Handle characters not present in devanagari_token2idx
            if char in devanagari_token2idx:
                devanagari_indices.append(devanagari_token2idx[char])
            else:
                devanagari_indices.append(self.unk_idx)  # Assign '<unk>' token index

        # Add <sos> and <eos> tokens
        devanagari_indices = [devanagari_token2idx['<sos>']] + devanagari_indices + [devanagari_token2idx['<eos>']]

        return torch.tensor(latin_indices, dtype=torch.long), torch.tensor(devanagari_indices, dtype=torch.long)'''

class AksharantarDataset(Dataset):
    def __init__(self, latin_words, devanagari_words, latin_token2idx, devanagari_token2idx):
        self.latin_words = latin_words
        self.devanagari_words = devanagari_words
        self.latin_token2idx = latin_token2idx
        self.devanagari_token2idx = devanagari_token2idx

        # Determine the index for the '<unk>' token
        self.unk_idx = max(devanagari_token2idx.values(), default=-1) + 1

    def __len__(self):
        return len(self.latin_words)

    def __getitem__(self, idx):
        latin_indices = [self.latin_token2idx.get(char, self.unk_idx) for char in self.latin_words.iloc[idx]]
        devanagari_indices = [self.devanagari_token2idx.get(char, self.unk_idx) for char in self.devanagari_words.iloc[idx]]

        # Add <sos> and <eos> tokens
        devanagari_indices = list(map(self.devanagari_token2idx.get, ['<sos>'])) + devanagari_indices + list(map(self.devanagari_token2idx.get, ['<eos>']))

        return torch.tensor(latin_indices, dtype=torch.long), torch.tensor(devanagari_indices, dtype=torch.long)


''' 
def collate_fn(batch):
    latin, devanagari = zip(*batch)  # Unpack batch
    latin_padded = pad_sequence(latin, batch_first=True, padding_value=latin_token2idx['<pad>'])
    devanagari_padded = pad_sequence(devanagari, batch_first=True, padding_value=devanagari_token2idx['<pad>'])
    return latin_padded, devanagari_padded'''

def collate_fn(batch):
    # Unpack batch and pad sequences
    latin_padded = pad_sequence([torch.tensor(latin_seq) for latin_seq, _ in batch], batch_first=True, padding_value=latin_token2idx['<pad>'])
    devanagari_padded = pad_sequence([torch.tensor(devanagari_seq) for _, devanagari_seq in batch], batch_first=True, padding_value=devanagari_token2idx['<pad>'])
    return latin_padded, devanagari_padded

    
train_dataset = AksharantarDataset(latin_train, devanagari_train, latin_token2idx, devanagari_token2idx)
train_loader = DataLoader(train_dataset, batch_size = 32, collate_fn = collate_fn, shuffle=True)

valid_dataset = AksharantarDataset(latin_valid, devanagari_valid, latin_token2idx, devanagari_token2idx)
valid_loader = DataLoader(valid_dataset, batch_size = 32, collate_fn = collate_fn, shuffle=True)

test_dataset = AksharantarDataset(latin_test, devanagari_test, latin_token2idx, devanagari_token2idx)
test_loader = DataLoader(test_dataset, batch_size=1, collate_fn = collate_fn, shuffle=False)

print(train_dataset[5869])

(tensor([18,  3, 18,  3, 13, 23, 20,  3]), tensor([ 2, 38, 54, 38, 54, 18, 57, 44, 54,  0]))


In [3]:
'''
class Encoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, num_layers, rnn_cell = 'lstm', dropout = 0.5):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(num_embeddings = input_size, embedding_dim = embedding_size)
        self.dropout = nn.Dropout(dropout)
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        if rnn_cell.lower() == 'lstm':
            self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, batch_first = True, dropout = dropout)
        elif rnn_cell.lower == 'gru':
            self.rnn = nn.GRU(embedding_size, hidden_size, num_layers, batch_first = True, dropout = dropout)
        else:
            self.rnn = nn.RNN(embedding_size, hidden_size, num_layers, batch_first = True, dropout = dropout)
            
    def forward(self,x):
        embedded = self.dropout(self.embedding(x))
        outputs, hidden = self.rnn(embedded)
        return hidden'''

class Encoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, num_layers, rnn_cell='lstm', dropout=0.5):
        super().__init__()
        
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn_cell = rnn_cell.lower()
        
        rnn_class = getattr(nn, self.rnn_cell.upper())
        self.rnn = rnn_class(embedding_size, hidden_size, num_layers, batch_first=True, dropout=dropout)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        embedded = self.dropout(self.embedding(x))
        outputs, hidden = self.rnn(embedded)
        return hidden

In [4]:
'''
class Decoder(nn.Module):
    def __init__(self, output_size, embedding_size, hidden_size, num_layers, rnn_cell = 'lstm', dropout = 0.5):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(num_embeddings = output_size, embedding_dim = embedding_size)
        self.dropout = nn.Dropout(dropout)
        self.output_size = output_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        if rnn_cell.lower() == 'lstm':
            self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, batch_first = True, dropout = dropout)
        elif rnn_cell.lower == 'gru':
            self.rnn = nn.GRU(embedding_size, hidden_size, num_layers, batch_first = True, dropout = dropout)
        else:
            self.rnn = nn.RNN(embedding_size, hidden_size, num_layers, batch_first = True, dropout = dropout)
            
        self.fc = nn.Linear(hidden_size, output_size)
            
    def forward(self,x, hidden):
        x = x.unsqueeze(1)
        embedded = self.dropout(self.embedding(x))
        output, hidden = self.rnn(embedded, hidden)
        prediction = self.fc(self.dropout(output.squeeze(1)))
        return prediction, hidden'''

class Decoder(nn.Module):
    def __init__(self, output_size, embedding_size, hidden_size, num_layers, rnn_cell='lstm', dropout=0.5):
        super().__init__()

        self.embedding = nn.Embedding(output_size, embedding_size)
        self.dropout = nn.Dropout(dropout)
        self.output_size = output_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        rnn_cell = rnn_cell.lower()
        rnn_class = getattr(nn, rnn_cell.upper())
        self.rnn = rnn_class(embedding_size, hidden_size, num_layers, batch_first=True, dropout=dropout)
            
        self.fc = nn.Linear(hidden_size, output_size)
            
    def forward(self, x, hidden=None):
        if hidden is None:
            hidden = self.init_hidden(x.size(0))
            
        embedded = self.dropout(self.embedding(x.unsqueeze(1)))
        output, hidden = self.rnn(embedded, hidden)
        prediction = self.fc(self.dropout(output.squeeze(1)))
        return prediction, hidden
    
    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        if isinstance(self.rnn, nn.LSTM):
            return (weight.new(self.num_layers, batch_size, self.hidden_size).zero_(),
                    weight.new(self.num_layers, batch_size, self.hidden_size).zero_())
        else:
            return weight.new(self.num_layers, batch_size, self.hidden_size).zero_()


In [5]:
'''
class seq2seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(seq2seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        
    def forward(self, source, target, teaching_force_ratio=0.5):
        batch_size = source.size(0)
        target_len = target.size(1)
        target_vocab_size = self.decoder.output_size
        
        outputs = torch.zeros(batch_size, target_len, target_vocab_size).to(source.device)
        
        encoder_hidden = self.encoder(source)
        decoder_input = target[:,0]
        
        for t in range(1, target_len):
            decoder_output, encoder_hidden = self.decoder(decoder_input, encoder_hidden)
            outputs[:,t] = decoder_output
            teacher_force = torch.rand(1)<teaching_force_ratio
            top1 = decoder_output.argmax(1)
            decoder_input = target[:, t] if teacher_force else top1
            
        return outputs'''

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        
    def forward(self, source, target, teaching_force_ratio=0.5):
        batch_size, target_len = target.size()
        target_vocab_size = self.decoder.output_size
        
        outputs = torch.zeros(batch_size, target_len, target_vocab_size, device=source.device)
        
        encoder_hidden = self.encoder(source)
        decoder_input = target[:, 0]
        
        for t in range(1, target_len):
            decoder_output, encoder_hidden = self.decoder(decoder_input, encoder_hidden)
            outputs[:, t] = decoder_output
            teacher_force = (t / target_len) < teaching_force_ratio
            decoder_input = target[:, t] if teacher_force else decoder_output.argmax(dim=1)
            
        return outputs


In [6]:
input_dim = 256
output_dim = 256
enc_emb_dim = 64
dec_emb_dim = 64
hidden_dim = 512
enc_layers = 2
dec_layers = 2
enc_rnn_cell = 'lstm'
dec_rnn_cell = 'lstm'

encoder = Encoder(input_dim, enc_emb_dim, hidden_dim, enc_layers, enc_rnn_cell)
decoder = Decoder(output_dim, dec_emb_dim, hidden_dim, dec_layers, dec_rnn_cell)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

model = Seq2Seq(encoder, decoder).to(device)
print(model)

cuda
Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(256, 64)
    (rnn): LSTM(64, 512, num_layers=2, batch_first=True, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(256, 64)
    (dropout): Dropout(p=0.5, inplace=False)
    (rnn): LSTM(64, 512, num_layers=2, batch_first=True, dropout=0.5)
    (fc): Linear(in_features=512, out_features=256, bias=True)
  )
)


In [7]:
'''
def categorical_accuracy(preds, y, ignore_index):
    max_preds = preds.argmax(dim=1, keepdim=True)
    non_pad_elements = (y != ignore_index).nonzero()
    correct = max_preds[non_pad_elements].squeeze(1).eq(y[non_pad_elements])
    return correct.sum()/torch.FloatTensor([y[non_pad_elements].shape[0]]).to(y.device)'''

def categorical_accuracy(preds, y, ignore_index):
    mask = y != ignore_index
    correct = preds.argmax(dim=1).eq(y)
    correct_masked = correct[mask]
    return correct_masked.sum().float() / mask.sum().float()


def train(model, iterator, optimizer, criterion, clip, device, ignore_index):
    model.train()
    epoch_loss = 0
    epoch_accuracy = 0
    
    for source, target in iterator:
        source = source.to(device)
        target = target.to(device)
        
        optimizer.zero_grad()
        output = model(source, target)
        
        output_dim = output.shape[-1]
        output = output[:,1:].reshape(-1, output_dim)
        target = target[:, 1:].reshape(-1)
        
        loss = criterion(output, target)
        accuracy = categorical_accuracy(output, target, ignore_index)
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_accuracy += accuracy.item()
        
    return epoch_loss/len(iterator), epoch_accuracy/len(iterator)

def evaluate(model, iterator, criterion, device, ignore_index):
    model.eval()
    epoch_loss = 0
    epoch_accuracy = 0
    
    with torch.no_grad():
        for source, target in iterator:
            source = source.to(device)
            target = target.to(device)
            
            output = model(source, target, 0)
            output_dim = output.shape[-1]
            output = output[:, 1:].reshape(-1, output_dim)
            target = target[:, 1:].reshape(-1)
            
            loss = criterion(output, target)
            accuracy = categorical_accuracy(output, target, ignore_index)
            
            epoch_loss += loss.item()
            epoch_accuracy += accuracy.item()
            
    return epoch_loss/len(iterator), epoch_accuracy/len(iterator)

In [8]:
from torch.nn.utils.rnn import pad_sequence
num_epoch = 10
clip = 1
optimizer = torch.optim.Adam(model.parameters())
ignore_index = devanagari_token2idx['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index = ignore_index).to(device)

for epoch in range(num_epoch):
    train_loss, train_accuracy = train(model, train_loader, optimizer, criterion, clip, device, ignore_index)
    valid_loss, valid_accuracy = evaluate(model, valid_loader, criterion, device, ignore_index)
    
    print(f'Epoch:{epoch+1}')
    print(f'Train Loss: {train_loss:.2f} | Train Accuracy: {train_accuracy:.2f}')
    print(f'Validation Loss: {valid_loss:.2f} | Validation Accuracy: {valid_accuracy:.2f}')

  latin_padded = pad_sequence([torch.tensor(latin_seq) for latin_seq, _ in batch], batch_first=True, padding_value=latin_token2idx['<pad>'])
  devanagari_padded = pad_sequence([torch.tensor(devanagari_seq) for _, devanagari_seq in batch], batch_first=True, padding_value=devanagari_token2idx['<pad>'])


Epoch:1
Train Loss: 2.69 | Train Accuracy: 0.28
Validation Loss: 2.59 | Validation Accuracy: 0.40
Epoch:2
Train Loss: 1.43 | Train Accuracy: 0.59
Validation Loss: 1.74 | Validation Accuracy: 0.62
Epoch:3
Train Loss: 0.92 | Train Accuracy: 0.71
Validation Loss: 1.64 | Validation Accuracy: 0.67
Epoch:4
Train Loss: 0.76 | Train Accuracy: 0.76
Validation Loss: 1.65 | Validation Accuracy: 0.69
Epoch:5
Train Loss: 0.67 | Train Accuracy: 0.78
Validation Loss: 1.64 | Validation Accuracy: 0.70


KeyboardInterrupt: 