In [15]:


import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import random
import unicodedata
import re

SEED = 1234

# Seed every random number generator the notebook touches (PyTorch CPU/CUDA,
# NumPy, and Python's `random`, which drives the teacher-forcing coin flips).
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

# Ask cuDNN to pick deterministic kernels.
torch.backends.cudnn.deterministic = True

# Prefer the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

Using device: cuda


In [None]:

def load_pairs(path):
    """Read one headerless two-column CSV of (latin, native) word pairs.

    Every field is kept as a plain string. pandas' default NA detection is
    switched off because ordinary words such as "nan", "null", "na" or "none"
    would otherwise be parsed as float NaN and silently corrupted.

    Args:
        path: Path to the CSV file.

    Returns:
        A DataFrame with string columns 'latin' and 'native'.

    Raises:
        FileNotFoundError: If `path` does not exist.
    """
    return pd.read_csv(
        path,
        header=None,
        names=['latin', 'native'],
        dtype=str,
        keep_default_na=False,
    )


try:
    train_df = load_pairs('kan_train.csv')
    valid_df = load_pairs('kan_valid.csv')
    test_df = load_pairs('kan_test.csv')
    print("Files loaded successfully.")
except FileNotFoundError as e:
    print(f"Error loading files: {e}")
    print("Please make sure 'kan_train.csv', 'kan_valid.csv', and 'kan_test.csv' are in the same directory.")

    # Best-effort fallback: one-row placeholder frames so the remaining cells
    # still execute. Results obtained from these are not meaningful.
    train_df = pd.DataFrame([["namaskara", "ನಮಸ್ಕಾರ"]], columns=['latin', 'native'])
    valid_df = pd.DataFrame([["dhanyavada", "ಧನ್ಯವಾದ"]], columns=['latin', 'native'])
    test_df = pd.DataFrame([["kannada", "ಕನ್ನಡ"]], columns=['latin', 'native'])


class Vocabulary:
    """Two-way mapping between characters and consecutive integer ids.

    The four special tokens '<pad>', '<sos>', '<eos>' and '<unk>' are
    registered first, so they always receive ids 0, 1, 2 and 3 respectively.
    """

    def __init__(self, name):
        """Create a vocabulary labelled `name` holding only the special tokens."""
        self.name = name
        self.char2index = {}
        self.index2char = {}
        self.n_chars = 0

        for special_token in ('<pad>', '<sos>', '<eos>', '<unk>'):
            self.add_char(special_token)

    def add_sentence(self, sentence):
        """Register each element produced by iterating over `sentence`."""
        for symbol in sentence:
            self.add_char(symbol)

    def add_char(self, char):
        """Assign the next free id to `char`; do nothing if it is already known."""
        if char in self.char2index:
            return

        next_id = self.n_chars
        self.char2index[char] = next_id
        self.index2char[next_id] = char
        self.n_chars = next_id + 1


source_vocab = Vocabulary('latin')
target_vocab = Vocabulary('kannada')

# NOTE(review): the vocabularies are built from train + valid + test together,
# so no character is ever mapped to '<unk>'. Build from train_df only if a
# strictly held-out evaluation is wanted (this changes the vocabulary sizes).
all_df = pd.concat([train_df, valid_df, test_df], ignore_index=True)

# Iterate over the two columns directly; DataFrame.iterrows() builds a Series
# for every row and is much slower for the same result.
for latin_word, native_word in zip(all_df['latin'], all_df['native']):
    source_vocab.add_sentence(str(latin_word))
    target_vocab.add_sentence(str(native_word))

print(f"Source (Latin) Vocabulary Size: {source_vocab.n_chars}")
print(f"Target (Kannada) Vocabulary Size: {target_vocab.n_chars}")

# The special-token ids are read from the source vocabulary but are also used
# for target sequences and for the loss' ignore_index.
PAD_IDX = source_vocab.char2index['<pad>']
SOS_IDX = source_vocab.char2index['<sos>']
EOS_IDX = source_vocab.char2index['<eos>']

# Fail loudly if the two vocabularies ever disagree on those ids.
if any(
    source_vocab.char2index[token] != target_vocab.char2index[token]
    for token in ('<pad>', '<sos>', '<eos>')
):
    raise ValueError("Source and target vocabularies must share special-token indices.")

Files loaded successfully.
Source (Latin) Vocabulary Size: 30
Target (Kannada) Vocabulary Size: 65


In [None]:


class TransliterationDataset(Dataset):
    """Dataset of (latin, native) word pairs encoded as index tensors.

    Each item is a pair of 1-D LongTensors laid out as
    ``[<sos>, char ids..., <eos>]``. Special-token ids are taken from the
    vocabulary that encodes each side, so the class does not depend on
    module-level index constants.

    Args:
        df: DataFrame whose rows unpack into exactly (latin_word, native_word).
        source_vocab: Object exposing a `char2index` dict for the latin side.
        target_vocab: Object exposing a `char2index` dict for the native side.
    """

    def __init__(self, df, source_vocab, target_vocab):
        self.df = df
        self.source_vocab = source_vocab
        self.target_vocab = target_vocab

    def __len__(self):
        """Return the number of word pairs."""
        return len(self.df)

    @staticmethod
    def _encode(word, vocab):
        """Encode `word` as [<sos>, ids..., <eos>]; unknown chars map to <unk>."""
        table = vocab.char2index
        unk_idx = table['<unk>']
        # str() guards against non-string cells (e.g. numbers parsed by pandas).
        body = [table.get(char, unk_idx) for char in str(word)]
        return torch.tensor([table['<sos>']] + body + [table['<eos>']], dtype=torch.long)

    def __getitem__(self, idx):
        """Return (src_tensor, trg_tensor) for the row at position `idx`."""
        latin_word, native_word = self.df.iloc[idx]

        src_tensor = self._encode(latin_word, self.source_vocab)
        trg_tensor = self._encode(native_word, self.target_vocab)

        return src_tensor, trg_tensor


def collate_fn(batch):
    """Merge a list of (src, trg) tensor pairs into two padded batch tensors.

    Sequences on each side are right-padded with PAD_IDX to the longest
    sequence of that side; both results are laid out batch-first.
    """
    source_seqs, target_seqs = zip(*batch)

    def pad_to_longest(sequences):
        return pad_sequence(sequences, batch_first=True, padding_value=PAD_IDX)

    return pad_to_longest(source_seqs), pad_to_longest(target_seqs)

BATCH_SIZE = 128

# One dataset per split; all three share the same pair of vocabularies.
train_dataset, valid_dataset, test_dataset = [
    TransliterationDataset(split_df, source_vocab, target_vocab)
    for split_df in (train_df, valid_df, test_df)
]

# Only the training loader shuffles; validation and test keep file order.
train_iterator = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_fn,
)
valid_iterator = DataLoader(
    valid_dataset,
    batch_size=BATCH_SIZE,
    collate_fn=collate_fn,
)
test_iterator = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    collate_fn=collate_fn,
)

print(f"Created DataLoaders with batch size {BATCH_SIZE}.")

Created DataLoaders with batch size 128.


In [None]:


class Encoder(nn.Module):
    """Embeds a batch of source index sequences and encodes it with an RNN.

    Args:
        input_dim: Size of the source vocabulary (number of embedding rows).
        emb_dim: Embedding dimension.
        hid_dim: Hidden size of the recurrent layers.
        n_layers: Number of stacked recurrent layers.
        cell_type: Name of a recurrent module in `torch.nn`, case-insensitive
            (e.g. 'RNN', 'GRU', 'LSTM').
        dropout: Dropout probability applied to the embeddings and, when
            n_layers > 1, between recurrent layers.
    """

    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, cell_type='GRU', dropout=0.5):
        super().__init__()

        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.cell_type = cell_type.upper()

        self.embedding = nn.Embedding(input_dim, emb_dim)

        rnn_cell = getattr(nn, self.cell_type)
        # Inter-layer dropout is a no-op for a single layer and makes PyTorch
        # emit a UserWarning, so only pass it on when there is a layer boundary.
        rnn_dropout = dropout if n_layers > 1 else 0.0
        self.rnn = rnn_cell(emb_dim, hid_dim, n_layers, dropout=rnn_dropout, batch_first=True)

        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        """Encode `src` and return only the RNN's final hidden state.

        Args:
            src: Integer index tensor; the RNN is batch-first, so this is
                treated as (batch, src_len).

        Returns:
            The second element returned by the recurrent module: its final
            hidden state (for an LSTM, the (h_n, c_n) tuple).
        """
        embedded = self.dropout(self.embedding(src))

        # Per-step outputs are not needed; only the final state is handed on.
        _, hidden = self.rnn(embedded)

        return hidden

class Decoder(nn.Module):
    """Single-step RNN decoder that predicts the next target character.

    Args:
        output_dim: Size of the target vocabulary (embedding rows and logits).
        emb_dim: Embedding dimension.
        hid_dim: Hidden size of the recurrent layers.
        n_layers: Number of stacked recurrent layers.
        cell_type: Name of a recurrent module in `torch.nn`, case-insensitive
            (e.g. 'RNN', 'GRU', 'LSTM').
        dropout: Dropout probability applied to the embeddings and, when
            n_layers > 1, between recurrent layers.
    """

    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, cell_type='GRU', dropout=0.5):
        super().__init__()

        self.output_dim = output_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.cell_type = cell_type.upper()

        self.embedding = nn.Embedding(output_dim, emb_dim)

        rnn_cell = getattr(nn, self.cell_type)
        # Inter-layer dropout is a no-op for a single layer and makes PyTorch
        # emit a UserWarning, so only pass it on when there is a layer boundary.
        rnn_dropout = dropout if n_layers > 1 else 0.0
        self.rnn = rnn_cell(emb_dim, hid_dim, n_layers, dropout=rnn_dropout, batch_first=True)

        self.fc_out = nn.Linear(hid_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_char, hidden):
        """Run one decoding step.

        Args:
            input_char: 1-D index tensor holding one token per batch element.
            hidden: Recurrent state to continue from (as produced by the
                encoder or by a previous call).

        Returns:
            (prediction, hidden): logits of shape (batch, output_dim) and the
            updated recurrent state.
        """
        # Add a length-1 time axis: (batch,) -> (batch, 1).
        input_char = input_char.unsqueeze(1)

        embedded = self.dropout(self.embedding(input_char))

        output, hidden = self.rnn(embedded, hidden)

        # Drop the time axis again before projecting to vocabulary logits.
        prediction = self.fc_out(output.squeeze(1))

        return prediction, hidden

class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes step by step with teacher forcing.

    Args:
        encoder: Module mapping a source batch to an initial decoder state.
        decoder: Module with an `output_dim` attribute whose forward takes
            (input_char, hidden) and returns (logits, hidden).
        device: Stored on the instance for callers; the forward pass allocates
            on the device of its inputs.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Decode `trg_len - 1` steps and return the per-step logits.

        Args:
            src: Source batch passed to the encoder unchanged.
            trg: Target index tensor of shape (batch, trg_len); column 0 is
                used as the first decoder input.
            teacher_forcing_ratio: Probability, drawn once per step for the
                whole batch, of feeding the ground-truth token instead of the
                model's own argmax prediction.

        Returns:
            Tensor of shape (batch, trg_len, output_dim). Position 0 along the
            time axis is never written and stays all zeros.
        """
        batch_size = trg.shape[0]
        trg_len = trg.shape[1]
        trg_vocab_size = self.decoder.output_dim

        # Allocate batch-first and directly on the inputs' device: no CPU
        # round trip, no dependence on a separately stored device, and no
        # permute needed on the way out.
        outputs = torch.zeros(batch_size, trg_len, trg_vocab_size, device=trg.device)

        hidden = self.encoder(src)

        # First decoder input is the first target column (the <sos> tokens).
        input_char = trg[:, 0]

        for t in range(1, trg_len):
            output, hidden = self.decoder(input_char, hidden)

            outputs[:, t] = output

            # One coin flip per time step, shared by every sequence in the batch.
            teacher_force = random.random() < teacher_forcing_ratio

            top1 = output.argmax(1)

            input_char = trg[:, t] if teacher_force else top1

        return outputs

In [None]:


# Vocabulary-derived sizes.
INPUT_DIM = source_vocab.n_chars
OUTPUT_DIM = target_vocab.n_chars

# Architecture hyperparameters.
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
N_LAYERS = 2
CELL_TYPE = 'GRU'

# Regularisation.
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

enc = Encoder(
    input_dim=INPUT_DIM,
    emb_dim=ENC_EMB_DIM,
    hid_dim=HID_DIM,
    n_layers=N_LAYERS,
    cell_type=CELL_TYPE,
    dropout=ENC_DROPOUT,
)
dec = Decoder(
    output_dim=OUTPUT_DIM,
    emb_dim=DEC_EMB_DIM,
    hid_dim=HID_DIM,
    n_layers=N_LAYERS,
    cell_type=CELL_TYPE,
    dropout=DEC_DROPOUT,
)

model = Seq2Seq(encoder=enc, decoder=dec, device=device).to(device)

def init_weights(m):
    """Fill every parameter reachable from `m` with values from U(-0.08, 0.08).

    When passed to `Module.apply`, this runs once per sub-module, so a
    parameter is redrawn once for each module in its ancestor chain; the final
    values still come from the same uniform distribution.
    """
    for weight in m.parameters():
        nn.init.uniform_(weight.data, -0.08, 0.08)

# Re-initialise all weights before any training takes place.
model.apply(init_weights)

trainable_param_count = sum(
    param.numel() for param in model.parameters() if param.requires_grad
)
print(f'The model has {trainable_param_count:,} trainable parameters')

# Adam with its default settings; padded target positions are excluded from
# the loss through ignore_index.
optimizer = optim.Adam(params=model.parameters())
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)


# --- Training and Evaluation Functions ---
def train_fn(model, iterator, optimizer, criterion, clip):
    """Train `model` for one pass over `iterator` and return the mean batch loss.

    Args:
        model: Callable as model(src, trg) returning (batch, trg_len, vocab) logits.
        iterator: Iterable of (src, trg) batches that also supports len().
        optimizer: Optimizer stepped once per batch.
        criterion: Loss taking (flattened logits, flattened targets).
        clip: Maximum gradient norm passed to clip_grad_norm_.

    Returns:
        Sum of the per-batch losses divided by the number of batches.
    """
    model.train()
    running_loss = 0

    for src, trg in tqdm(iterator, desc="Training"):
        src = src.to(device)
        trg = trg.to(device)

        optimizer.zero_grad()

        logits = model(src, trg)
        vocab_size = logits.shape[-1]

        # Drop time step 0 (the <sos> slot) on both sides, then flatten the
        # batch and time axes for the loss.
        flat_logits = logits[:, 1:].reshape(-1, vocab_size)
        flat_targets = trg[:, 1:].reshape(-1)

        batch_loss = criterion(flat_logits, flat_targets)
        batch_loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        running_loss += batch_loss.item()

    return running_loss / len(iterator)

def evaluate_fn(model, iterator, criterion):
    """Compute the mean batch loss of `model` over `iterator` without training.

    The model is put in eval mode, gradients are disabled, and the teacher
    forcing ratio is 0, so the decoder always consumes its own predictions.

    Args:
        model: Callable as model(src, trg, ratio) returning (batch, trg_len, vocab) logits.
        iterator: Iterable of (src, trg) batches that also supports len().
        criterion: Loss taking (flattened logits, flattened targets).

    Returns:
        Sum of the per-batch losses divided by the number of batches.
    """
    model.eval()
    running_loss = 0

    with torch.no_grad():
        for src, trg in tqdm(iterator, desc="Evaluating"):
            src = src.to(device)
            trg = trg.to(device)

            logits = model(src, trg, 0)
            vocab_size = logits.shape[-1]

            # Skip the <sos> slot and flatten the batch and time axes.
            flat_logits = logits[:, 1:].reshape(-1, vocab_size)
            flat_targets = trg[:, 1:].reshape(-1)

            running_loss += criterion(flat_logits, flat_targets).item()

    return running_loss / len(iterator)

The model has 5,574,977 trainable parameters
