In [None]:
# Imports & Setup
import os
import io
import time
import random
import unicodedata
import string
import re

# PyTorch
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

from utils import (
    load_pairs, prepare_vocab, word2tensor,
    EncoderRNN, DecoderRNN,
    SOS_token, EOS_token, device
)


In [None]:
# Device Config
# Run on the GPU when CUDA reports one as available; otherwise stay on the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print("Using device:", device)


In [None]:
# Load Dataset (Dakshina)
# Path to Dakshina dataset (change if needed)
data_path = "../dakshina_dataset_v1.0/hi/lexicons/hi.transliteration.train.tsv"

# Every usable line has exactly three tab-separated fields: the native-script
# word, its Latin spelling, and a third field that is ignored here. Pairs are
# stored as (latin, native) so the Latin form comes first.
pairs = []
skipped_lines = 0
with open(data_path, "r", encoding="utf-8") as f:
    for line in f:
        fields = line.strip().split("\t")
        # A blank or malformed row would otherwise abort the whole load with an
        # unpacking ValueError; count it and move on instead.
        if len(fields) != 3 or not all(fields):
            skipped_lines += 1
            continue
        native, latin, _ = fields
        pairs.append((latin, native))

print("Total pairs:", len(pairs))
if skipped_lines:
    print("Skipped malformed lines:", skipped_lines)
# Guard the preview so a file with no usable rows does not raise IndexError.
if pairs:
    print("Example:", pairs[0])


In [None]:
# Preprocessing & Vocabulary
# Reserved indices for the start-of-sequence and end-of-sequence markers.
SOS_token = 0
EOS_token = 1


class Vocab:
    """Bidirectional character <-> index table with per-character counts.

    A fresh vocabulary already contains the two reserved entries "SOS" and
    "EOS", so ``n_chars`` starts at 2 and new characters are numbered from 2.
    """

    def __init__(self, name):
        """Create a vocabulary labelled ``name`` holding only SOS and EOS."""
        self.name = name
        self.char2index = {"SOS": SOS_token, "EOS": EOS_token}
        self.char2count = {}
        # Inverse of char2index, built from it so the two tables start in sync.
        self.index2char = {index: token for token, index in self.char2index.items()}
        self.n_chars = len(self.char2index)

    def add_word(self, word):
        """Register every character of ``word``, one at a time."""
        for symbol in word:
            self.add_char(symbol)

    def add_char(self, ch):
        """Count ``ch``, assigning it the next free index on first sight."""
        if ch in self.char2index:
            self.char2count[ch] += 1
            return

        new_index = self.n_chars
        self.char2index[ch] = new_index
        self.char2count[ch] = 1
        self.index2char[new_index] = ch
        self.n_chars = new_index + 1

def prepare_vocab(pairs):
    """Build the source and target character vocabularies from ``pairs``.

    Args:
        pairs: iterable of two-item ``(source_word, target_word)`` entries.

    Returns:
        A ``(src_vocab, tgt_vocab)`` tuple; the first is named "Latin" and
        collects characters of the first item of each pair, the second is
        named "Native" and collects characters of the second item.
    """
    vocabs = (Vocab("Latin"), Vocab("Native"))
    for latin_word, native_word in pairs:
        # Source word first, then target word, for every pair.
        for vocab, word in zip(vocabs, (latin_word, native_word)):
            vocab.add_word(word)
    return vocabs

# Build both character vocabularies from the loaded pairs and report their
# sizes (n_chars of each vocabulary).
src_vocab, tgt_vocab = prepare_vocab(pairs)
for label, vocab in (("Latin vocab size:", src_vocab), ("Native vocab size:", tgt_vocab)):
    print(label, vocab.n_chars)


In [None]:
# Helper: Tensor Conversion
def word2tensor(vocab, word):
    """Encode ``word`` as a column of vocabulary indices terminated by EOS.

    Args:
        vocab: object whose ``char2index`` maps each character to an index.
        word: iterable of characters; every one must be a key of
            ``vocab.char2index`` (a missing character raises ``KeyError``).

    Returns:
        A ``torch.long`` tensor of shape ``(len(word) + 1, 1)`` on ``device``.
    """
    lookup = vocab.char2index
    ids = [lookup[symbol] for symbol in word]
    ids.append(EOS_token)
    flat = torch.tensor(ids, dtype=torch.long, device=device)
    return flat.view(len(ids), 1)


In [None]:
# Encoder Model
class EncoderRNN(nn.Module):
    """Single-layer recurrent encoder that consumes one token index per call.

    Args:
        input_size: number of rows in the embedding table (source vocab size).
        hidden_size: width of both the embedding and the recurrent state.
        cell_type: "LSTM" (default) or "GRU"; any other value selects a plain
            ``nn.RNN``.
    """

    def __init__(self, input_size, hidden_size, cell_type="LSTM"):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        if cell_type == "LSTM":
            self.rnn = nn.LSTM(hidden_size, hidden_size)
        elif cell_type == "GRU":
            self.rnn = nn.GRU(hidden_size, hidden_size)
        else:
            self.rnn = nn.RNN(hidden_size, hidden_size)

    def forward(self, input, hidden):
        """Advance the encoder by one token.

        Args:
            input: index tensor holding a single token.
            hidden: recurrent state as produced by ``init_hidden`` or by a
                previous call to ``forward``.

        Returns:
            ``(output, hidden)`` exactly as returned by the recurrent layer.
        """
        # The embedding is reshaped to (seq_len=1, batch=1, hidden_size).
        embedded = self.embedding(input).view(1, 1, -1)
        output, hidden = self.rnn(embedded, hidden)
        return output, hidden

    def init_hidden(self):
        """Return an all-zero initial state matching the recurrent layer.

        ``nn.LSTM`` requires an ``(h_0, c_0)`` tuple, whereas ``nn.GRU`` and
        ``nn.RNN`` take a single tensor. Returning a bare tensor for the LSTM
        case makes the first ``forward`` call fail, so the shape of the state
        is chosen from the layer that was actually built.
        """
        # Allocate on the module's own device so the state always matches the
        # parameters, wherever the module has been moved with ``.to(...)``.
        state_device = self.embedding.weight.device
        h0 = torch.zeros(1, 1, self.hidden_size, device=state_device)
        if isinstance(self.rnn, nn.LSTM):
            return h0, torch.zeros_like(h0)
        return h0


In [None]:
# Decoder Model
class DecoderRNN(nn.Module):
    """Single-layer recurrent decoder that emits one log-distribution per call.

    Args:
        hidden_size: width of both the embedding and the recurrent state.
        output_size: number of output classes (also the embedding table size).
        cell_type: "LSTM" (default) or "GRU"; any other value selects a plain
            ``nn.RNN``.
    """

    def __init__(self, hidden_size, output_size, cell_type="LSTM"):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(output_size, hidden_size)
        recurrent_cls = (
            nn.LSTM if cell_type == "LSTM"
            else nn.GRU if cell_type == "GRU"
            else nn.RNN
        )
        self.rnn = recurrent_cls(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Run one decoding step.

        Args:
            input: index tensor holding a single token.
            hidden: recurrent state in the form the chosen layer expects.

        Returns:
            ``(log_probs, hidden)`` where ``log_probs`` has shape
            ``(1, output_size)`` and ``hidden`` is the updated state.
        """
        # Embed the token, reshape to (1, 1, hidden_size), then apply ReLU.
        step_embedding = F.relu(self.embedding(input).view(1, 1, -1))
        rnn_out, hidden = self.rnn(step_embedding, hidden)
        # rnn_out[0] drops the length-1 sequence axis before the projection.
        log_probs = self.softmax(self.out(rnn_out[0]))
        return log_probs, hidden


In [None]:
# Training Step
# Probability that a training step feeds the gold target back into the decoder.
teacher_forcing_ratio = 0.5


def train_step(input_tensor, target_tensor, encoder, decoder,
               encoder_optimizer, decoder_optimizer, criterion, max_length=30):
    """Run one optimisation step on a single (input, target) example.

    The input is encoded token by token, the final encoder state seeds the
    decoder, and the decoder is unrolled over the target positions. With
    probability ``teacher_forcing_ratio`` the gold token is fed back at every
    step; otherwise the decoder's own top prediction is fed back and
    decoding stops as soon as that prediction is EOS.

    Args:
        input_tensor: source indices, one token per row.
        target_tensor: target indices, one token per row.
        encoder, decoder: the two models being trained.
        encoder_optimizer, decoder_optimizer: their optimizers.
        criterion: loss applied to each decoder output and gold token.
        max_length: accepted but not used by this function.

    Returns:
        The summed loss divided by the full target length (also when free
        running stopped early), as a Python float.
    """
    for optimizer in (encoder_optimizer, decoder_optimizer):
        optimizer.zero_grad()

    # Encode the source one token at a time, keeping only the final state.
    state = encoder.init_hidden()
    for position in range(input_tensor.size(0)):
        _, state = encoder(input_tensor[position], state)

    n_targets = target_tensor.size(0)
    next_input = torch.tensor([[SOS_token]], device=device)
    forced = random.random() < teacher_forcing_ratio

    total_loss = 0
    for position in range(n_targets):
        log_probs, state = decoder(next_input, state)
        gold = target_tensor[position]
        total_loss += criterion(log_probs, gold)

        if forced:
            # Teacher forcing: the next input is the gold token.
            next_input = gold
            continue

        # Free running: feed back the arg-max prediction, detached so no
        # gradient flows through the choice itself.
        next_input = log_probs.topk(1)[1].squeeze().detach()
        if next_input.item() == EOS_token:
            break

    total_loss.backward()
    encoder_optimizer.step()
    decoder_optimizer.step()

    return total_loss.item() / n_targets


In [None]:
# Training Loop
def train_iters(pairs, encoder, decoder, n_iters=1000, learning_rate=0.01, print_every=100):
    """Train on ``n_iters`` examples drawn at random (with replacement).

    Args:
        pairs: sequence of ``(source_word, target_word)`` examples.
        encoder, decoder: models to optimise; each gets its own SGD optimizer.
        n_iters: number of single-example training steps.
        learning_rate: learning rate shared by both optimizers.
        print_every: report the latest step's loss every this many steps.
    """
    criterion = nn.NLLLoss()
    encoder_optimizer, decoder_optimizer = (
        optim.SGD(module.parameters(), lr=learning_rate)
        for module in (encoder, decoder)
    )

    for completed in range(1, n_iters + 1):
        latin_word, native_word = random.choice(pairs)
        step_loss = train_step(
            word2tensor(src_vocab, latin_word),
            word2tensor(tgt_vocab, native_word),
            encoder, decoder,
            encoder_optimizer, decoder_optimizer, criterion,
        )

        if completed % print_every:
            continue
        # Only the most recent step's loss is shown, not a running average.
        print(f"Iter {completed}, Loss {step_loss:.4f}")


In [None]:
# Evaluation
def evaluate(encoder, decoder, word, max_length=30):
    """Greedily decode ``word`` and return the predicted string.

    Args:
        encoder, decoder: the models used for decoding.
        word: source word; each character must be in ``src_vocab``.
        max_length: maximum number of decoder steps.

    Returns:
        The predicted characters joined into a string. Decoding stops at the
        first EOS prediction (which is not included) or after ``max_length``
        steps.
    """
    with torch.no_grad():
        source = word2tensor(src_vocab, word)
        state = encoder.init_hidden()
        for position in range(source.size(0)):
            _, state = encoder(source[position], state)

        next_input = torch.tensor([[SOS_token]], device=device)
        pieces = []
        for _ in range(max_length):
            log_probs, state = decoder(next_input, state)
            best = log_probs.topk(1)[1]
            best_id = best.item()
            if best_id == EOS_token:
                break
            pieces.append(tgt_vocab.index2char[best_id])
            # Feed the chosen index back in as the next decoder input.
            next_input = best.squeeze().detach()

    return ''.join(pieces)


In [None]:
# Run Training + Test
hidden_size = 256

# Build both models with the default cell type and move them to `device`.
encoder = EncoderRNN(input_size=src_vocab.n_chars, hidden_size=hidden_size)
encoder = encoder.to(device)
decoder = DecoderRNN(hidden_size=hidden_size, output_size=tgt_vocab.n_chars)
decoder = decoder.to(device)

train_iters(pairs, encoder, decoder, n_iters=2000, print_every=200)

# Spot-check five predictions. The samples come from the same `pairs` list
# that was used for training, so this is not a held-out evaluation.
for word, tgt in random.sample(pairs, 5):
    pred = evaluate(encoder, decoder, word)
    print(f"{word} -> {pred} (target: {tgt})")
