# Sample Transliteration Task
### Training transformers for Hindi-to-English transliteration on a small sample of word pairs

In [5]:
from tqdm.notebook import tqdm
import torch
from torch import nn
from transformer import Transformer
import random

In [6]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# device = 'cpu'

## Dataset

In [7]:
# Toy parallel corpus: Devanagari word -> upper-case Latin transliteration.
# Insertion order matters: later cells split it into aligned source/target lists.
dataset = {
    'रासविहारी': 'RASVIHARI',
    'देवगन': 'DEVGAN',
    'रोड': 'ROAD',
    'शत्रुमर्दन': 'SHATRUMARDAN',
    'महिजुबा': 'MAHIJUBA',
    'सैबिन': 'SAIBIN',
    'बिल': 'BILL',
    'कॉस्बी': 'COSBY',
    'रिश्ता': 'RISTA',
    'कागज़': 'KAGAZ',
    'का': 'KA',
    'हातिम': 'HATIM',
    'श्रीमयी': 'SRIMAYI',
    'फरीहाह': 'FARIHAH',
    'मैरीटाइम': 'MARITIME',
    'म्युज़ियम': 'MUSIUM',
    'ऑफ': 'OF',
    'ग्रीस': 'GREECE',
    'मंथन': 'MANTHAN',
    'फ्रेंकोरशियन': 'FRANCORUSSIAN',
    'वार': 'BAR',
    'तन्मया': 'TANMYA',
    'मल्ली': 'MALLI',
    'केलीमुटु': 'KELIMUTU',
    'मुटाटकर': 'MUTATAKAR',
    'गंगा': 'GANGA',
    'मैया': 'MAIYA',
    'फरीदाह': 'FARIDAH',
    'तहमीना': 'TAHMEENA',
    'दुर्रानी': 'DURANII',
    'डान्यूब': 'DANUBE',
    'बलील': 'BALEEL',
}

In [8]:
# English vocabulary: three special tokens (ids 0-2) followed by A-Z (ids 3-28).
english_alphabets = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'

eng_vocab = {'<pad>': 0, '<sos>': 1, '<eos>': 2}
eng_vocab.update(
    (letter, token_id)
    for token_id, letter in enumerate(english_alphabets, start=3)
)

In [9]:
# Hindi vocabulary: three special tokens (ids 0-2) followed by one id per
# code point in the range 2304..2435 (U+0900..U+0983), numbered from 3 upwards.
hin_vocab = {'<pad>': 0, '<sos>': 1, '<eos>': 2}
hin_vocab.update(
    (chr(code_point), token_id)
    for token_id, code_point in enumerate(range(2304, 2436), start=3)
)

In [10]:
# Split the corpus into two index-aligned lists: Hindi inputs and English outputs.
source = list(dataset)
target = list(dataset.values())

## Preprocessing

In [11]:
def encode_sequence(sequence, vocab, max_len):
    '''Turn one string into a 1-D LongTensor of token ids.

    The result is ``<sos>``, one id per character of ``sequence``, then
    ``<eos>``, right-padded with ``<pad>`` up to ``max_len`` entries.
    A sequence that already reaches or exceeds ``max_len`` once the two
    markers are added is returned as-is (no padding, no truncation).

    Raises KeyError if a character (or a special token) is missing from ``vocab``.
    '''
    token_ids = [vocab['<sos>'], *(vocab[ch] for ch in sequence), vocab['<eos>']]

    shortfall = max_len - len(token_ids)
    if shortfall > 0:
        token_ids += [vocab['<pad>']] * shortfall

    return torch.LongTensor(token_ids)

In [12]:
def encode(sequences, vocab):
    '''Encode a list of strings into a single 2-D LongTensor.

    Every sequence is padded to the length of the longest one plus two
    (room for the ``<sos>`` and ``<eos>`` markers), so all rows share one
    width and can be stacked.
    '''
    padded_len = max(len(seq) for seq in sequences) + 2
    rows = [encode_sequence(seq, vocab, padded_len) for seq in sequences]
    return torch.stack(rows)

In [13]:
# Encode both sides of the corpus, then move the tensors to the chosen device.
X = encode(source, hin_vocab)
y = encode(target, eng_vocab)
X, y = X.to(device), y.to(device)

In [14]:
# Both vocabularies map '<pad>' to 0, so one padding index serves source and target.
pad_idx = hin_vocab['<pad>']
src_vocab_size, trg_vocab_size = len(hin_vocab), len(eng_vocab)

In [15]:
# First model variant, built with the constructor's default `pos_embed` setting.
# pad_idx is passed twice — presumably the source and target padding indices;
# confirm against transformer.Transformer. max_len=16 matches the max_len used
# when encoding inputs in the prediction cell.
# NOTE(review): no random seed is set in the visible notebook, so weights (and
# the recorded predictions) will differ between runs.
model_embed = Transformer(src_vocab_size, trg_vocab_size,
                    pad_idx, pad_idx, max_len=16, device=device).to(device)

In [16]:
# Second model variant: same sizes and padding index, but with pos_embed=False.
# Presumably this swaps the learned positional embedding for a value-based
# positional signal — confirm against transformer.Transformer.
model_enc = Transformer(src_vocab_size, trg_vocab_size,
                    pad_idx, pad_idx, max_len=16, device=device, pos_embed=False).to(device)

## Training

In [17]:
# Fit the first variant on the whole encoded corpus (there is no held-out split).
# NOTE(review): if Transformer subclasses nn.Module, this train(X, y) signature
# shadows nn.Module.train(mode) — confirm in transformer.py.
model_embed.train(X, y)

  0%|          | 0/200 [00:00<?, ?it/s]

In [18]:
# Fit the pos_embed=False variant on the same encoded corpus for comparison.
model_enc.train(X, y)

  0%|          | 0/200 [00:00<?, ?it/s]

## Prediction

In [19]:
def decode_sequence(sequence, vocab):
    '''Map a sequence of token ids back to a string.

    Ids 0, 1 and 2 (the ``<pad>``, ``<sos>`` and ``<eos>`` slots in the
    vocabularies built above) are skipped; every other id is looked up in
    the inverted ``vocab`` and the resulting tokens are concatenated.
    '''
    id_to_token = {token_id: token for token, token_id in vocab.items()}
    return ''.join(id_to_token[token_id] for token_id in sequence if token_id > 2)

In [20]:
# Compare both trained variants on every pair in the corpus.
# Iterating dataset.items() directly removes the hard-coded pair count (32) and
# avoids rebuilding list(dataset.items()) twice per iteration.
for hindi_word, english_word in dataset.items():
    # Batch of one for the model; the reference stays a plain list of ids.
    testX = encode_sequence(hindi_word, hin_vocab, max_len=16).unsqueeze(0).to(device)
    testY = encode_sequence(english_word, eng_vocab, max_len=16).tolist()

    print('=' * 10)
    # Same report for each variant: model_embed first, then model_enc.
    for trained_model in (model_embed, model_enc):
        preds = trained_model.predict(testX, device)
        print("Input:", decode_sequence(testX.squeeze(0).tolist(), hin_vocab))
        print("Prediction:", decode_sequence(preds, eng_vocab))
        print("Ground Truth:", decode_sequence(testY, eng_vocab))

        print()

Input: रासविहारी
Prediction: SARIHAH
Ground Truth: RASVIHARI

Input: रासविहारी
Prediction: RASVIHARI
Ground Truth: RASVIHARI

Input: देवगन
Prediction: DEVGAN
Ground Truth: DEVGAN

Input: देवगन
Prediction: DEVGAN
Ground Truth: DEVGAN

Input: रोड
Prediction: ROAD
Ground Truth: ROAD

Input: रोड
Prediction: ROAD
Ground Truth: ROAD

Input: शत्रुमर्दन
Prediction: SHATRUMARDAN
Ground Truth: SHATRUMARDAN

Input: शत्रुमर्दन
Prediction: MANCORDAN
Ground Truth: SHATRUMARDAN

Input: महिजुबा
Prediction: MAHIJUBA
Ground Truth: MAHIJUBA

Input: महिजुबा
Prediction: MAHIJUBA
Ground Truth: MAHIJUBA

Input: सैबिन
Prediction: SAIBIN
Ground Truth: SAIBIN

Input: सैबिन
Prediction: SAIBIN
Ground Truth: SAIBIN

Input: बिल
Prediction: BILL
Ground Truth: BILL

Input: बिल
Prediction: BIL
Ground Truth: BILL

Input: कॉस्बी
Prediction: COSBY
Ground Truth: COSBY

Input: कॉस्बी
Prediction: COSBY
Ground Truth: COSBY

Input: रिश्ता
Prediction: SRIMATI
Ground Truth: RISTA

Input: रिश्ता
Prediction: RISTA
Ground Truth: R

## Visualize Embeddings

In [21]:
from torch.utils.tensorboard import SummaryWriter
import numpy as np

In [22]:
# TensorBoard writer for the embedding projections logged below; it is closed in
# the final cell of the notebook.
writer = SummaryWriter('runs/eng5')

In [23]:
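# NOTE: `model` is not defined in this notebook; substitute `model_embed` or
# `model_enc` before uncommenting.
# writer.add_embedding(model.encoder.word_embedding.weight,
#                      metadata = hin_vocab.keys(),
#                      tag = f'word embedding')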

In [24]:
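# NOTE: `model` is not defined in this notebook; substitute a trained model that
# has a `positional_embedding` (e.g. `model_embed`) before uncommenting.
# writer.add_embedding(model.encoder.positional_embedding.weight,
#                      metadata = np.arange(model.encoder.positional_embedding.weight.shape[0]),
#                      tag = f'position embedding')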

In [25]:
# AAA with positional embeddings
A_pos = [model_embed.decoder.word_embedding.weight[24] + model_embed.decoder.positional_embedding.weight[i] for i in range(16)]

# AAA with 1/16, 2/16, 3/16
A_val = [model_enc.decoder.word_embedding.weight[24] + (i+1 / 16) for i in range(16)]

In [26]:
# Log all 32 vectors with a distinct label per point (A_pos_1..16, then A_val_1..16).
writer.add_embedding(
    torch.stack([*A_pos, *A_val]),
    metadata=[f'{kind}_{position}' for kind in ('A_pos', 'A_val') for position in range(1, 17)],
    tag='Positional Embed vs Value_Labelled',
)

In [27]:
# Log the same 32 vectors with only two labels so the projector can colour by group.
writer.add_embedding(
    torch.stack([*A_pos, *A_val]),
    metadata=['A_pos'] * 16 + ['A_val'] * 16,
    tag='Positional Embed vs Value_Color Coded',
)

In [28]:
# Summary statistics of each offset vector in A_val, one group of lines per position.
for position in range(16):
    vector = A_val[position]
    print('\n', position)
    for label, reducer in (('Mean', 'mean'), ('Std', 'std'), ('Min', 'min'), ('Max', 'max')):
        print(label, getattr(vector, reducer)())


 0
Mean tensor(0.0167, device='cuda:0', grad_fn=<MeanBackward0>)
Std tensor(0.9525, device='cuda:0', grad_fn=<StdBackward>)
Min tensor(-3.0886, device='cuda:0', grad_fn=<MinBackward1>)
Max tensor(2.3729, device='cuda:0', grad_fn=<MaxBackward1>)

 1
Mean tensor(1.0167, device='cuda:0', grad_fn=<MeanBackward0>)
Std tensor(0.9525, device='cuda:0', grad_fn=<StdBackward>)
Min tensor(-2.0886, device='cuda:0', grad_fn=<MinBackward1>)
Max tensor(3.3729, device='cuda:0', grad_fn=<MaxBackward1>)

 2
Mean tensor(2.0167, device='cuda:0', grad_fn=<MeanBackward0>)
Std tensor(0.9525, device='cuda:0', grad_fn=<StdBackward>)
Min tensor(-1.0886, device='cuda:0', grad_fn=<MinBackward1>)
Max tensor(4.3729, device='cuda:0', grad_fn=<MaxBackward1>)

 3
Mean tensor(3.0167, device='cuda:0', grad_fn=<MeanBackward0>)
Std tensor(0.9525, device='cuda:0', grad_fn=<StdBackward>)
Min tensor(-0.0886, device='cuda:0', grad_fn=<MinBackward1>)
Max tensor(5.3729, device='cuda:0', grad_fn=<MaxBackward1>)

 4
Mean tensor(

In [29]:
# Close the TensorBoard writer opened earlier in the notebook.
writer.close()