## Importing libraries

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.legacy.data import Field, BucketIterator, TabularDataset
from torchtext.data.metrics import bleu_score
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

import numpy as np
import pandas as pd
import re

import spacy
import spacy.cli
import random

from sklearn.model_selection import train_test_split

In [2]:
spacy.cli.download('en_core_web_md')

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_md')


In [3]:
spacy_eng = spacy.load('en_core_web_md')

## Loading dataset

In [5]:
root_dir = '/content/drive/MyDrive/Datasets/Machine_Translation/Hindi_English_Truncated_Corpus.csv'

In [10]:
df = pd.read_csv(root_dir, encoding='utf-8')
df = df.dropna()
df = df[df['source']=='ted']
print(len(df))
print(df.head(10).iloc[:, 1:])

39881
                                     english_sentence                                     hindi_sentence
0   politicians do not have permission to do what ...  राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर...
1          I'd like to tell you about one such child,  मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...
3   what we really mean is that they're bad at not...     हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
7    And who are we to say, even, that they are wrong   और हम होते कौन हैं यह कहने भी वाले कि वे गलत हैं
13                   So there is some sort of justice                                   तो वहाँ न्याय है
23                                This changed slowly                               धीरे धीरे ये सब बदला
26                               were being produced.                           उत्पन्न नहीं कि जाती थी.
30        And you can see, this LED is going to glow.       और जैसा आप देख रहे है, ये एल.ई.डी. जल उठेगी।
32  to turn on the lights or to bring him a glass

In [11]:
print(df.iloc[0,1])
print(df.iloc[0,2])

politicians do not have permission to do what needs to be done.
राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह करने कि अनुमति नहीं है .


In [12]:
print(df.iloc[37554,1])
print(df.iloc[37554,2])

And they can afford to watch how we work -
और वह वहां कर सकते हैं हमारे काम के तरीके को समझने में -


## Create vocabulary

In [13]:
class Vocabulary():
  def __init__(self, threshold=2):
    self.threshold = threshold
    self.freqs = {}
    self.itos = {0: "<unk>", 1: "<pad>", 2: "<sos>", 3: "<eos>"}
    self.stoi = {"<unk>": 0, "<pad>": 1, "<sos>": 2, "<eos>": 3}
  
  def build_vocabulary(self, df, en):
    idx = 4
    if en:
      for i in range(len(df)):
        eng_text = df.iloc[i, 1]
        tokens = [tok.text.lower() for tok in spacy_eng.tokenizer(eng_text)]
        for token in tokens:
          if token not in self.freqs:
            self.freqs[token] = 1
          else:
            self.freqs[token] += 1
          if self.freqs[token] == self.threshold:
            self.stoi[token] = idx
            self.itos[idx] = token
            idx += 1
    else:
      for i in range(len(df)):
        hindi_text = df.iloc[i, 2]
        hindi_text = re.sub('r[?,:.]।', '', hindi_text)
        for token in hindi_text.split(' '):
          if token not in self.freqs:
            self.freqs[token] = 1
          else:
            self.freqs[token] += 1
          if self.freqs[token] == self.threshold:
            self.stoi[token] = idx
            self.itos[idx] = token
            idx += 1

  def numericalize(self, text, en):
    if en:
      tokens = [tok.text.lower() for tok in spacy_eng.tokenizer(text)]
    else:
      tokens = [token for token in text.lower().split(' ')]
    
    token_to_indices = [self.stoi[token] if token in self.stoi else self.stoi['<unk>'] for token in tokens]
    token_to_indices = [self.stoi['<sos>']] + token_to_indices + [self.stoi['<eos>']]
    return token_to_indices

In [14]:
en_vocab = Vocabulary()
hi_vocab = Vocabulary()

en_vocab.build_vocabulary(df, en=True)
hi_vocab.build_vocabulary(df, en=False)

In [16]:
input_vocab_size = len(en_vocab.stoi)
output_vocab_size = len(hi_vocab.stoi)
print(f'English vocab size: {input_vocab_size}')
print(f'Hindi vocab size: {output_vocab_size}')

English vocab size: 9133
Hindi vocab size: 11833


## Create Custom Dataset

In [17]:
class MTDataset(Dataset):
  def __init__(self, df, en_vocab, hi_voab):
    self.df = df
    self.en_vocab = en_vocab
    self.hi_vocab = hi_vocab

  def __len__(self):
    return len(df)
  
  def __getitem__(self, index):

    en_text = df.iloc[index, 1]
    hi_text = df.iloc[index, 2]

    en_numericalized = en_vocab.numericalize(en_text, en=True)
    hi_numericalized = hi_vocab.numericalize(hi_text, en=False)

    return torch.tensor(en_numericalized), torch.tensor(hi_numericalized)

In [18]:
class Collate():
  def __init__(self, pad_idx):
    self.pad_idx = pad_idx
  
  def __call__(self, batch):
    (en, hi) = zip(*batch)
    
    en_pad = pad_sequence(en, batch_first=False, padding_value=self.pad_idx)
    hi_pad = pad_sequence(hi, batch_first=False, padding_value=self.pad_idx)

    return en_pad, hi_pad

In [21]:
def get_loader(df, en_vocab, hi_vocab, batch_size=32, num_workers=2, shuffle=True, pin_memory=True):
  dataset = MTDataset(df, en_vocab, hi_vocab)
  train_size = int(0.8 * len(dataset))
  test_size = len(dataset) - train_size
  train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])

  pad_idx = en_vocab.stoi["<pad>"]
  train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, num_workers=num_workers, shuffle=shuffle, pin_memory=pin_memory, collate_fn=Collate(pad_idx))
  test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False, collate_fn=Collate(pad_idx))
  return train_loader, test_loader

In [22]:
train_loader, test_loader = get_loader(df, en_vocab, hi_vocab)

In [25]:
print(len(train_loader))
print(len(test_loader))

997
250


In [26]:
for idx, (en, hi) in enumerate(train_loader):
  # shape of en: (eng_source_len, batch_size)
  # shape of hi: (hindi_source_len, batch_size)
  break
for idx, (en, hi) in enumerate(test_loader):
  # shape of en: (eng_source_len, batch_size)
  # shape of hi: (hindi_source_len, batch_size)
  break

## Utilities

In [32]:
def trace(x, name, arg='shape'):
  if arg == 'shape':
    print(f'Shape of {name}: {x.shape}')

## Building Encoder Architecture

In [120]:
class Encoder(nn.Module):
  def __init__(self, input_size, embed_size, hidden_size, num_layers, drop_prob):
    super(Encoder, self).__init__()
    self.input_size = input_size
    self.embed_size = embed_size
    self.hidden_size = hidden_size
    self.num_layers = num_layers
    
    self.embedding = nn.Embedding(input_size, embed_size)
    self.rnn = nn.LSTM(embed_size, hidden_size, num_layers, bidirectional=True)
    
    self.fc_hidden = nn.Linear(hidden_size * 2, hidden_size)
    self.fc_cell = nn.Linear(hidden_size * 2, hidden_size)
    self.dropout = nn.Dropout(drop_prob)

  def forward(self, x):
    # shape of x: (seq_length, batch_size)
    embedding = self.dropout(self.embedding(x))
    # shape of embedding: (seq_length, batch_size, embed_size)

    encoder_states, (hidden , cell) = self.rnn(embedding)
  
    # shape of encoder_states: (seq_length, batch_size, hidden_size*2)
    # shape of hidden: (2*num_layers, batch_size, hidden_size)
    # shape of cell: (2*num_layers, batch_size, hidden_size)

    hidden = self.fc_hidden(torch.cat((hidden[0:1], hidden[1:2]), dim=2))
    cell = self.fc_hidden(torch.cat((cell[0:1], cell[1:2]), dim=2))
    # shape of hidden, cell: (num_layers, batch_size, hidden_size)

    return encoder_states, hidden, cell

In [121]:
# Testing encoder
X = torch.zeros((35, 32)).long()
encoder = Encoder(input_vocab_size, 300, 256, 1, drop_prob=0.5)
encoder_states, hidden, cell = encoder(X)
print(encoder_states.shape)
print(hidden.shape)
print(cell.shape)

torch.Size([35, 32, 512])
torch.Size([1, 32, 256])
torch.Size([1, 32, 256])


## Building Decoder Architecture

In [122]:
class Decoder(nn.Module):
  def __init__(self, input_size, embed_size, hidden_size, output_size, num_layers, drop_prob):
    super(Decoder, self).__init__()
    self.input_size = input_size
    self.embed_size = embed_size
    self.hidden_size = hidden_size
    self.num_layers = num_layers
    
    self.embedding = nn.Embedding(input_size, embed_size)
    self.rnn = nn.LSTM(hidden_size * 2 + embed_size, hidden_size, num_layers)

    self.energy = nn.Linear(hidden_size * 3, 1)
    self.fc = nn.Linear(hidden_size, output_size)
    self.dropout = nn.Dropout(drop_prob)
    self.softmax = nn.Softmax(dim=0)
    self.relu = nn.ReLU()

  def forward(self, x, encoder_states, hidden, cell):
    # shape of x: (batch_size)
    x = x.unsqueeze(0) # (1, batch_size)

    embedding = self.dropout(self.embedding(x))
    # shape of embedding: (1, batch_size, embed_size)

    seq_len = encoder_states.shape[0]
    
    h_reshaped = hidden.repeat(seq_len, 1, 1)
    # shape of h_reshaped: (seq_len, batch_size, hidden_size*2)
    
    energy = self.relu(self.energy(torch.cat((h_reshaped, encoder_states), dim=2)))
    # shape of energy: (seq_len, batch_size, 1)

    attention = self.softmax(energy)
    # shape of attention: (seq_len, batch_size, 1)

    attention = attention.permute(1,2,0)
    # shape of attention: (batch_size, 1, seq_len)

    encoder_states = encoder_states.permute(1,0,2)
    # shape of encoder_states: (batch_size, seq_len, hidden_size*2)

    context_vector = torch.bmm(attention, encoder_states)
    # shape of context_vector: (batch_size, 1, hidden_size * 2)

    # we want (1, batch_size, hidden_size * 2)
    context_vector = context_vector.permute(1,0,2)

    rnn_input = torch.cat([context_vector, embedding], dim=2)

    outputs, (hidden, cell) = self.rnn(rnn_input, (hidden,cell))
    # shape of outputs: (1, batch_size, hidden_size)

    predictions = self.fc(outputs)
    # shape of predictions: (1, batch_size, target_vocabulary_size)

    predictions = predictions.squeeze(0)
    # shape of predictions: (batch_size, target_vocabulary_size), since loss calculation needs this dimension

    return predictions, hidden, cell

## Seq2Seq 

In [123]:
class Seq2Seq(nn.Module):
  def __init__(self, encoder, decoder, output_vocab_size):
    super(Seq2Seq, self).__init__()
    self.encoder = encoder
    self.decoder = decoder
    self.output_vocab_size = output_vocab_size
  
  def forward(self, source, target, teacher_force_ratio=0.5):
    # shape of source: (num_steps_in_source, batch_size)
    # shape of target: (num_steps_in_target, batch_size)
    batch_size = source.shape[1]
    target_len = target.shape[0]
    target_vocab_size = self.output_vocab_size

    outputs = torch.zeros(target_len, batch_size, target_vocab_size).to(device)
    encoder_states, hidden, cell = self.encoder(source)

    # First input is <sos> token
    x = target[0]

    for t in range(1, target_len):
      output, hidden, cell = self.decoder(x, encoder_states, hidden, cell)

      # store predictions for current timestep
      outputs[t] = output

      # get the best word the decoder predicted 
      best_guess = output.argmax(1)

      x = target[t] if random.random() < teacher_force_ratio else best_guess
    
    return outputs

In [124]:
# Device config
device = 'cuda' if torch.cuda.is_available() else 'cpu'

## Hyperparameters

In [125]:
# Training hyperparameters
num_epochs = 100
learning_rate = 1e-3
batch_size = 32

In [126]:
# Model hyperparameters
input_size_encoder = len(en_vocab.stoi)
input_size_decoder = len(hi_vocab.stoi)
output_size = len(hi_vocab.stoi)
encoder_embed_size = 300
decoder_embed_size = 300
hidden_size = 1024 # according to paper
num_layers = 1
enc_dropout = 0.2
dec_dropout = 0.2

In [127]:
encoder = Encoder(input_size_encoder, encoder_embed_size, hidden_size, num_layers, enc_dropout).to(device)
decoder = Decoder(input_size_decoder, decoder_embed_size, hidden_size, output_size, num_layers, enc_dropout).to(device)

In [128]:
model = Seq2Seq(encoder, decoder, output_size).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

pad_idx = en_vocab.stoi["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

In [None]:
# Train
for epoch in range(num_epochs):
  running_loss = 0.0
  for batch_idx, (en, hi) in enumerate(train_loader):
    input = en.to(device)    # shape: (source_len, batch_size)
    target = hi.to(device)    # shape: (target_len, batch_size)
    
    output = model(input, target)
    # output shape: (target_len, batch_size, hindi_vocab_size)

    output = output[1:].reshape(-1, output.shape[-1])   # start from 1st index because at 0th index we have start token
    target = target[1:].reshape(-1)

    loss = criterion(output, target)
    
    optimizer.zero_grad()

    loss.backward()

    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
    optimizer.step()

    running_loss += loss.detach().cpu().item()

  print(f'Epoch: [{epoch+1} / {num_epochs}]\tLoss: {running_loss/len(train_loader):.6f}')

## Evaluation

In [145]:
def translate_sentence(model, sentence, en_vocab, hi_vocab, device, max_length=50):

  # Create tokens using spacy and everything in lower case (which is what our vocab is)
  if type(sentence) == str:
    tokens = [tok.text.lower() for tok in spacy_eng.tokenizer(sentence)]
  else:
    tokens = [token.lower() for token in sentence]


  # Add <SOS> and <EOS> in beginning and end respectively
  tokens.insert(0, '<sos>')
  tokens.append('<eos>')

  # Go through each english token and convert to an index
  text_to_indices = [en_vocab.stoi[token] if token in en_vocab.stoi else en_vocab.stoi['<unk>'] for token in tokens]

  # Convert to Tensor
  sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)   # (source_len x batch_size) => (tokens_length, 1)

  # Build encoder hidden, cell state
  with torch.no_grad():
      encoder_states, hidden, cell = model.encoder(sentence_tensor)

  outputs = [hi_vocab.stoi["<sos>"]]

  for _ in range(max_length):
      previous_word = torch.LongTensor([outputs[-1]]).to(device) # shape: (1)

      with torch.no_grad():
          output, hidden, cell = model.decoder(previous_word, encoder_states, hidden, cell) # output  shape: (batch_size, target_vocab_size) => (1, hindi_vocab_size)
          best_guess = output.argmax(1).item()

      outputs.append(best_guess)

      # Model predicts it's the end of the sentence
      if output.argmax(1).item() == hi_vocab.stoi["<eos>"]:
          break

  translated_sentence = [hi_vocab.itos[idx] for idx in outputs]

  # remove start token
  return translated_sentence[1:]

In [156]:
sentence = 'Can you imagine that?'
translate_sentence(model, sentence, en_vocab, hi_vocab, device, max_length=50)

['क्या', 'आप', 'ऐसा', 'कर', 'सकते', 'हैं?', '<eos>']