<a href="https://colab.research.google.com/github/myidispg/NLP-Projects/blob/master/Neural_Machine_Translation_English_Hindi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# My first Google Colab notebook for Neural Machine Translation in PyTorch.
---



### **The necessary imports and running the code on GPU if available.**
GPU is necessary for faster model training.

In [7]:
import requests
import tarfile
import os
from os import path

import random
import re
import unicodedata

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Reserved vocabulary indices for the start-of-sentence and end-of-sentence markers.
SOS_token = 0
EOS_token = 1

# Last expression of the cell: displays the selected device.
device

device(type='cuda')

## Download and save the dataset

In [0]:
# URL of the parallel-corpus archive that is saved locally as 'parallel.tgz'.
dataset_url = 'http://www.cfilt.iitb.ac.in/iitb_parallel/iitb_corpus_download/parallel.tgz'
# tgz_file = requests.get(dataset_url, stream=True)

# if path.exists("parallel.tgz"):
#   os.remove('parallel.tgz')
#   print('Removed the existing copy')

# with open("parallel.tgz", "wb") as f:
#   for chunk in tgz_file.iter_content(chunk_size=1024):
#     if chunk:
#       f.write(chunk)
      
# if path.exists("parallel.tgz"):
#   print('File saved successfully.')


In [9]:
# Always fetch a fresh copy of the archive: remove any existing file first,
# then download. The download steps are identical whether or not a previous
# copy existed, so they are written once instead of once per branch.
from urllib.request import urlretrieve

if path.exists('parallel.tgz'):
  print('Deleting existing copy.')
  os.remove('parallel.tgz')

print('Downloading dataset...')
urlretrieve(dataset_url, 'parallel.tgz')
print('Dataset downloaded successfully!')


Deleting existing copy.
Downloading dataset...
Dataset downloaded successfully!


In [10]:
# File objects for every extractable member of the archive, in archive order.
data_list = []

# The archive handle is intentionally left open: the file objects collected
# here are only read in a later cell (via readlines()).
tar = tarfile.open("parallel.tgz")
for member in tar.getmembers():
  f = tar.extractfile(member)
  # extractfile() yields None for members that are not regular files or links
  # (e.g. directories), so those are skipped.
  if f is not None:
    data_list.append(f)

data_list
  

[<ExFileObject name='parallel.tgz'>, <ExFileObject name='parallel.tgz'>]

### Define a class for a Language
This class will contain the word2idx, idx2word, number of words in the vocabulary and max sentence length of that language.

In [0]:
class Lang:
    """Vocabulary for a single language.

    Keeps word <-> index mappings, per-word occurrence counts, the vocabulary
    size (``n_words``) and the longest sentence seen so far
    (``max_sent_length``, measured in space-separated tokens). Indices 0 and 1
    are pre-assigned to the "SOS" and "EOS" markers.
    """

    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # SOS and EOS already occupy indices 0 and 1
        self.max_sent_length = 1

    def addSentence(self, sentence):
        """Register every space-separated token of ``sentence`` and update the length record."""
        tokens = sentence.split(' ')
        self.max_sent_length = max(self.max_sent_length, len(tokens))
        for token in tokens:
            self.addWord(token)

    def addWord(self, word):
        """Count ``word``; on first sight, give it the next free index."""
        if word in self.word2index:
            self.word2count[word] += 1
            return

        index = self.n_words
        self.word2index[word] = index
        self.word2count[word] = 1
        self.index2word[index] = word
        self.n_words = index + 1
            
            
# Start with empty vocabularies (only SOS/EOS); words are added once the corpus lines are read.
hindi_lang = Lang('hindi')
english_lang = Lang('english')


### Use the .tgz file and read the ExFileObject to get the list of lines and read in utf-8 format

In [12]:
# Turn a Unicode string to plain ASCII, thanks to
# https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    """Return ``s`` in NFD form with every combining mark (category 'Mn') removed."""
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

# Lowercase, trim, and remove non-letter characters


def normalizeString(s):
    """Lowercase ``s`` and reduce it to ASCII letters plus ``.``, ``!`` and ``?``.

    Steps: lowercase and trim, strip accents via ``unicodeToAscii``, put a
    space in front of each ``.``/``!``/``?`` so punctuation becomes its own
    token, then collapse every run of other characters into a single space.

    The result is stripped again at the end: replacing leading or trailing
    non-letter characters (digits, brackets, ...) leaves a space at the edge of
    the string, and since sentences are tokenised with ``split(' ')`` that
    space would turn into an empty-string "word" in the vocabulary.
    """
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s.strip()

# data_list[1] is decoded as UTF-8 and normalized to build the English lines;
# data_list[0] is decoded as UTF-8 and kept as-is for the Hindi lines.
bytecode_lines = data_list[1].readlines()
english_lines = [
    normalizeString(raw.decode('utf-8').strip('\n')) for raw in bytecode_lines
]

bytecode_lines = data_list[0].readlines()
hindi_lines = [raw.decode('utf-8').strip('\n') for raw in bytecode_lines]

print(f' The first 10 english lines are- {english_lines[:10]}')
print(f' The first 10 hindi lines are- {hindi_lines[:10]}')

 The first 10 english lines are- ['give your application an accessibility workout', 'accerciser accessibility explorer', 'the default plugin layout for the bottom panel', 'the default plugin layout for the top panel', 'a list of plugins that are disabled by default', 'highlight duration', 'the duration of the highlight box when selecting accessible nodes', 'highlight border color', 'the color and opacity of the highlight border .', 'highlight fill color']
 The first 10 hindi lines are- ['अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें', 'एक्सेर्साइसर पहुंचनीयता अन्वेषक', 'निचले पटल के लिए डिफोल्ट प्लग-इन खाका', 'ऊपरी पटल के लिए डिफोल्ट प्लग-इन खाका', 'उन प्लग-इनों की सूची जिन्हें डिफोल्ट रूप से निष्क्रिय किया गया है', 'अवधि को हाइलाइट रकें', 'पहुंचनीय आसंधि (नोड) को चुनते समय हाइलाइट बक्से की अवधि', 'सीमांत (बोर्डर) के रंग को हाइलाइट करें', 'हाइलाइट किए गए सीमांत का रंग और अपारदर्शिता। ', 'भराई के रंग को हाइलाइट करें']


### Some helper functions to read the data, create pairs and the language vocabularies.

In [13]:
def addWordsToLang(lang, lines):
    """Feed every sentence in ``lines`` to ``lang.addSentence`` and return ``lang``."""
    for sentence in lines:
        lang.addSentence(sentence)
    return lang

def create_pairs(lang1, lang2):
    """Zip two sentence sequences into a list of ``[lang1_sentence, lang2_sentence]`` lists.

    Pairing stops at the end of the shorter sequence.
    """
    return [[first, second] for first, second in zip(lang1, lang2)]

def createLanguagesAndPairs(lang1_lines, lang2_lines, lang1, lang2):
    """Build the sentence pairs and populate both vocabularies.

    Returns ``(pairs, lang1, lang2)`` where ``pairs`` comes from
    ``create_pairs`` and each language object has been fed its own lines
    through ``addWordsToLang``.
    """
    print('Creating pairs...')
    pairs = create_pairs(lang1_lines, lang2_lines)

    print('Adding words to languages')
    filled_lang1 = addWordsToLang(lang1, lang1_lines)
    filled_lang2 = addWordsToLang(lang2, lang2_lines)
    print('Done creating languages')

    return pairs, filled_lang1, filled_lang2
  
pairs, english_lang, hindi_lang = createLanguagesAndPairs(english_lines, hindi_lines, english_lang, hindi_lang)

print(f'The first 10 pairs are- {pairs[:10]}\n')

# Longest sentence (in tokens) across both languages, plus one slot for the
# EOS token that tensorFromSentence appends to every sentence. Without the
# extra slot, the tensor of the longest sentence has MAX_LENGTH + 1 rows and
# writing its last encoder output into a (max_length, hidden_size) buffer
# indexes one row past the end.
MAX_LENGTH = max(english_lang.max_sent_length, hindi_lang.max_sent_length) + 1
print(f'No of words in english: {english_lang.n_words}, No of words in hindi: {hindi_lang.n_words}, Max length of sentence in both: {MAX_LENGTH}')


Creating pairs...
Adding words to languages
Done creating languages
The first 10 pairs are- [['give your application an accessibility workout', 'अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें'], ['accerciser accessibility explorer', 'एक्सेर्साइसर पहुंचनीयता अन्वेषक'], ['the default plugin layout for the bottom panel', 'निचले पटल के लिए डिफोल्ट प्लग-इन खाका'], ['the default plugin layout for the top panel', 'ऊपरी पटल के लिए डिफोल्ट प्लग-इन खाका'], ['a list of plugins that are disabled by default', 'उन प्लग-इनों की सूची जिन्हें डिफोल्ट रूप से निष्क्रिय किया गया है'], ['highlight duration', 'अवधि को हाइलाइट रकें'], ['the duration of the highlight box when selecting accessible nodes', 'पहुंचनीय आसंधि (नोड) को चुनते समय हाइलाइट बक्से की अवधि'], ['highlight border color', 'सीमांत (बोर्डर) के रंग को हाइलाइट करें'], ['the color and opacity of the highlight border .', 'हाइलाइट किए गए सीमांत का रंग और अपारदर्शिता। '], ['highlight fill color', 'भराई के रंग को हाइलाइट करें']]

No of words in engl

## Create Encoder RNN
Create an Encoder RNN. 
It takes the input size which is the number of words in the input language vocabulary.
The other argument is the hidden state dimension. The dimension of the embedding is the same as the hidden state dimension.



![The Encoder RNN Image](https://pytorch.org/tutorials/_images/encoder-network.png)

In [0]:
class EncoderRNN(nn.Module):
  """GRU encoder that consumes one token index per forward call.

  The embedding width equals ``hidden_size`` so that embeddings can be fed
  to the GRU directly.
  """

  def __init__(self, input_size, hidden_size):
    super().__init__()
    self.hidden_size = hidden_size

    # input_size embedding rows, each hidden_size wide
    self.embedding = nn.Embedding(input_size, hidden_size)
    # GRU input width is hidden_size because it receives the embeddings above
    self.gru = nn.GRU(hidden_size, hidden_size)

  def forward(self, input, hidden):
    """Embed ``input``, reshape it to (1, 1, -1) and advance the GRU once.

    Returns the ``(output, hidden)`` pair produced by the GRU.
    """
    step_input = self.embedding(input).view(1, 1, -1)
    return self.gru(step_input, hidden)

  def initHidden(self):
    """Return an all-zero hidden state of shape (1, 1, hidden_size) on ``device``."""
    return torch.zeros(1, 1, self.hidden_size, device=device)


## Create Decoder RNN
Create the DecoderRNN. It takes the hidden unit dimensions and the number of words in the output language vocabulary.


![DecoderRNN architecture](https://pytorch.org/tutorials/_images/decoder-network.png)

In [0]:
class DecoderRNN(nn.Module):
  """GRU decoder that emits log-probabilities over the output vocabulary, one token per call."""

  def __init__(self, hidden_size, output_size):
    super().__init__()
    self.hidden_size = hidden_size
    self.embedding = nn.Embedding(output_size, hidden_size)
    self.gru = nn.GRU(hidden_size, hidden_size)
    # Projects a GRU output vector (hidden_size) to one score per output-vocabulary entry (output_size).
    self.out = nn.Linear(hidden_size, output_size)
    self.softmax = nn.LogSoftmax(dim=1)

  def forward(self, input, hidden_state):
    """Run one decoding step.

    The input token is embedded, reshaped to (1, 1, -1), passed through ReLU
    and the GRU; the GRU output is projected and log-softmaxed.
    Returns ``(log_probs, hidden_state)``.
    """
    embedded = self.embedding(input).view(1, 1, -1)
    activated = F.relu(embedded)
    gru_out, hidden_state = self.gru(activated, hidden_state)
    log_probs = self.softmax(self.out(gru_out[0]))
    return log_probs, hidden_state

  def initHidden(self):
    """Return an all-zero hidden state of shape (1, 1, hidden_size) on ``device``."""
    return torch.zeros(1, 1, self.hidden_size, device=device)

## Training section.
Some functions to create a sequence of inputs for each sentence pair.


In [0]:
def indexesFromSentence(lang, sentence):
    """Map each space-separated token of ``sentence`` to its index in ``lang.word2index``.

    Raises ``KeyError`` for a token that is not in the vocabulary.
    """
    tokens = sentence.split(' ')
    return [lang.word2index[token] for token in tokens]
    
def tensorFromSentence(lang, sentence):
    """Encode ``sentence`` as a long tensor of shape (n_tokens + 1, 1) on ``device``.

    The token indices are followed by ``EOS_token``.
    """
    token_ids = indexesFromSentence(lang, sentence) + [EOS_token]
    flat = torch.tensor(token_ids, dtype=torch.long, device=device)
    return flat.view(-1, 1)

def tensorFromPairs(pair):
    """Turn a sentence pair into ``(input_tensor, output_tensor)``.

    ``pair[0]`` is encoded with the English vocabulary and ``pair[1]`` with the
    Hindi vocabulary.
    """
    source = tensorFromSentence(english_lang, pair[0])
    target = tensorFromSentence(hindi_lang, pair[1])
    return source, target

# # This section is for testing the outputs of the Encoder
# input_tensor, output_tensor = tensorFromPairs(pairs[0])

# HIDDEN_DIM = 256
# encoder = EncoderRNN(english_lang.n_words, HIDDEN_DIM).to(device)
# decoder = DecoderRNN(HIDDEN_DIM, hindi_lang.n_words).to(device)

# encoder_hidden = encoder.initHidden()

# encoder_optimizer = optim.SGD(encoder.parameters(), lr=0.01)
# decoder_optimizer = optim.SGD(decoder.parameters(), lr=0.01)

# encoder_optimizer.zero_grad()
# decoder_optimizer.zero_grad()

# encoder_output, encoder_hidden = encoder(input_tensor[0], encoder_hidden)

# print(f'encoder_output- \n{encoder_output}\nencoder_hidden- {encoder_hidden}\n')


In [18]:
# Sanity check: convert one pair (index 1000) to index tensors and print them.
input_tensor, output_tensor = tensorFromPairs(pairs[1000])
print(f'Input tensor- {input_tensor}\nOutput tensor- {output_tensor}')

Input tensor- tensor([[26],
        [27],
        [ 1]], device='cuda:0')
Output tensor- tensor([[31],
        [ 4],
        [32],
        [33],
        [ 1]], device='cuda:0')


**Teacher forcing** is the concept of using the real target outputs as each next input,  instead of using the decoder’s guess as the next input. Using teacher forcing  causes it to converge faster but when the trained network is exploited, it may exhibit instability.

In [0]:
# Probability that a training step feeds the true target token (rather than
# the decoder's own prediction) as the next decoder input.
teacher_forcing_ratio = 0.5


def train(input_tensor, output_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    """Run one optimisation step on a single sentence pair.

    Args:
        input_tensor: source token indices; iterated along dimension 0.
        output_tensor: target token indices; iterated along dimension 0.
        encoder, decoder: the two networks being trained.
        encoder_optimizer, decoder_optimizer: optimizers stepped after backprop.
        criterion: loss applied to each decoder output and the matching target token.
        max_length: unused by this decoder (it takes no encoder outputs);
            kept so existing calls that pass it keep working.

    Returns:
        The summed loss divided by the number of target tokens.
    """
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.shape[0]
    output_length = output_tensor.shape[0]

    # Only the final hidden state is handed to the decoder, so per-step
    # encoder outputs are not stored.
    for ei in range(input_length):
        _, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)

    decoder_input = torch.tensor([[SOS_token]], device=device)
    decoder_hidden = encoder_hidden

    # Decide once per sentence whether to use teacher forcing.
    use_teacher_forcing = random.random() < teacher_forcing_ratio

    loss = 0
    for di in range(output_length):
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
        loss += criterion(decoder_output, output_tensor[di])

        if use_teacher_forcing:
            # Teacher forcing: next input is the ground-truth token.
            decoder_input = output_tensor[di]
        else:
            # Free running: next input is the decoder's own best guess,
            # detached so gradients do not flow through the choice.
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()
            if decoder_input.item() == EOS_token:
                break

    loss.backward()
    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / output_length

### Some functions to find time elapsed
These functions help to calculate the elapsed time and the remaining time.


In [0]:
import time
import math


def asMinutes(s):
    """Format a duration of ``s`` seconds as ``'<minutes>m <seconds>s'``."""
    minutes = math.floor(s / 60)
    seconds = s - minutes * 60
    return '%dm %ds' % (minutes, seconds)


def timeSince(since, percent):
    """Return ``'<elapsed> (- <remaining>)'`` for a job started at ``since``.

    ``percent`` is the fraction of work completed; the total duration is
    extrapolated as elapsed / percent.
    """
    elapsed = time.time() - since
    estimated_total = elapsed / (percent)
    remaining = estimated_total - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))

### Function to plot the losses


In [0]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np


def showPlot(points):
    """Plot ``points`` as a line on a fresh figure with y-axis ticks every 0.2.

    A single figure is created via ``plt.subplots()``; a separate
    ``plt.figure()`` call beforehand would only open an additional empty
    figure that is never drawn on.
    """
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    ax.plot(points)
    

### This function goes through all the pairs and calls the train() function.

In [0]:
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=10000, learning_rate=0.01):
  """Train on ``n_iters`` randomly chosen sentence pairs with plain SGD.

  Every ``print_every`` steps the elapsed/remaining time and the mean loss
  since the last report are printed; every ``plot_every`` steps the mean loss
  is recorded, and the recorded values are plotted at the end. The loss
  function is read from the module-level ``criterion``.
  """
  start = time.time()
  plot_losses = []
  print_loss_total = 0
  plot_loss_total = 0

  training_pairs = [tensorFromPairs(random.choice(pairs)) for _ in range(n_iters)]
  print(f'The number of training_pairs is {len(training_pairs)}\n\n\n')

  encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
  decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)

  for step, (input_tensor, target_tensor) in enumerate(training_pairs, start=1):
    loss = train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length = MAX_LENGTH)

    print_loss_total += loss
    plot_loss_total += loss

    if step % print_every == 0:
      progress = step / n_iters
      print_loss_avg = print_loss_total / print_every
      print_loss_total = 0
      print('%s (%d %d%%) %.4f' % (timeSince(start, progress),
                                         step, progress * 100, print_loss_avg))

    if step % plot_every == 0:
      plot_losses.append(plot_loss_total / plot_every)
      plot_loss_total = 0

  showPlot(plot_losses)

## Evaluation



In [0]:
def evaluate(encoder, decoder, sentence, max_length = MAX_LENGTH):
  """Greedily translate ``sentence`` (English) into a list of Hindi words.

  Args:
      encoder, decoder: the trained networks.
      sentence: space-separated English sentence; every token must already be
          in ``english_lang`` (unknown tokens raise ``KeyError``).
      max_length: maximum number of decoding steps.

  Returns:
      ``(decoded_words, attentions)``. ``decoded_words`` ends with ``'<EOS>'``
      when the decoder emitted EOS within ``max_length`` steps. ``attentions``
      is always ``None`` because ``DecoderRNN`` has no attention mechanism; it
      is kept so callers can still unpack two values.
  """
  with torch.no_grad():
    # The vocabularies in this notebook are english_lang / hindi_lang.
    input_tensor = tensorFromSentence(english_lang, sentence)
    input_length = input_tensor.shape[0]
    encoder_hidden = encoder.initHidden()

    for ei in range(input_length):
      _, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)

    decoder_input = torch.tensor([[SOS_token]], device=device)
    decoder_hidden = encoder_hidden

    decoded_words = []

    for di in range(max_length):
      # DecoderRNN.forward takes (input, hidden) and returns (output, hidden).
      decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
      topv, topi = decoder_output.topk(1)
      if topi.item() == EOS_token:
        decoded_words.append('<EOS>')
        break
      decoded_words.append(hindi_lang.index2word[topi.item()])
      decoder_input = topi.squeeze().detach()

    return decoded_words, None
      
def evaluateRandomly(encoder, decoder, n=10):
    """Translate ``n`` randomly chosen pairs and print source (>), reference (=) and output (<)."""
    for _ in range(n):
        sample = random.choice(pairs)
        print('>', sample[0])
        print('=', sample[1])
        output_words, attentions = evaluate(encoder, decoder, sample[0])
        print('<', ' '.join(output_words))
        print('')

## Training the Seq2Seq Model now.

In [0]:
# Width shared by the embeddings and the GRU hidden state in both networks.
HIDDEN_DIM = 32

print(f'Executing training on: {device}')
# Encoder vocabulary is English, decoder vocabulary is Hindi.
encoder = EncoderRNN(english_lang.n_words, HIDDEN_DIM).to(device)
decoder = DecoderRNN(HIDDEN_DIM, hindi_lang.n_words).to(device)

# Negative log-likelihood loss; the decoder ends in LogSoftmax.
# trainIters reads this module-level name.
criterion = nn.NLLLoss()

print('Training model now...')
trainIters(encoder, decoder, 75000, print_every=250, learning_rate=0.005)


Executing training on: cuda
Training model now...
The number of training_pairs is 75000



0m 46s (- 230m 29s) (250 0%) 8.5635
1m 14s (- 185m 22s) (500 0%) 6.4313
1m 49s (- 180m 23s) (750 1%) 6.3634
2m 26s (- 180m 53s) (1000 1%) 6.2075
3m 3s (- 180m 19s) (1250 1%) 6.2043
