<a href="https://colab.research.google.com/github/myidispg/NLP-Projects/blob/master/Neural_Machine_Translation_English_Hindi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# My first Google Colab notebook for Neural Machine Translation in PyTorch.
---



### **The necessary imports and running the code on GPU if available.**
A GPU is strongly recommended for faster model training; the code falls back to the CPU when none is available.

In [23]:

import os
import random
import tarfile
from os import path

import requests
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim

# Vocabulary indices reserved for the start-of-sentence and end-of-sentence markers.
SOS_token, EOS_token = 0, 1

# Run on the GPU when PyTorch can see one, otherwise on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

device

device(type='cuda')

## Download and save the dataset

In [35]:
dataset_url = 'http://www.cfilt.iitb.ac.in/iitb_parallel/iitb_corpus_download/parallel.tgz'

# Remove any stale copy first so a failed download cannot be mistaken for a fresh one.
if path.exists("parallel.tgz"):
  os.remove('parallel.tgz')
  print('Removed the existing copy')

# Stream the archive to disk instead of holding it in memory. The response is
# used as a context manager so the connection is always released, and
# raise_for_status() stops an HTTP error page from being saved as the archive.
with requests.get(dataset_url, stream=True, timeout=60) as tgz_file:
  tgz_file.raise_for_status()
  with open("parallel.tgz", "wb") as f:
    # 1 MiB chunks keep the number of Python-level writes small for a large file.
    for chunk in tgz_file.iter_content(chunk_size=1024 * 1024):
      if chunk:
        f.write(chunk)

if path.exists("parallel.tgz"):
  print('File saved successfully.')


Removed the existing copy
File saved successfully.


In [39]:
# NOTE(review): this fetches the same `dataset_url` into the same 'parallel.tgz'
# path that the previous cell already wrote, so the archive is downloaded a
# second time -- confirm whether both download cells are really needed.
from urllib.request import urlretrieve
urlretrieve(dataset_url, 'parallel.tgz')

('parallel.tgz', <http.client.HTTPMessage at 0x7fa38851c828>)

In [41]:
# File-like objects for every regular file stored in the archive, in archive order.
data_list = []

# The archive is deliberately left open: the objects returned by extractfile()
# read from it lazily, and a later cell calls readlines() on them.
tar = tarfile.open("parallel.tgz")
for member in tar.getmembers():
  print(member)
  f = tar.extractfile(member)
  print(f)
  # extractfile() yields None for members that are not regular files
  # (the 'parallel' directory entry), so those are skipped.
  if f is not None:
    data_list.append(f)

data_list
  

[<TarInfo 'parallel' at 0x7fa389d7b4f8>, <TarInfo 'parallel/IITB.en-hi.hi' at 0x7fa389d7b818>, <TarInfo 'parallel/IITB.en-hi.en' at 0x7fa389d7b750>]
<TarInfo 'parallel' at 0x7fa389d7b4f8>
None
<TarInfo 'parallel/IITB.en-hi.hi' at 0x7fa389d7b818>
<ExFileObject name='parallel.tgz'>
<TarInfo 'parallel/IITB.en-hi.en' at 0x7fa389d7b750>
<ExFileObject name='parallel.tgz'>


[<ExFileObject name='parallel.tgz'>, <ExFileObject name='parallel.tgz'>]

### Define a class for a Language
This class will contain the word2idx, idx2word, number of words in the vocabulary and max sentence length of that language.

In [0]:
class Lang:
    """Vocabulary bookkeeping for a single language.

    Keeps the word -> index and index -> word mappings, a per-word occurrence
    count, the vocabulary size, and the length (in space-separated tokens) of
    the longest sentence added so far.
    """

    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        # Indices 0 and 1 are pre-assigned to the sentence delimiters.
        self.index2word = {0: "SOS", 1: "EOS"}
        # Next free index; starts after the two reserved entries.
        self.n_words = 2
        self.max_sent_length = 1

    def addSentence(self, sentence):
        """Register every token of `sentence`, splitting on single spaces."""
        tokens = sentence.split(' ')
        self.max_sent_length = max(self.max_sent_length, len(tokens))
        for token in tokens:
            self.addWord(token)

    def addWord(self, word):
        """Count `word`, assigning it the next free index on first sight."""
        if word in self.word2index:
            self.word2count[word] += 1
            return

        new_index = self.n_words
        self.word2index[word] = new_index
        self.word2count[word] = 1
        self.index2word[new_index] = word
        self.n_words = new_index + 1


hindi_lang = Lang('hindi')
english_lang = Lang('english')


### Read the lines from each ExFileObject extracted from the .tgz file and decode them as UTF-8

In [0]:
# data_list[1] is read as the English side and data_list[0] as the Hindi side;
# this relies on the member order of the archive (Hindi file first, English second).
# Each raw line is bytes: decode it as UTF-8 and drop the newline characters at its ends.
bytecode_lines = data_list[1].readlines()
english_lines = [raw.decode('utf-8').strip('\n') for raw in bytecode_lines]

bytecode_lines = data_list[0].readlines()
hindi_lines = [raw.decode('utf-8').strip('\n') for raw in bytecode_lines]
  


### Some helper functions to read the data, create pairs and build the language vocabularies.

In [46]:
def addWordsToLang(lang, lines):
    """Pass every sentence in `lines` to `lang.addSentence` and return `lang`."""
    for sentence in lines:
        lang.addSentence(sentence)
    return lang

def create_pairs(lang1, lang2):
    """Zip two sequences of sentences into a list of two-element lists.

    Pairing stops at the end of the shorter sequence.
    """
    return [[first, second] for first, second in zip(lang1, lang2)]

def createLanguagesAndPairs(lang1_lines, lang2_lines, lang1, lang2):
    """Build the sentence pairs and fill both vocabularies.

    Args:
        lang1_lines: sentences of the first language.
        lang2_lines: sentences of the second language, aligned with `lang1_lines`.
        lang1: vocabulary object that receives the words of `lang1_lines`.
        lang2: vocabulary object that receives the words of `lang2_lines`.

    Returns:
        A tuple `(pairs, lang1, lang2)` where `pairs` is the output of
        `create_pairs` and the two vocabularies are the ones passed in,
        after being updated.
    """
    print('Creating pairs...')
    sentence_pairs = create_pairs(lang1_lines, lang2_lines)

    print('Adding words to languages')
    first_vocab = addWordsToLang(lang1, lang1_lines)
    second_vocab = addWordsToLang(lang2, lang2_lines)

    print('Done creating languages')
    return sentence_pairs, first_vocab, second_vocab
  
# Hindi is the first language of every pair and English the second.
pairs, hindi_lang, english_lang = createLanguagesAndPairs(hindi_lines, english_lines, hindi_lang, english_lang)

# Longest sentence (in tokens) seen in either language.
MAX_LENGTH = max(english_lang.max_sent_length, hindi_lang.max_sent_length)
print(f'No of words in english: {english_lang.n_words}, No of words in hindi: {hindi_lang.n_words}, Max length of sentence in both: {MAX_LENGTH}')


Creating pairs...
Adding words to languages
Done creating languages
No of words in english: 462621, No of words in hindi: 536013, Max length of sentence in both: 1917


## Create Encoder RNN
Create an Encoder RNN. 
It takes the input size which is the number of words in the input language vocabulary.
The other argument is the hidden state dimension. The dimension of the embedding is the same as the hidden state dimension.



![The Encoder RNN Image](https://pytorch.org/tutorials/_images/encoder-network.png)

In [0]:
class EncoderRNN(nn.Module):
  """Single-layer GRU encoder that consumes one token index per call.

  `input_size` is the size of the embedding table and `hidden_size` is used
  both as the embedding width and as the GRU state width.
  """

  def __init__(self, input_size, hidden_size):
    super().__init__()
    self.hidden_size = hidden_size
    self.embedding = nn.Embedding(input_size, hidden_size)
    self.gru = nn.GRU(hidden_size, hidden_size)

  def forward(self, input, hidden_state):
    """Embed `input`, run one GRU step and return `(output, new_hidden_state)`."""
    # The embedding is reshaped to (1, 1, -1) before it is handed to the GRU.
    gru_input = self.embedding(input).view(1, 1, -1)
    return self.gru(gru_input, hidden_state)

  def initHidden(self):
    """Return a randomly initialised (1, 1, hidden_size) state on `device`."""
    return torch.randn(1, 1, self.hidden_size, device=device)

## Create Decoder RNN
Create the DecoderRNN. It takes the hidden unit dimensions and the number of words in the output language vocabulary.


![DecoderRNN architecture](https://pytorch.org/tutorials/_images/decoder-network.png)

In [0]:
class DecoderRNN(nn.Module):
    """Single-layer GRU decoder that emits log-probabilities over the output vocabulary.

    Args:
        hidden_size: width of the embedding and of the GRU state.
        output_size: number of entries in the output vocabulary.
    """

    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden_state):
        """Run one decoding step.

        Args:
            input: index of the previous output token.
            hidden_state: GRU state produced by the previous step.

        Returns:
            `(log_probs, new_hidden_state)`, where `log_probs` has one row of
            log-probabilities over the output vocabulary.
        """
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden_state)
        output = self.softmax(self.out(output[0]))
        # Return the state produced by the GRU. Handing back the incoming
        # `hidden_state` would freeze the decoder at its initial state.
        return output, hidden

    def initHidden(self):
        """Return a randomly initialised (1, 1, hidden_size) state on `device`."""
        return torch.randn(1, 1, self.hidden_size, device=device)

## Training section
Some functions to create a sequence of inputs for each sentence pair.


In [0]:
def indexesFromSentence(lang, sentence):
    """Map each space-separated token of `sentence` to its index in `lang.word2index`."""
    indexes = []
    for token in sentence.split(' '):
        indexes.append(lang.word2index[token])
    return indexes
    
def tensorFromSentence(lang, sentence):
    """Return the token indices of `sentence` plus `EOS_token` as an (N, 1) long tensor on `device`."""
    token_ids = indexesFromSentence(lang, sentence) + [EOS_token]
    flat = torch.tensor(token_ids, dtype=torch.long, device=device)
    return flat.view(-1, 1)

def tensorFromPairs(pair):
    """Turn a sentence pair into `(input_tensor, output_tensor)`.

    `pair[0]` is indexed with `hindi_lang` and `pair[1]` with `english_lang`.
    """
    source_sentence, target_sentence = pair[0], pair[1]
    return (
        tensorFromSentence(hindi_lang, source_sentence),
        tensorFromSentence(english_lang, target_sentence),
    )

# This section is for testing the outputs of the Encoder
input_tensor, output_tensor = tensorFromPairs(pairs[0])

HIDDEN_DIM = 256
# tensorFromPairs indexes the input sentence with hindi_lang and the output
# sentence with english_lang, so the encoder embedding must cover the Hindi
# vocabulary and the decoder must project onto the English vocabulary.
encoder = EncoderRNN(hindi_lang.n_words, HIDDEN_DIM).to(device)
decoder = DecoderRNN(HIDDEN_DIM, english_lang.n_words).to(device)

encoder_hidden = encoder.initHidden()

encoder_optimizer = optim.SGD(encoder.parameters(), lr=0.01)
decoder_optimizer = optim.SGD(decoder.parameters(), lr=0.01)

encoder_optimizer.zero_grad()
decoder_optimizer.zero_grad()

# Feed only the first token of the sentence through the encoder.
encoder_output, encoder_hidden = encoder(input_tensor[0], encoder_hidden)

print(f'encoder_output- \n{encoder_output}\nencoder_hidden- {encoder_hidden}\n')


**Teacher forcing** is the concept of using the real target outputs as each next input, instead of using the decoder’s guess as the next input. Using teacher forcing causes the network to converge faster, but when the trained network is later used without the ground-truth inputs, it may exhibit instability.

In [0]:
# Probability that a training step feeds the ground-truth token (rather than
# the decoder's own prediction) as the next decoder input.
teacher_forcing_ratio = 0.5

def train(input_tensor, output_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length):
  """Run one optimisation step on a single sentence pair.

  Args:
    input_tensor: token indices of the source sentence, one per row.
    output_tensor: token indices of the target sentence, one per row.
    encoder: encoder module; must provide `initHidden()` and `hidden_size`.
    decoder: decoder module called as `decoder(input, hidden)`.
    encoder_optimizer: optimizer over the encoder parameters.
    decoder_optimizer: optimizer over the decoder parameters.
    criterion: loss applied to each decoder output and its target token.
    max_length: number of rows allocated for the per-token encoder outputs.

  Returns:
    The summed loss of the step divided by the target length.
  """
  encoder_hidden = encoder.initHidden()

  encoder_optimizer.zero_grad()
  decoder_optimizer.zero_grad()

  input_length = input_tensor.shape[0]
  output_length = output_tensor.shape[0]

  # Collected per-token encoder outputs; the plain decoder below does not read them.
  encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

  loss = 0

  # Feed the source sentence to the encoder one token at a time.
  for ei in range(input_length):
    encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
    encoder_outputs[ei] = encoder_output[0, 0]

  decoder_input = torch.tensor([[SOS_token]], device=device)

  # The decoder starts from the final encoder state.
  decoder_hidden = encoder_hidden

  use_teacher_forcing = random.random() < teacher_forcing_ratio

  if use_teacher_forcing:
    # Teacher forcing: the next input is the ground-truth target token.
    for di in range(output_length):
      decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)

      loss += criterion(decoder_output, output_tensor[di])
      decoder_input = output_tensor[di]

  else:
    # Free running: the next input is the decoder's own best guess.
    for di in range(output_length):
      decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
      topv, topi = decoder_output.topk(1)
      # detach() so gradients do not flow back through the chosen index.
      decoder_input = topi.squeeze().detach()
      loss += criterion(decoder_output, output_tensor[di])
      if decoder_input.item() == EOS_token:
        break

  loss.backward()
  encoder_optimizer.step()
  decoder_optimizer.step()

  return loss.item() / output_length
 
    
    

### Some functions to find time elapsed
These functions help to calculate the elapsed time and the remaining time.


In [0]:
import time
import math


def asMinutes(s):
    """Format a duration of `s` seconds as '<minutes>m <seconds>s'."""
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)


def timeSince(since, percent):
    """Return '<elapsed> (- <remaining>)' for a job started at `since`.

    `percent` is the fraction of the job completed so far; the remaining time
    is extrapolated linearly from the elapsed time.
    """
    elapsed = time.time() - since
    estimated_total = elapsed / (percent)
    remaining = estimated_total - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))

### This function samples `n_iters` random sentence pairs and calls the train() function on each one.

In [0]:
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
  """Train the encoder/decoder on `n_iters` randomly sampled sentence pairs.

  Args:
    encoder: encoder module passed through to `train`.
    decoder: decoder module passed through to `train`.
    n_iters: number of training steps (one sampled pair per step).
    print_every: report the average loss every this many steps.
    plot_every: record an average loss for plotting every this many steps.
    learning_rate: learning rate of both SGD optimizers.
  """
  start = time.time()
  plot_losses = []
  print_loss_total = 0
  plot_loss_total = 0

  # Pairs are sampled with replacement and converted to tensors up front.
  training_pairs = [tensorFromPairs(random.choice(pairs)) for _ in range(n_iters)]
  print(f'The number of training_pairs is {len(training_pairs)}\n\n\n')

  encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
  decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
  # The decoder ends in LogSoftmax, so negative log-likelihood is the matching loss.
  criterion = nn.NLLLoss()

  for step in range(1, n_iters + 1):
    input_tensor, output_tensor = training_pairs[step - 1]

    loss = train(input_tensor, output_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length = MAX_LENGTH)

    print_loss_total += loss
    plot_loss_total += loss

    if step % print_every == 0:
      print_loss_avg = print_loss_total / print_every
      print_loss_total = 0
      print('%s (%d %d%%) %.4f' % (timeSince(start, step / n_iters),
                                   step, step / n_iters * 100, print_loss_avg))

    if step % plot_every == 0:
      plot_loss_avg = plot_loss_total / plot_every
      plot_losses.append(plot_loss_avg)
      plot_loss_total = 0

  # Plot once, after the whole run. NOTE(review): showPlot is not defined in the
  # cells above; it must exist in the notebook before this function is called.
  showPlot(plot_losses)
    
  