<a href="https://colab.research.google.com/github/myidispg/NLP-Projects/blob/master/Neural_Machine_Translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# My first Google Colab notebook for Neural Machine Translation in PyTorch.
---



### **The necessary imports and running the code on GPU if available.**
A GPU is not strictly required (the code falls back to the CPU), but it makes model training much faster.

In [50]:

import requests
import tarfile
from os import path

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Vocabulary indices reserved for the start- and end-of-sentence markers.
SOS_token, EOS_token = 0, 1

device

device(type='cuda')

## Download and save the dataset

In [51]:
dataset_url = 'http://www.cfilt.iitb.ac.in/iitb_parallel/iitb_corpus_download/parallel.tgz'

# Stream the archive to disk chunk by chunk so it is never held in memory whole.
# The context manager releases the connection when the download is done, and the
# timeout keeps an unresponsive server from hanging the cell indefinitely.
with requests.get(dataset_url, stream=True, timeout=60) as tgz_file:
    # Fail loudly on an HTTP error instead of writing the error page to
    # "parallel.tgz" and then reporting success.
    tgz_file.raise_for_status()

    with open("parallel.tgz", "wb") as f:
        # 1 MiB chunks: far fewer write calls than 1 KiB for a large archive.
        for chunk in tgz_file.iter_content(chunk_size=1024 * 1024):
            if chunk:  # ignore empty chunks
                f.write(chunk)

if path.exists("parallel.tgz"):
    print('File saved successfully.')


File saved successfully.


In [52]:
# The archive is intentionally left open: the file objects collected below read
# from it lazily when the corpus lines are decoded in a later cell.
tar = tarfile.open("parallel.tgz")

data_list = []
for member in tar.getmembers():
    f = tar.extractfile(member)
    if f is None:
        # extractfile() gave no readable stream for this member; skip it.
        continue
    data_list.append(f)

data_list
  

[<ExFileObject name='parallel.tgz'>, <ExFileObject name='parallel.tgz'>]

### Define a class for a Language
This class holds the `word2index` and `index2word` mappings, the per-word counts (`word2count`), the number of words in the vocabulary and the maximum sentence length seen for that language.

In [0]:
class Lang:
    """Vocabulary bookkeeping for a single language.

    Attributes:
        name: Label given to the language.
        word2index: Maps every word seen so far to its integer index.
        word2count: Maps every word seen so far to how often it was added.
        index2word: Reverse mapping; indices 0 and 1 are pre-filled with
            "SOS" and "EOS".
        n_words: Next free index, i.e. the size of ``index2word``.
        max_sent_length: Largest token count of any sentence added so far
            (never below 1).
    """

    def __init__(self, name):
        self.name = name
        self.n_words = 2
        self.max_sent_length = 1
        self.index2word = {0: "SOS", 1: "EOS"}
        self.word2index = {}
        self.word2count = {}

    def addSentence(self, sentence):
        """Register every single-space-separated token of ``sentence``."""
        tokens = sentence.split(' ')
        self.max_sent_length = max(self.max_sent_length, len(tokens))
        for token in tokens:
            self.addWord(token)

    def addWord(self, word):
        """Count ``word``, giving it the next free index the first time it is seen."""
        if word in self.word2index:
            self.word2count[word] += 1
            return

        index = self.n_words
        self.word2index[word] = index
        self.word2count[word] = 1
        self.index2word[index] = word
        self.n_words = index + 1


hindi_lang, english_lang = Lang('hindi'), Lang('english')


### Use the .tgz file and read the ExFileObject to get the list of lines and read in utf-8 format

In [0]:
def _read_utf8_lines(member_file):
    """Return every line of an extracted archive member as a decoded string.

    The stream is rewound first so that re-running this cell produces the same
    lines again instead of empty lists from an already-exhausted file object.
    Only the newline characters are stripped from each line.
    """
    if member_file.seekable():
        member_file.seek(0)
    return [raw_line.decode('utf-8').strip('\n') for raw_line in member_file.readlines()]


# The second extracted member is read as the English side and the first as the
# Hindi side (same order of reads as before).
english_lines = _read_utf8_lines(data_list[1])
hindi_lines = _read_utf8_lines(data_list[0])
  


### Some helper functions to read the data, create pairs and build the language vocabularies.

In [55]:
def addWordsToLang(lang, lines):
    """Pass each sentence in ``lines`` to ``lang.addSentence`` and return ``lang``.

    ``lang`` is updated in place; the same object is handed back for convenience.
    """
    for sentence in lines:
        lang.addSentence(sentence)
    return lang

def create_pairs(lang1, lang2):
    """Pair up two sequences of sentences position by position.

    Returns a list of two-element lists ``[lang1_sentence, lang2_sentence]``.
    Pairing stops at the end of the shorter input.
    """
    return [[first, second] for first, second in zip(lang1, lang2)]

def createLanguagesAndPairs(lang1_lines, lang2_lines, lang1, lang2):
    """Build the sentence pairs and fill both vocabularies, printing progress.

    Args:
        lang1_lines: Sentences of the first language.
        lang2_lines: Sentences of the second language, aligned with ``lang1_lines``.
        lang1: Vocabulary object that receives the ``lang1_lines`` sentences.
        lang2: Vocabulary object that receives the ``lang2_lines`` sentences.

    Returns:
        A tuple ``(pairs, lang1, lang2)``.
    """
    print('Creating pairs...')
    sentence_pairs = create_pairs(lang1_lines, lang2_lines)

    print('Adding words to languages')
    filled_langs = [
        addWordsToLang(language, corpus)
        for language, corpus in ((lang1, lang1_lines), (lang2, lang2_lines))
    ]

    print('Done creating languages')

    return (sentence_pairs, *filled_langs)
  
# Build the sentence pairs and fill both vocabularies. The Lang objects are
# updated in place, so running this cell again adds to the existing word counts.
pairs, hindi_lang, english_lang = createLanguagesAndPairs(hindi_lines, english_lines, hindi_lang, english_lang)


Creating pairs...
Adding words to languages
Done creating languages


## Create Encoder RNN
Create an Encoder RNN.
It takes the input size, which is the number of words in the input language vocabulary.
The other argument is the hidden state dimension. The embedding dimension is the same as the hidden state dimension.



![The Encoder RNN Image](https://pytorch.org/tutorials/_images/encoder-network.png)

In [0]:
class EncoderRNN(nn.Module):
    """GRU encoder that consumes one token index per call.

    Args:
        input_size: Number of rows in the embedding table (input vocabulary size).
        hidden_size: Width of both the embedding vectors and the GRU state.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(num_embeddings=input_size, embedding_dim=hidden_size)
        self.gru = nn.GRU(input_size=hidden_size, hidden_size=hidden_size)

    def forward(self, input, hidden_state):
        """Embed ``input``, run one GRU step and return ``(output, hidden_state)``."""
        # The looked-up embedding is reshaped to (1, 1, hidden_size) for the GRU.
        step_input = self.embedding(input).view(1, 1, -1)
        return self.gru(step_input, hidden_state)

    def initHidden(self):
        """Return a random-normal initial state of shape (1, 1, hidden_size)."""
        return torch.randn((1, 1, self.hidden_size), device=device)

## Create Decoder RNN
Create the DecoderRNN. It takes the hidden unit dimensions and the number of words in the output language vocabulary.


![DecoderRNN architecture](https://pytorch.org/tutorials/_images/decoder-network.png)

In [0]:
class DecoderRNN(nn.Module):
    """GRU decoder that scores the output vocabulary one token at a time.

    Args:
        hidden_size: Width of both the embedding vectors and the GRU state.
        output_size: Number of words in the output vocabulary; also the size
            of the embedding table and of the returned log-probability vector.
    """

    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden_state):
        """Run one decoding step.

        Args:
            input: Index of the previous output token.
            hidden_state: GRU state coming from the previous step (or from the
                encoder for the first step).

        Returns:
            A tuple ``(log_probs, hidden)``: the log-softmax scores of shape
            (1, output_size) and the updated GRU state.
        """
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden_state)
        output = self.softmax(self.out(output[0]))
        # Return the state produced by the GRU. Handing back the incoming
        # `hidden_state` would freeze the decoder state across time steps.
        return output, hidden

    def initHidden(self):
        """Return a random-normal initial state of shape (1, 1, hidden_size)."""
        return torch.randn(1, 1, self.hidden_size, device=device)

## Training section.
Some functions to create a sequence of inputs for each sentence pair.


In [58]:
def indexesFromSentence(lang, sentence):
    """Translate ``sentence`` into a list of vocabulary indices.

    The sentence is split on single spaces and each token is looked up in
    ``lang.word2index``; a token missing from that mapping raises ``KeyError``.
    """
    lookup = lang.word2index
    indexes = []
    for token in sentence.split(' '):
        indexes.append(lookup[token])
    return indexes
    
def tensorFromSentence(lang, sentence):
    """Encode ``sentence`` as a column tensor of word indices ending in EOS.

    Returns a ``torch.long`` tensor of shape (number_of_tokens + 1, 1) placed
    on the module-level ``device``.
    """
    token_ids = [*indexesFromSentence(lang, sentence), EOS_token]
    column = torch.tensor(token_ids, dtype=torch.long, device=device)
    return column.view(-1, 1)

def tensorFromPairs(pair):
    """Turn one sentence pair into ``(input_tensor, output_tensor)``.

    ``pair[0]`` is encoded with the ``hindi_lang`` vocabulary and ``pair[1]``
    with the ``english_lang`` vocabulary.
    """
    source_sentence, target_sentence = pair[0], pair[1]
    return (
        tensorFromSentence(hindi_lang, source_sentence),
        tensorFromSentence(english_lang, target_sentence),
    )

# This section is for testing the outputs of the Encoder
input_tensor, output_tensor = tensorFromPairs(pairs[0])

HIDDEN_DIM = 256
# tensorFromPairs encodes pair[0] with hindi_lang and pair[1] with english_lang,
# so the encoder's embedding table must cover the Hindi vocabulary and the
# decoder must score the English vocabulary. Sizing them the other way round
# lets an input index fall outside the embedding table.
encoder = EncoderRNN(hindi_lang.n_words, HIDDEN_DIM).to(device)
decoder = DecoderRNN(HIDDEN_DIM, english_lang.n_words).to(device)

encoder_hidden = encoder.initHidden()

encoder_optimizer = optim.SGD(encoder.parameters(), lr=0.01)
decoder_optimizer = optim.SGD(decoder.parameters(), lr=0.01)

encoder_optimizer.zero_grad()
decoder_optimizer.zero_grad()

# Feed only the first token of the input sentence through the encoder.
encoder_output, encoder_hidden = encoder(input_tensor[0], encoder_hidden)

print(f'encoder_output- \n{encoder_output}\nencoder_hidden- {encoder_hidden}\n')


encoder_output- 
tensor([[[-6.5354e-01,  3.9343e-01,  2.6896e-01, -1.3375e-02,  3.9268e-01,
          -7.6958e-01, -2.9768e-01,  4.2736e-01,  3.3459e-01, -5.3247e-01,
           2.0292e-02,  2.7373e-02,  1.0472e+00,  2.3910e-01,  8.1970e-01,
           1.7460e-01,  1.3259e-01,  3.1073e-01, -4.4761e-01,  4.0740e-01,
           1.0853e+00, -6.5767e-01, -8.3248e-01, -1.9622e-01, -1.2883e-01,
          -4.9259e-02, -1.5747e-01,  8.7821e-01, -5.7472e-01, -8.7642e-01,
           3.1659e-01, -5.1393e-01,  7.3229e-01, -8.1574e-01, -1.8101e-01,
          -4.2936e-01, -6.9078e-01,  9.5986e-01,  6.1059e-01, -1.4773e-01,
           6.1398e-01, -2.7758e-01, -1.3197e-01,  1.0878e+00,  3.2755e-01,
           4.8859e-01,  2.0760e-01,  1.5745e-02,  9.1674e-01,  6.4979e-02,
          -3.9689e-01,  3.8731e-01, -5.7504e-01,  1.2688e+00, -6.5922e-01,
           9.0253e-01,  1.2001e+00, -6.1024e-02,  1.1487e-01,  1.3972e-01,
           6.3041e-01,  4.7182e-01, -1.0206e+00, -9.3904e-02, -1.2729e+00,
        

In [60]:
# Index away the two leading dimensions to show the encoder output as a flat vector.
encoder_output[0, 0]

tensor([-6.5354e-01,  3.9343e-01,  2.6896e-01, -1.3375e-02,  3.9268e-01,
        -7.6958e-01, -2.9768e-01,  4.2736e-01,  3.3459e-01, -5.3247e-01,
         2.0292e-02,  2.7373e-02,  1.0472e+00,  2.3910e-01,  8.1970e-01,
         1.7460e-01,  1.3259e-01,  3.1073e-01, -4.4761e-01,  4.0740e-01,
         1.0853e+00, -6.5767e-01, -8.3248e-01, -1.9622e-01, -1.2883e-01,
        -4.9259e-02, -1.5747e-01,  8.7821e-01, -5.7472e-01, -8.7642e-01,
         3.1659e-01, -5.1393e-01,  7.3229e-01, -8.1574e-01, -1.8101e-01,
        -4.2936e-01, -6.9078e-01,  9.5986e-01,  6.1059e-01, -1.4773e-01,
         6.1398e-01, -2.7758e-01, -1.3197e-01,  1.0878e+00,  3.2755e-01,
         4.8859e-01,  2.0760e-01,  1.5745e-02,  9.1674e-01,  6.4979e-02,
        -3.9689e-01,  3.8731e-01, -5.7504e-01,  1.2688e+00, -6.5922e-01,
         9.0253e-01,  1.2001e+00, -6.1024e-02,  1.1487e-01,  1.3972e-01,
         6.3041e-01,  4.7182e-01, -1.0206e+00, -9.3904e-02, -1.2729e+00,
         2.5193e-01, -5.1038e-01, -4.6704e-01, -1.1