<a href="https://colab.research.google.com/github/m-shilpa/END3/blob/main/Session_6_RNN_LSTM_With_Attention_Mechanism/END3_Session_6_Quora_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%matplotlib inline

from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")



In [2]:
!wget http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv



--2022-01-08 16:27:18--  http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv
Resolving qim.fs.quoracdn.net (qim.fs.quoracdn.net)... 151.101.1.2, 151.101.65.2, 151.101.129.2, ...
Connecting to qim.fs.quoracdn.net (qim.fs.quoracdn.net)|151.101.1.2|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 58176133 (55M) [text/tab-separated-values]
Saving to: ‘quora_duplicate_questions.tsv’


2022-01-08 16:27:20 (220 MB/s) - ‘quora_duplicate_questions.tsv’ saved [58176133/58176133]



In [3]:
import pandas as pd

In [4]:
df = pd.read_csv("quora_duplicate_questions.tsv",sep='\t')

In [5]:
df.head(5)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [6]:
df.shape

(404290, 6)

In [7]:
df.isna().sum()

id              0
qid1            0
qid2            0
question1       1
question2       2
is_duplicate    0
dtype: int64

In [8]:
df[(df['question1'].isna())|(df['question2'].isna())]

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
105780,105780,174363,174364,How can I develop android app?,,0
201841,201841,303951,174364,How can I create an Android app?,,0
363362,363362,493340,493341,,My Chinese name is Haichao Yu. What English na...,0


In [9]:
df = df.dropna()
df.shape

(404287, 6)

In [10]:
SOS_token = 0
EOS_token = 1


class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

# Turn a Unicode string to plain ASCII, thanks to
# https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters


def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

def readLangs(lang1, lang2, filename,reverse=False):
    print("Reading lines...")

    # Read the file and split into lines
    df = pd.read_csv(filename,sep='\t')
    df = df.dropna()
    df = df[['question1','question2']]
    for col in df.columns:
        # normalize
        df[col] =df[col].str.strip()
        df[col] = df[col].apply(normalizeString)

    pairs = df.values.tolist()
    
    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs


MAX_LENGTH = 8



def filterPair(p):
    try:
        r = len(p[0].split(' ')) < MAX_LENGTH and \
            len(p[1].split(' ')) < MAX_LENGTH 
        return r 
    except:
        print('Error',p)


def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]


def prepareData(lang1, lang2,filename, reverse=False):
    input_lang, output_lang, pairs = readLangs(lang1, lang2,filename, reverse)
    print("Read %s sentence pairs" % len(pairs))
    pairs = filterPairs(pairs)
    print("Trimmed to %s sentence pairs" % len(pairs))
    print("Counting words...")
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    print("Counted words:")
    print(input_lang.name,':', input_lang.n_words)
    print(output_lang.name,':', output_lang.n_words)
    return input_lang, output_lang, pairs


input_lang, output_lang, pairs = prepareData('Question Set1', 'Question Set2',"quora_duplicate_questions.tsv", True)
print(random.choice(pairs))

Reading lines...
Read 404287 sentence pairs
Trimmed to 28944 sentence pairs
Counting words...
Counted words:
Question Set2 : 13523
Question Set1 : 14013
['how can i stop smoking ?', 'how do i quit smoking ?']


In [11]:
type(pairs)

list

In [12]:
pairs[0:5]

[['what does manipulation means ?', 'what does manipulation mean ?'],
 ['what is the web application framework ?', 'what is web application ?'],
 ['will squats make my legs thicker ?', 'why do slavs squat ?'],
 ['how to make friends ?', 'how do i make friends .'],
 ['aerodynamically what happens when propellor rotates ?',
  'nd she is always sad ?']]

# The architecture we are building

![image](https://miro.medium.com/max/1838/1*tXchCn0hBSUau3WO0ViD7w.jpeg)

As we can see here, we will have an encoder, an attention mechanism block and decoder. In the final code the attention mechanicm block and decoder will be merged into single block as we need both to work together. 

As we can see here, we need to create a copy of h1, h2, h3 and h4. These are encoder outputs for a sentence with 4 words. 

# Encoder

We will build our encoder with a GRU, but that's all we know. Let's NOT strait away build a class, but see how to come up with one for the Encoder. We need to answer few questions first:
1. what would be the hidden size of our GRU
2. What would be the input size
3. What would be the embedding dimesions. 

For simplicity, lets keep 1. and 3. to be 256. 

We can't feed our input directly to GRU, we need to tensorize it, convert to embeddings first. 

`embedding = nn.Embedding(input_size, hidden_size) `

## What is input_size?

Remember the line below?

`input_lang, output_lang, pairs = prepareData('eng', 'fra', True)`

In [13]:
input_lang

<__main__.Lang at 0x7f55d92d04d0>

In [14]:
help(input_lang)

Help on Lang in module __main__ object:

class Lang(builtins.object)
 |  Lang(name)
 |  
 |  Methods defined here:
 |  
 |  __init__(self, name)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  addSentence(self, sentence)
 |  
 |  addWord(self, word)
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors defined here:
 |  
 |  __dict__
 |      dictionary for instance variables (if defined)
 |  
 |  __weakref__
 |      list of weak references to the object (if defined)



In [15]:
input_lang.__dict__.items()



In [16]:
input_size = input_lang.n_words
hidden_size = 256
input_size

13523

In [17]:
embedding = nn.Embedding(input_size, hidden_size)
gru = nn.GRU(hidden_size, hidden_size)

In [18]:
sample = random.choice(pairs)
sample

['can i be rich ?', 'how can one get insanely rich ?']

In [19]:
device

device(type='cuda')

In [20]:
embedding_input = embedding(sample[0])

TypeError: ignored

In [21]:
sample

['can i be rich ?', 'how can one get insanely rich ?']

In [22]:
input_sentence = sample[0]
output_sentence = sample[1]

input_lang.word2index['you']

105

In [23]:
for word in input_sentence.split(' '):
  print(word)

can
i
be
rich
?


In [24]:
input_indices = [input_lang.word2index[word] for word in input_sentence.split(' ')]
output_indices = [output_lang.word2index[word] for word in output_sentence.split(' ')]
input_indices, output_indices

([54, 55, 315, 755, 6], [14, 92, 93, 182, 5355, 763, 6])

In [25]:
embedding_input = embedding(input_indices)

TypeError: ignored

In [26]:
input_indices.append(EOS_token)
output_indices.append(EOS_token)
input_indices, output_indices

([54, 55, 315, 755, 6, 1], [14, 92, 93, 182, 5355, 763, 6, 1])

In [27]:
input_tensor = torch.tensor(input_indices, dtype=torch.long, device=device)
output_tensor = torch.tensor(output_indices, dtype=torch.long, device=device)

In [28]:
input_tensor.shape, output_tensor.shape

(torch.Size([6]), torch.Size([8]))

In [29]:
embedding_input = embedding(input_tensor)

RuntimeError: ignored

In [30]:
embedding = nn.Embedding(input_size, hidden_size).to(device)
gru = nn.GRU(hidden_size, hidden_size).to(device)

In [31]:
embedding_input = embedding(input_tensor)
embedding_input.shape

torch.Size([6, 256])

In [32]:
input_tensor

tensor([ 54,  55, 315, 755,   6,   1], device='cuda:0')

In [33]:
input_tensor.shape, input_tensor.view(-1, 1).shape

(torch.Size([6]), torch.Size([6, 1]))

In [34]:
print(embedding_input.shape)
embedding_input = embedding(input_tensor.view(-1, 1))
print(embedding_input.shape)

torch.Size([6, 256])
torch.Size([6, 1, 256])


In [35]:
# output, hidden = gru(embedde_input, ?)
hidden = torch.zeros(1, 1, 256, device = device)

In [36]:
embedding_input = embedding(input_tensor.view(-1, 1))
output, hidden = gru(embedding_input, hidden)

output.shape, output[0, 0].shape

(torch.Size([6, 1, 256]), torch.Size([256]))

In [37]:
encoder_outputs = torch.zeros(MAX_LENGTH, 256, device=device)
encoder_outputs.shape

torch.Size([8, 256])

In [38]:
input_tensor.size()[0]

6

In [39]:
encoder_outputs = torch.zeros(MAX_LENGTH, 256, device=device)
hidden = torch.zeros(1, 1, 256, device = device)

for i in range(input_tensor.size()[0]):
  embedding_input = embedding(input_tensor[i].view(-1, 1))
  output, hidden = gru(embedding_input, hidden)
  encoder_outputs[i] += output[0, 0]

In [40]:
encoder_outputs.shape, hidden.shape

(torch.Size([8, 256]), torch.Size([1, 1, 256]))

In [41]:
encoder_outputs[0:4]

tensor([[ 0.0657, -0.1953,  0.2072,  ..., -0.1110,  0.1129,  0.1399],
        [-0.1821,  0.2091, -0.1638,  ...,  0.0520, -0.2713, -0.3054],
        [ 0.0318,  0.1323, -0.1986,  ...,  0.1906, -0.0163, -0.1947],
        [-0.4393,  0.3280,  0.4385,  ...,  0.0144, -0.3822,  0.0350]],
       device='cuda:0', grad_fn=<SliceBackward0>)

In [42]:
encoder_outputs[7:10]

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.

# 😁

Finally our Encoder is fully ready. Now let's look at the class we wrote in the last class to see what we missed!

```
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)
```

Cool! Next let's build out Decoder where we have attention in-built.

# Decoder with Attention

Here is the plan. 

1. First input to the decoder will be SOS_token, later inputs would be the words it predicted (unless we implement teacher forcing)
2. decoder/GRU's hidden state will be initialized with the encoder's last hidden state
3. we will use gru's hidden state and last prediction to generate attention weight using a FC layer. 
4. this attention weight will be used to weigh the encoder_outputs using batch matric multiplication. This will give us a NEW view on how to look at encoder_states.
5. this attention applied encoder_states will then be concatenated with the input, and then sent a linear layer and _then_ sent to the GRU. 
6. GRU's output will be sent to a FC layer to predict one of the output_language words

Let's prepare all the inputs we need to do this


In [43]:
decoder_input = torch.tensor([[SOS_token]], device=device)
decoder_hidden = hidden
decoded_words = []

In [44]:
# decoder s0
output_size = output_lang.n_words
embedding = nn.Embedding(output_size, 256).to(device)
embedded = embedding(decoder_input)
embedded.shape

torch.Size([1, 1, 256])

In [55]:
# 256 * 2 >> after concatenation
attn_weight_layer = nn.Linear(256 * 2, MAX_LENGTH).to(device)

In [56]:
embedded.shape, decoder_hidden.shape

(torch.Size([1, 1, 256]), torch.Size([1, 1, 256]))

In [57]:
torch.cat((embedded[0], decoder_hidden[0]), 1).shape

torch.Size([1, 512])

In [58]:
attn_weight_layer = nn.Linear(256 * 2, MAX_LENGTH).to(device)
attn_weights = attn_weight_layer(torch.cat((embedded[0], decoder_hidden[0]), 1))
attn_weights

tensor([[-0.0948, -0.1884,  0.2930, -0.6148, -0.1815, -0.0483, -0.3276,  0.6079]],
       device='cuda:0', grad_fn=<AddmmBackward0>)

In [59]:
import torch.nn.functional as F
attn_weight_layer = nn.Linear(256 * 2, MAX_LENGTH).to(device)
attn_weights = attn_weight_layer(torch.cat((embedded[0], decoder_hidden[0]), 1))
attn_weights = F.softmax(attn_weights, dim = 1)
attn_weights


tensor([[0.0710, 0.1631, 0.1744, 0.0962, 0.1307, 0.1293, 0.0776, 0.1578]],
       device='cuda:0', grad_fn=<SoftmaxBackward0>)

In [60]:
attn_weights.shape, encoder_outputs.shape

(torch.Size([1, 8]), torch.Size([8, 256]))

In [61]:
attn_applied = torch.bmm(attn_weights, encoder_outputs)

RuntimeError: ignored

In [62]:
attn_weights.unsqueeze(0).shape, encoder_outputs.unsqueeze(0).shape

(torch.Size([1, 1, 8]), torch.Size([1, 8, 256]))

In [64]:
attn_applied = torch.bmm(attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))
attn_applied.shape

torch.Size([1, 1, 256])

So, now we have this 256dm attn_applied encoder_outputs capturing what we should focus on on this step. We also have the input we already generated. That's 256dm again. GRU is gonna take 256 only. So we need to concatenate them, send to a linear layer to reduce dimensions, and then send to Gru
![image](https://static.wikia.nocookie.net/mycun-the-movie/images/c/c2/Gru-icon.png/revision/latest/scale-to-width-down/250?cb=20151223171656)

In [65]:
input_to_gru_layer = nn.Linear(256 * 2, 256).to(device)
embedded.shape, attn_applied.shape

(torch.Size([1, 1, 256]), torch.Size([1, 1, 256]))

In [66]:
input_to_gru = input_to_gru_layer(torch.cat((embedded[0], attn_applied[0]), 1))
input_to_gru.shape

torch.Size([1, 256])

In [67]:
gru = nn.GRU(256, 256).to(device)
decoder_hidden.shape, input_to_gru.shape

(torch.Size([1, 1, 256]), torch.Size([1, 256]))

In [68]:
input_to_gru = input_to_gru_layer(torch.cat((embedded[0], attn_applied[0]), 1))
input_to_gru = input_to_gru.unsqueeze(0)
decoder_hidden.shape, input_to_gru.shape

(torch.Size([1, 1, 256]), torch.Size([1, 1, 256]))

In [69]:
output, decoder_hidden = gru(decoder_hidden, input_to_gru)
output.shape, decoder_hidden.shape

(torch.Size([1, 1, 256]), torch.Size([1, 1, 256]))

In [70]:
output_word_layer = nn.Linear(256, output_lang.n_words).to(device)

In [71]:
output = F.relu(output)
output = F.softmax(output_word_layer(output[0]), dim=1)
output.shape, output, output.data.topk(1)

(torch.Size([1, 14013]),
 tensor([[7.4365e-05, 7.2511e-05, 7.8606e-05,  ..., 7.5594e-05, 7.1092e-05,
          8.6676e-05]], device='cuda:0', grad_fn=<SoftmaxBackward0>),
 torch.return_types.topk(values=tensor([[9.7991e-05]], device='cuda:0'), indices=tensor([[1690]], device='cuda:0')))

In [72]:
topv, topi = output.data.topk(1)
output_lang.index2word[topi.item()]

'scattering'

In [74]:
decoder_input = torch.tensor([[SOS_token]], device=device)
decoder_hidden = hidden #decoder_hidden = encoder_hidden
output_size = output_lang.n_words
embedding = nn.Embedding(output_size, 256).to(device)
embedded = embedding(decoder_input)
attn_weight_layer = nn.Linear(256 * 2, MAX_LENGTH).to(device)
attn_weights = attn_weight_layer(torch.cat((embedded[0], decoder_hidden[0]), 1))
attn_weights = F.softmax(attn_weights, dim = 1)
attn_applied = torch.bmm(attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))
input_to_gru_layer = nn.Linear(256 * 2, 256).to(device)
input_to_gru = input_to_gru_layer(torch.cat((embedded[0], attn_applied[0]), 1))
gru = nn.GRU(256, 256).to(device)
input_to_gru = input_to_gru.unsqueeze(0)
output, decoder_hidden = gru(input_to_gru, decoder_hidden)
output_word_layer = nn.Linear(256, output_lang.n_words).to(device)
output = F.relu(output)
output = F.softmax(output_word_layer(output[0]), dim = 1)
top_value, top_index = output.data.topk(1)
output_lang.index2word[top_index.item()]



'bikinis'

In [75]:
embedding = nn.Embedding(output_size, 256).to(device)
attn_weight_layer = nn.Linear(256 * 2, MAX_LENGTH).to(device)
input_to_gru_layer = nn.Linear(256 * 2, 256).to(device)
gru = nn.GRU(256, 256).to(device)
output_word_layer = nn.Linear(256, output_lang.n_words).to(device)

decoder_input = torch.tensor([[SOS_token]], device=device)
decoder_hidden = hidden
output_size = output_lang.n_words
embedded = embedding(decoder_input)
attn_weights = attn_weight_layer(torch.cat((embedded[0], decoder_hidden[0]), 1))
attn_weights = F.softmax(attn_weights, dim = 1)
attn_applied = torch.bmm(attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))
input_to_gru = input_to_gru_layer(torch.cat((embedded[0], attn_applied[0]), 1))
input_to_gru = input_to_gru.unsqueeze(0)
output, decoder_hidden = gru(input_to_gru, decoder_hidden)
output = F.relu(output)
output = F.softmax(output_word_layer(output[0]), dim = 1)
top_value, top_index = output.data.topk(1)
output_lang.index2word[top_index.item()], attn_weights

('web',
 tensor([[0.0846, 0.1002, 0.2111, 0.1338, 0.1122, 0.1033, 0.0827, 0.1723]],
        device='cuda:0', grad_fn=<SoftmaxBackward0>))

In [76]:
decoder_input = torch.tensor([[SOS_token]], device=device)
decoder_hidden = hidden
output_size = output_lang.n_words
embedded = embedding(decoder_input)
attn_weights = attn_weight_layer(torch.cat((embedded[0], decoder_hidden[0]), 1))
attn_weights = F.softmax(attn_weights, dim = 1)
attn_applied = torch.bmm(attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))
input_to_gru = input_to_gru_layer(torch.cat((embedded[0], attn_applied[0]), 1))
input_to_gru = input_to_gru.unsqueeze(0)
output, decoder_hidden = gru(input_to_gru, decoder_hidden)
output = F.relu(output)
output = F.softmax(output_word_layer(output[0]), dim = 1)
top_value, top_index = output.data.topk(1)
output_lang.index2word[top_index.item()], attn_weights

('web',
 tensor([[0.0846, 0.1002, 0.2111, 0.1338, 0.1122, 0.1033, 0.0827, 0.1723]],
        device='cuda:0', grad_fn=<SoftmaxBackward0>))

In [77]:
decoder_input = torch.tensor([[top_index.item()]], device=device)
decoder_hidden = hidden
output_size = output_lang.n_words
embedded = embedding(decoder_input)
attn_weights = attn_weight_layer(torch.cat((embedded[0], decoder_hidden[0]), 1))
attn_weights = F.softmax(attn_weights, dim = 1)
attn_applied = torch.bmm(attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))
input_to_gru = input_to_gru_layer(torch.cat((embedded[0], attn_applied[0]), 1))
input_to_gru = input_to_gru.unsqueeze(0)
output, decoder_hidden = gru(input_to_gru, decoder_hidden)
output = F.relu(output)
output = F.softmax(output_word_layer(output[0]), dim = 1)
top_value, top_index = output.data.topk(1)
output_lang.index2word[top_index.item()], attn_weights

('padme',
 tensor([[0.1301, 0.0499, 0.0778, 0.2633, 0.1053, 0.0535, 0.2131, 0.1069]],
        device='cuda:0', grad_fn=<SoftmaxBackward0>))

In [78]:
decoder_input = torch.tensor([[top_index.item()]], device=device)
decoder_hidden = hidden
output_size = output_lang.n_words
embedded = embedding(decoder_input)
attn_weights = attn_weight_layer(torch.cat((embedded[0], decoder_hidden[0]), 1))
attn_weights = F.softmax(attn_weights, dim = 1)
attn_applied = torch.bmm(attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))
input_to_gru = input_to_gru_layer(torch.cat((embedded[0], attn_applied[0]), 1))
input_to_gru = input_to_gru.unsqueeze(0)
output, decoder_hidden = gru(input_to_gru, decoder_hidden)
output = F.relu(output)
output = F.softmax(output_word_layer(output[0]), dim = 1)
top_value, top_index = output.data.topk(1)
output_lang.index2word[top_index.item()], attn_weights

('web',
 tensor([[0.0849, 0.0824, 0.0841, 0.1227, 0.0569, 0.2101, 0.0619, 0.2970]],
        device='cuda:0', grad_fn=<SoftmaxBackward0>))

In [79]:
for i in range(6):
  decoder_input = torch.tensor([[output_indices[i]]], device=device)
  decoder_hidden = hidden
  output_size = output_lang.n_words
  embedded = embedding(decoder_input)
  attn_weights = attn_weight_layer(torch.cat((embedded[0], decoder_hidden[0]), 1))
  attn_weights = F.softmax(attn_weights, dim = 1)
  attn_applied = torch.bmm(attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))
  input_to_gru = input_to_gru_layer(torch.cat((embedded[0], attn_applied[0]), 1))
  input_to_gru = input_to_gru.unsqueeze(0)
  output, decoder_hidden = gru(input_to_gru, decoder_hidden)
  output = F.relu(output)
  output = F.softmax(output_word_layer(output[0]), dim = 1)
  top_value, top_index = output.data.topk(1)
  print(output_sentence.split(" ")[i], output_indices[i], output_lang.index2word[top_index.item()], top_index.item() )
  print(attn_weights)

how 14 herbivores 9405
tensor([[0.1943, 0.0933, 0.1412, 0.1461, 0.0610, 0.0953, 0.0836, 0.1852]],
       device='cuda:0', grad_fn=<SoftmaxBackward0>)
can 92 nurse 6950
tensor([[0.1176, 0.2191, 0.1064, 0.0899, 0.1347, 0.0784, 0.0952, 0.1586]],
       device='cuda:0', grad_fn=<SoftmaxBackward0>)
one 93 web 8
tensor([[0.1685, 0.1060, 0.1135, 0.1128, 0.0928, 0.1412, 0.1397, 0.1256]],
       device='cuda:0', grad_fn=<SoftmaxBackward0>)
get 182 web 8
tensor([[0.1106, 0.1799, 0.0790, 0.0882, 0.1830, 0.0798, 0.0606, 0.2188]],
       device='cuda:0', grad_fn=<SoftmaxBackward0>)
insanely 5355 web 8
tensor([[0.1810, 0.0865, 0.1350, 0.1158, 0.1060, 0.0913, 0.0912, 0.1931]],
       device='cuda:0', grad_fn=<SoftmaxBackward0>)
rich 763 web 8
tensor([[0.1424, 0.0742, 0.1629, 0.1136, 0.0589, 0.0613, 0.2326, 0.1541]],
       device='cuda:0', grad_fn=<SoftmaxBackward0>)


In [80]:
output_indices, output_sentence, input_sentence

([14, 92, 93, 182, 5355, 763, 6, 1],
 'how can one get insanely rich ?',
 'can i be rich ?')

In [81]:
%matplotlib inline

In [82]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [83]:
device

device(type='cuda')

In [84]:
!wget http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv


--2022-01-08 16:40:43--  http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv
Resolving qim.fs.quoracdn.net (qim.fs.quoracdn.net)... 151.101.1.2, 151.101.65.2, 151.101.129.2, ...
Connecting to qim.fs.quoracdn.net (qim.fs.quoracdn.net)|151.101.1.2|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 58176133 (55M) [text/tab-separated-values]
Saving to: ‘quora_duplicate_questions.tsv.1’


2022-01-08 16:40:44 (219 MB/s) - ‘quora_duplicate_questions.tsv.1’ saved [58176133/58176133]



In [85]:
SOS_token = 0
EOS_token = 1


class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [86]:
# Turn a Unicode string to plain ASCII, thanks to
# https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters


def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

In [87]:
def readLangs(lang1, lang2, filename,reverse=False):
    print("Reading lines...")

    # Read the file and split into lines
    df = pd.read_csv(filename,sep='\t')
    df = df.dropna()
    df = df[['question1','question2']]
    for col in df.columns:
        # normalize
        df[col] =df[col].str.strip()
        df[col] = df[col].apply(normalizeString)

    pairs = df.values.tolist()
    
    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

In [88]:
MAX_LENGTH = 8

def filterPair(p):
    return len(p[0].split(' ')) < MAX_LENGTH and \
        len(p[1].split(' ')) < MAX_LENGTH


def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

In [91]:
def prepareData(lang1, lang2,filename, reverse=False):
    input_lang, output_lang, pairs = readLangs(lang1, lang2, filename,reverse)
    print("Read %s sentence pairs" % len(pairs))
    pairs = filterPairs(pairs)
    print("Trimmed to %s sentence pairs" % len(pairs))
    print("Counting words...")
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    print("Counted words:")
    print(input_lang.name, ':',input_lang.n_words)
    print(output_lang.name, ':',output_lang.n_words)
    return input_lang, output_lang, pairs


input_lang, output_lang, pairs = prepareData('Question Set1', 'Question Set2','quora_duplicate_questions.tsv', True)
print(random.choice(pairs))

Reading lines...
Read 404287 sentence pairs
Trimmed to 28944 sentence pairs
Counting words...
Counted words:
Question Set2 : 13523
Question Set1 : 14013
['what is iterative algorithm ?', 'what is algorithms ?']


In [92]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [93]:
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output, hidden = self.gru(output, hidden)
        output = F.relu(output)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [94]:
def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]


def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)


def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(input_lang, pair[0])
    target_tensor = tensorFromSentence(output_lang, pair[1])
    return (input_tensor, target_tensor)

In [95]:
teacher_forcing_ratio = 0.5


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    loss = 0

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)

    decoder_hidden = encoder_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]  # Teacher forcing

    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input

            loss += criterion(decoder_output, target_tensor[di])
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [96]:
import time
import math


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))



In [97]:
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    training_pairs = [tensorsFromPair(random.choice(pairs))
                      for i in range(n_iters)]
    criterion = nn.NLLLoss()

    for iter in range(1, n_iters + 1):
        training_pair = training_pairs[iter - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]

        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                         iter, iter / n_iters * 100, print_loss_avg))

        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)

In [98]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np


def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

In [99]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                     encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

        decoder_hidden = encoder_hidden

        decoded_words = []
        decoder_attentions = torch.zeros(max_length, max_length)

        for di in range(max_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            decoder_attentions[di] = decoder_attention.data
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(output_lang.index2word[topi.item()])

            decoder_input = topi.squeeze().detach()

        return decoded_words, decoder_attentions[:di + 1]

In [100]:
def evaluateRandomly(encoder, decoder, n=10):
    for i in range(n):
        pair = random.choice(pairs)
        print('>', pair[0])
        print('=', pair[1])
        output_words, attentions = evaluate(encoder, decoder, pair[0])
        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        print('')

In [101]:
hidden_size = 256
encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1).to(device)

trainIters(encoder1, attn_decoder1, 75000, print_every=5000)

3m 8s (- 43m 59s) (5000 6%) 4.4996
6m 10s (- 40m 8s) (10000 13%) 4.1008
9m 13s (- 36m 52s) (15000 20%) 4.0009
12m 15s (- 33m 41s) (20000 26%) 3.8864
15m 16s (- 30m 33s) (25000 33%) 3.8156
18m 18s (- 27m 27s) (30000 40%) 3.7528
21m 20s (- 24m 23s) (35000 46%) 3.6840
24m 21s (- 21m 19s) (40000 53%) 3.6471
27m 23s (- 18m 15s) (45000 60%) 3.6257
30m 25s (- 15m 12s) (50000 66%) 3.5872
33m 25s (- 12m 9s) (55000 73%) 3.5529
36m 26s (- 9m 6s) (60000 80%) 3.4820
39m 27s (- 6m 4s) (65000 86%) 3.4756
42m 27s (- 3m 1s) (70000 93%) 3.4799
45m 28s (- 0m 0s) (75000 100%) 3.3926


In [103]:
evaluateRandomly(encoder1, attn_decoder1)

> what do you think of happiness ?
= what do you think the happiness ?
< what do you think about ? <EOS>

> how can i gain weight naturally ?
= how should i gain muscle mass ?
< how do i lose weight ? <EOS>

> was adolf hitler jewish ?
= why was hitler a jew ?
< what is a ? ? ? <EOS>

> what is the power of cph ?
= where can i find cph ?
< what is the ? of <EOS>

> how will immigration change europe ?
= how has immigration changed europe ?
< how does a ? <EOS>

> how do i be stupid ?
= how not to be stupid ?
< how is is a ? ? <EOS>

> what is god ?
= what is god what is god ?
< what is a ? <EOS>

> what are mitochondria ?
= are mitochondria motile ?
< what are the ? <EOS>

> why is bitumen imported to india ?
= can bitumen be imported in india ?
< is is india ? <EOS>

> what are the best guitar faces ?
= what is the best guitar ?
< what is the best ? ? <EOS>

