<a href="https://colab.research.google.com/github/nikshrimali/ENDGAME_MERGER/blob/main/Assignment10/SQUAD_Attention_PADDED.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Importing Libraries


In [54]:
# Importing torch and essential libraries
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchtext.data import Field, BucketIterator, TabularDataset

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Currently running on {device}')

import spacy
spacy_en = spacy.load('en')

import numpy as np
import pandas as pd
import os
import random
import math
import time
import json
import random

import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s : %(levelname)s : %(message)s')
logger = logging.getLogger('SQUAD-PADDED_ATTN')

Currently running on cuda


In [55]:
# Getting the dataset
!wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json

# Getting the test dataset
!wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json

--2021-01-21 17:26:34--  https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json
Resolving rajpurkar.github.io (rajpurkar.github.io)... 185.199.108.153, 185.199.109.153, 185.199.110.153, ...
Connecting to rajpurkar.github.io (rajpurkar.github.io)|185.199.108.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 42123633 (40M) [application/json]
Saving to: ‘train-v2.0.json.1’


2021-01-21 17:26:34 (228 MB/s) - ‘train-v2.0.json.1’ saved [42123633/42123633]

--2021-01-21 17:26:34--  https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json
Resolving rajpurkar.github.io (rajpurkar.github.io)... 185.199.111.153, 185.199.109.153, 185.199.110.153, ...
Connecting to rajpurkar.github.io (rajpurkar.github.io)|185.199.111.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4370528 (4.2M) [application/json]
Saving to: ‘dev-v2.0.json.1’


2021-01-21 17:26:34 (110 MB/s) - ‘dev-v2.0.json.1’ saved [4370528/4370528]



In [56]:
# Setting seeds for reproducability

SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

## Loading Json and formatting

In [57]:
with open("train-v2.0.json") as f:
    train_dict = json.load(f)

with open("/content/dev-v2.0.json") as f:
    test_dict = json.load(f)

In [58]:
def json_to_df(json_dict):
    '''
    Takes input as a dictionary and returns a dataframe of columns InputData and Answer
    Currently returns the dataframe upto 10k rows due to storage constraints
    '''
    df = pd.DataFrame(columns=['InputData', 'Answer'])
    df_idx = 0
    for topic in json_dict["data"]:
        for pgraph in topic["paragraphs"]:
            
            for index, qa in enumerate(pgraph["qas"]):
                if not qa["is_impossible"]:
                    text = pgraph["context"]
                    question = qa["question"]
                    df.at[df_idx, 'InputData'] = "[CLS] " + question + " [SEP] " + text + " [SEP]"
                    df.at[df_idx, 'Answer'] = qa["answers"][0]['text']
                    df_idx += 1
                
    return df[:1000]

In [59]:
def get_pandas_data():

    '''Reads the pandas data if already exists'''

    if not os.path.exists('/content/train_data.csv'):
        train_data = json_to_df(train_dict)
        test_data = json_to_df(test_dict)
        train_data.to_csv('train_data.csv', index=False)
        test_data.to_csv('test_data.csv', index=False)
    else:
        train_data = pd.read_csv('/content/train_data.csv')
        test_data = pd.read_csv('/content/test_data.csv')
    return train_data, test_data

In [60]:
train_data, test_data = get_pandas_data()

In [61]:
# Lets see what our output looks like
print(train_data.head(10))
print(test_data.head(10))

                                           InputData               Answer
0  [CLS] When did Beyonce start becoming popular?...    in the late 1990s
1  [CLS] What areas did Beyonce compete in when s...  singing and dancing
2  [CLS] When did Beyonce leave Destiny's Child a...                 2003
3  [CLS] In what city and state did Beyonce  grow...       Houston, Texas
4  [CLS] In which decade did Beyonce become famou...           late 1990s
5  [CLS] In what R&B group was she the lead singe...      Destiny's Child
6  [CLS] What album made her a worldwide known ar...  Dangerously in Love
7  [CLS] Who managed the Destiny's Child group? [...       Mathew Knowles
8  [CLS] When did Beyoncé rise to fame? [SEP] Bey...           late 1990s
9  [CLS] What role did Beyoncé have in Destiny's ...          lead singer
                                           InputData                       Answer
0  [CLS] In what country is Normandy located? [SE...                       France
1  [CLS] When were the

## Converting the dataset into processable format

In [62]:
def tokenize_en(text):
    """
    Tokenizes English text from a string into a list of strings
    """
    return [tok.text for tok in spacy_en.tokenizer(text)]

### Field is like a tuple that converts the data into SRC and TRG format

Include_lengths = True set for padded sequence

In [63]:
SRC = Field(tokenize= tokenize_en, 
            init_token='<sos>', 
            eos_token='<eos>', 
            lower=True,
            include_lengths = True)

TRG = Field(tokenize = tokenize_en, 
            init_token='<sos>', 
            eos_token='<eos>', 
            lower=True)

fields = {'InputData': ('q', SRC), 'Answer': ('t', TRG)}

In [64]:
train_data, test_data = TabularDataset.splits(
                                path = '',   
                                train = 'train_data.csv',
                                test = 'test_data.csv',
                                format = 'csv',
                                fields = fields)

In [65]:
SRC.build_vocab(train_data, min_freq = 2, max_size= 10000)
TRG.build_vocab(train_data, min_freq = 2, max_size= 5000)

In [66]:
BATCH_SIZE = 24

train_iterator, test_iterator = BucketIterator.splits(
    (train_data, test_data), 
    batch_size = BATCH_SIZE,
    sort=True,
    sort_key = lambda x: len(x.q),
    device = device)

In [67]:
class Encoder(nn.Module):
    def __init__(self, input_dim, embedding_dim, enc_hidden_dim, dec_hidden_dim, dropout_size):
        super().__init__()

        # Embedding Hyperparameters
        # num_embeddings = Size of your input of your vocab
        # embedding dim = Size of your embeddings dimension
        
        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=embedding_dim)
        # From here we send our embeddings to a RNN which then generates output

        # Dropout is used in layers of embeddings and hidden states
        self.dropout = nn.Dropout(dropout_size)
        self.rnn = nn.GRU(input_size=embedding_dim, hidden_size=enc_hidden_dim, bidirectional=True)
        self.fc = nn.Linear(enc_hidden_dim*2, dec_hidden_dim) # enc_hid_dim * 2 because the nn is bidirectional in nature
        #Why fc output of fc layer is equal to the dec_hid_dim

    def forward(self, input_src, src_len):
        # src = [src_len, batch_size]
        logger.debug('$$$$$$$$$$$$ Encoder Logs Begins $$$$$$$$$$$$$$$')
        logger.debug(f'Shape of the input dim is [src_len, batch_size] -  {input_src.shape}')
        embedded_data = self.dropout(self.embedding(input_src))
        logger.debug(f'Shape of the embedding dim is [src_len, batch_size, embedded_dim] -  {embedded_data.shape}')
        # embedded = [src_len, batch_size, embedding_dim]

        # src_len = torch.as_tensor([24], dtype=torch.int64)
        # print('src len=', input_src[0].cpu())
        # src_len = src_len.unsqueeze(0)
        logger.debug(f'src_len = {src_len}')
        packed_embedding = nn.utils.rnn.pack_padded_sequence(embedded_data, src_len.cpu())
        packed_outputs, hidden = self.rnn(packed_embedding)

        # Add padding back to the output to add gpu processing
        outputs, _ = nn.utils.rnn.pad_packed_sequence(packed_outputs)

        logger.debug(f'Shape of the output from RNN is [src_len, batch_size, hidden_dim*num_directions] -  {outputs.shape}')
        logger.debug(f'Shape of the hidden from RNN is [src_len, batch_size, hidden_dim*num_directions] -  {hidden.shape}')

        # output = [src_len, batch_size, hidden_dim*num_directions]
        # hidden = [n_layers*num_direction, batch_size, hidden_dim]
        

        hidden = torch.tanh(self.fc(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)))

        logger.debug(f'Shape of the hidden after bidirection concat is -  {hidden.shape}')
        
        logger.debug('$$$$$$$$$$$$ Encoder Logs Ends $$$$$$$$$$$$$$$')
        # hidden_dim = [batch_size, dec hid dim]
        return outputs, hidden

In [68]:
# Attention mechanism

class Attention(nn.Module):
    def __init__(self, enc_hidden_dim, dec_hidden_dim):
        super().__init__()

        # Attention is basically a dot product between the outputs from the encoder and output from the decoder
        self.attn = nn.Linear((enc_hidden_dim*2) + dec_hidden_dim, dec_hidden_dim)
        self.v = nn.Linear(dec_hidden_dim, 1, bias=False)

    def forward(self, hidden, encoder_outputs, mask):
        # hidden = [batch_size, dec hid dim]
        # encoder_outputs = [src_len, batch_size, enc_hid_dim*2]
        logger.debug('######### Attention Log - Begins#############')
        logger.debug(f'shape of encoder outputs {encoder_outputs.shape}')
        batch_size = encoder_outputs.shape[1]
        src_len = encoder_outputs.shape[0]

        # Repeat decoder hidden state src_len times
        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
        logger.debug(f'shape of hidden after unsqueezing source lenght times {hidden.shape}')

        encoder_outputs = encoder_outputs.permute(1,0,2)

        # energy = [batch size, src len, enc_hidden_dim*2]
        # energy is concat of encoder output 
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))

        logger.debug(f'Shape of energy {energy.shape}')

        attention = self.v(energy).squeeze(2)

        # Fill the attention values of masked fill to very small values
        logger.debug(f' Shape of attention after squeeze(2) {attention.shape}')
        attention = attention.masked_fill(mask==0, -1e10)

        logger.debug(f'Shape of attention after masked fill:', attention.shape)
        # attention = [batch size, src len]

        logger.debug('################## Attention Logs Ends###############################')

        return F.softmax(attention, dim=1) # Get softmax outputs on dim = 1

In [69]:
class Decoder(nn.Module):
    def __init__(self, output_dim, embedding_dim,enc_hidden_dim, dec_hidden_dim, dropout_size, attention):
        super().__init__()
        self.output_dim = output_dim
        self.attention = attention
        

        self.embedding = nn.Embedding(num_embeddings=output_dim, embedding_dim=embedding_dim)

        # Output shape of embedding_dim is input_size, embedding_dim

        self.dropout = nn.Dropout(dropout_size)

        # Why enc_hidden_dim*2 - Input of attention is going into decoder rnn's along with embedding dim, and previous hidden state
        self.rnn = nn.GRU(input_size=(enc_hidden_dim*2 + embedding_dim), bidirectional=False, hidden_size=dec_hidden_dim)

        # Output shape of GRU is hidden is of shape - batch_size, input_dim, hidden_dim when unidirectional
        # Linear layers takes input from embedding layers, attention block, 

        self.fc_out = nn.Linear(in_features=(embedding_dim + enc_hidden_dim*2 + dec_hidden_dim), out_features= output_dim)

        # Output shape of linear  layers is concat of all(input, hidden, embedding)

    def forward(self, input, hidden, encoder_outputs, mask):

        # input = [batch_size]
        # hidden = [batch_size, dec_hidden_dim]
        # encoder_outputs = [src_len, batch_size, enc_hid_dim*2]

        logger.debug('&&&&&&&&&&&&&&& Decoder Logs Begins &&&&&&&&&&&&&&')
        logger.debug(f'Input before unsqueeeze {input}, after unsqueeze {input.unsqueeze(0)}')

        input = input.unsqueeze(0)
        # input = [1, batch_size]

        embedded_data = self.dropout(self.embedding(input))
        # embedded_data = [1, batch_size, embedding_dim]

        a = self.attention(hidden, encoder_outputs, mask)
        # attention = [batch_size, src_len]

        # Attention
        logger.debug('Decoder - attention shape before permute and  unsqueeze(1)', a.shape)    
        a = a.unsqueeze(1)
        # attention = [batch size, 1, src_len]
        logger.debug('Decoder - attention shape after unsqueeze(1)', a.shape)

        # Encoder Outputs
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        # encoder_outputs = [batch_size, src_len, enc_hid_dim*2]

        # If input is a (b×n×m) tensor, mat2 is a (b×m×p) tensor, out will be a (b×n×p) tensor.
        logger.debug(f'Attention shape {a.shape} encoder_output shape {encoder_outputs.shape}')
        weighted = torch.bmm(a, encoder_outputs).permute(1, 0, 2)
        # weighted = [24, 1, 80] * [24, 80, 1024] = [24, 80, 1024] = [80, 24, 1024]
        logger.debug(f'shape of weighted - {weighted.shape}')

        outputs, hidden = self.rnn(torch.cat((embedded_data, weighted), dim=2), hidden.unsqueeze(0))
        # output = [1, batch_size, dec_hid_dim]
        # hidden = [1, batch_size, dec_hid_dim]


        fc_input = torch.cat((embedded_data, outputs, weighted), dim=2)
        # fc_input = [1, batch_size, emb_dim + dec_hid_dim + enc_hid_dim*2]

        predictions = self.fc_out(fc_input).squeeze(0)
        # predictions -> [batch_size, output_dim]
         
        # Hidden is stacked forwards and backwards
        logger.debug(f'predictions - shape {predictions.shape}')
        logger.debug('&&&&&&&&&&&&&&& Decoder Logs Ends &&&&&&&&&&&&&&')

        return predictions, hidden.squeeze(0), a.squeeze(1)
        # predictions = [batch_size, output_dim]
        # hidden = [batch_size, dec_hid_dim]
        # a = [batch_size, src_len]

In [70]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device, src_pad_idx):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        self.src_pad_idx =  src_pad_idx
    
    def create_mask(self, src):
        mask = (src != self.src_pad_idx).permute(1,0)
        return mask

    def forward(self, src, trg, src_len ,teacher_forcing=0.5):
        # src = [src len, batch_size]
        # trg = [trg_len, batch_size]

        batch_size = src.shape[1]
        trg_len = trg.shape[0]

        # Decoder output dim what is?
        trg_vocab_size = self.decoder.output_dim

        # tensor to store decoder outputs
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)

        # encoder outputs is all hidden states of the input sequence

        encoder_outputs, hidden = self.encoder(src, src_len)

        # first input is <sos> token
        input = trg[0,:]
        for t in range(1,trg_len):
            mask = self.create_mask(src)
            output, hidden, attn = self.decoder(input, hidden, encoder_outputs, mask)

            outputs[t] = output

            teacher_force = random.random() < teacher_forcing

            # Highest predicted token from predictions
            top1 = output.argmax(1)

            # if teacher forcing, use actual next token as input
            input = trg[t] if teacher_force else top1
        return outputs

## Training the Seq2Seq model


In [71]:

INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
ENC_HID_DIM = 512
DEC_HID_DIM = 512
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

SRC_PAD_IDX = SRC.vocab.stoi[SRC.pad_token]

attn = Attention(ENC_HID_DIM, DEC_HID_DIM)
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)
            # input_dim, embedding_dim, enc_hidden_dim, dec_hidden_dim, dropout_size)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attn)
                # output_dim, embedding_dim,enc_hidden_dim, dec_hidden_dim, dropout_size, attention

model = Seq2Seq(enc, dec, device, SRC_PAD_IDX).to(device)

In [72]:
# weights and biases initialized to 0

def init_weights(m):
    for name, param in m.named_parameters():
        if 'weight' in name:
            nn.init.normal_(param.data, mean=0, std=0.01)
        else:
            nn.init.constant_(param.data,0)

model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(2918, 256)
    (dropout): Dropout(p=0.5, inplace=False)
    (rnn): GRU(256, 512, bidirectional=True)
    (fc): Linear(in_features=1024, out_features=512, bias=True)
  )
  (decoder): Decoder(
    (attention): Attention(
      (attn): Linear(in_features=1536, out_features=512, bias=True)
      (v): Linear(in_features=512, out_features=1, bias=False)
    )
    (embedding): Embedding(502, 256)
    (dropout): Dropout(p=0.5, inplace=False)
    (rnn): GRU(1280, 512)
    (fc_out): Linear(in_features=1792, out_features=502, bias=True)
  )
)

In [73]:
optimizer = optim.Adam(model.parameters())

In [74]:
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]

criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

In [79]:
def train(model, iterator, optimizer, criterion, clip):
    
    model.train() 
    epoch_loss = 0
    
    for i, batch in enumerate(iterator):

        # print('batch',batch)
        
        src, src_len = batch.q
        trg = batch.t        
        optimizer.zero_grad()        
        output = model(src, trg, src_len)
        
        #trg = [trg len, batch size]
        #output = [trg len, batch size, output dim]
        
        output_dim = output.shape[-1]
        
        output = output[1:].view(-1, output_dim)
        trg = trg[1:].view(-1)
        
        #trg = [(trg len - 1) * batch size]
        #output = [(trg len - 1) * batch size, output dim]
        
        loss = criterion(output, trg)
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [76]:
def evaluate(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):

            src, src_len = batch.q
            trg = batch.t

            output = model(src, trg, src_len) #turn off teacher forcing

            #trg = [trg len, batch size]
            #output = [trg len, batch size, output dim]

            output_dim = output.shape[-1]
            
            output = output[1:].view(-1, output_dim)
            trg = trg[1:].view(-1)

            #trg = [(trg len - 1) * batch size]
            #output = [(trg len - 1) * batch size, output dim]

            loss = criterion(output, trg)

            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [77]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [80]:
import tqdm
N_EPOCHS = 20
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    print('Training started')
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    print('training complete')
    valid_loss = evaluate(model, test_iterator, criterion)
    

    end_time = time.time()
    print(end_time)
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut3-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

Training started
training complete
1611250053.286074
Epoch: 01 | Time: 0m 6s
	Train Loss: 3.917 | Train PPL:  50.234
	 Val. Loss: 1.885 |  Val. PPL:   6.586
Training started
training complete
1611250059.6753592
Epoch: 02 | Time: 0m 6s
	Train Loss: 3.646 | Train PPL:  38.327
	 Val. Loss: 2.250 |  Val. PPL:   9.486
Training started
training complete
1611250065.9895358
Epoch: 03 | Time: 0m 6s
	Train Loss: 3.601 | Train PPL:  36.638
	 Val. Loss: 2.214 |  Val. PPL:   9.149
Training started
training complete
1611250072.3503861
Epoch: 04 | Time: 0m 6s
	Train Loss: 3.773 | Train PPL:  43.521
	 Val. Loss: 2.112 |  Val. PPL:   8.265
Training started
training complete
1611250078.7238147
Epoch: 05 | Time: 0m 6s
	Train Loss: 3.455 | Train PPL:  31.651
	 Val. Loss: 2.118 |  Val. PPL:   8.315
Training started
training complete
1611250085.0924964
Epoch: 06 | Time: 0m 6s
	Train Loss: 3.421 | Train PPL:  30.602
	 Val. Loss: 2.174 |  Val. PPL:   8.798
Training started
training complete
1611250091.4957643

In [81]:

model.load_state_dict(torch.load('tut3-model.pt'))

test_loss = evaluate(model, test_iterator, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

| Test Loss: 1.883 | Test PPL:   6.575 |


## Inferencing the model

In [None]:
import matplotlib.pyplot as plt
from matplotlib import ticker

def display_attention(question, answer, attention):

  fig = plt.figure(figsize=(10, 10))
  ax = fig.add_subplot(111)

  attention = attention.squeeze(1).cpu().detach().numpy()

  cax = ax.matshow(attention, cmap='bone')

  ax.tick_params(labelsize=15)
  ax.set_xticklabels(['']+['<sos>']+[t.lower() for t in sentence]+['<eos>'], rotation=45)
  ax.set_yticklabels(['']+translation)

  ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
  ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

  plt.show()
  plt.close()