<a href="https://colab.research.google.com/github/nikshrimali/ENDGAME_MERGER/blob/main/Assignment10/SQUAD_Attention_PADDED.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Importing Libraries


In [9]:
# Importing torch and essential libraries
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchtext.data import Field, BucketIterator, TabularDataset

# Prefer the GPU whenever PyTorch can see one, otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Currently running on {device}')

import spacy
# Load the English pipeline by its package name: the 'en' shortcut link only exists in
# spaCy v2 and was removed in v3, while 'en_core_web_sm' resolves in both when installed.
spacy_en = spacy.load('en_core_web_sm')

import numpy as np
import pandas as pd
import os
import random
import math
import time
import json
import random

Currently running on cuda


In [2]:
# Getting the dataset
!wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json

# Getting the test dataset
!wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json

--2021-01-13 02:09:52--  https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json
Resolving rajpurkar.github.io (rajpurkar.github.io)... 185.199.109.153, 185.199.108.153, 185.199.111.153, ...
Connecting to rajpurkar.github.io (rajpurkar.github.io)|185.199.109.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 42123633 (40M) [application/json]
Saving to: ‘train-v2.0.json’


2021-01-13 02:09:52 (225 MB/s) - ‘train-v2.0.json’ saved [42123633/42123633]

--2021-01-13 02:09:52--  https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json
Resolving rajpurkar.github.io (rajpurkar.github.io)... 185.199.109.153, 185.199.108.153, 185.199.111.153, ...
Connecting to rajpurkar.github.io (rajpurkar.github.io)|185.199.109.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4370528 (4.2M) [application/json]
Saving to: ‘dev-v2.0.json’


2021-01-13 02:09:52 (73.1 MB/s) - ‘dev-v2.0.json’ saved [4370528/4370528]



In [3]:
# Pin every random number generator used in this notebook so reruns are reproducible

SEED = 1234

# Ask cuDNN for deterministic kernels, then seed PyTorch (GPU and CPU), NumPy and Python
torch.backends.cudnn.deterministic = True
torch.cuda.manual_seed(SEED)
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

## Loading JSON and formatting

In [4]:
# Both files are downloaded into the current working directory, so open them with the
# same relative form instead of mixing a relative path with a hard-coded /content/ path.
# The encoding is explicit so the result does not depend on the platform default.
with open("train-v2.0.json", encoding="utf-8") as f:
    train_dict = json.load(f)

with open("dev-v2.0.json", encoding="utf-8") as f:
    test_dict = json.load(f)

In [6]:
def json_to_df(json_dict, max_rows=10000):
    '''
    Flattens a SQuAD-style dictionary into a dataframe of columns InputData and Answer.

    Every answerable question becomes one row:
        InputData = "[CLS] <question> [SEP] <context> [SEP]"
        Answer    = text of the first listed answer

    Args:
        json_dict: parsed SQuAD json, i.e. {"data": [{"paragraphs": [{"context": ..., "qas": [...]}]}]}
        max_rows: maximum number of rows to return (defaults to 10k due to storage
            constraints); pass None to keep every answerable question

    Returns:
        pandas DataFrame with columns ['InputData', 'Answer'] and at most max_rows rows
    '''
    columns = ['InputData', 'Answer']
    records = []

    for topic in json_dict["data"]:
        for pgraph in topic["paragraphs"]:
            text = pgraph["context"]
            for qa in pgraph["qas"]:
                # Stop as soon as the cap is reached instead of flattening the whole
                # dataset and throwing most of it away afterwards
                if max_rows is not None and len(records) >= max_rows:
                    return pd.DataFrame(records, columns=columns)

                # Files without the "is_impossible" flag are treated as answerable;
                # questions with no answer text cannot produce a target, so skip them
                answers = qa.get("answers")
                if qa.get("is_impossible", False) or not answers:
                    continue

                records.append({
                    'InputData': "[CLS] " + qa["question"] + " [SEP] " + text + " [SEP]",
                    'Answer': answers[0]['text'],
                })

    # Building the frame once from a list of records avoids growing it cell by cell
    return pd.DataFrame(records, columns=columns)

In [7]:
def get_pandas_data():
    '''
    Returns (train_data, test_data) dataframes, reusing the cached CSVs when they exist.

    The cache is looked up under the same relative names it is written to
    ('train_data.csv' and 'test_data.csv' in the working directory). If either file is
    missing, both frames are rebuilt from the module-level train_dict / test_dict with
    json_to_df and written back to disk for the next run.
    '''
    train_csv = 'train_data.csv'
    test_csv = 'test_data.csv'

    # Require both files: checking only the train CSV would crash on a missing test CSV
    if os.path.exists(train_csv) and os.path.exists(test_csv):
        return pd.read_csv(train_csv), pd.read_csv(test_csv)

    train_data = json_to_df(train_dict)
    test_data = json_to_df(test_dict)
    train_data.to_csv(train_csv, index=False)
    test_data.to_csv(test_csv, index=False)
    return train_data, test_data

In [8]:
# Build the train/test dataframes (or load them from the CSV cache when it exists)
train_data, test_data = get_pandas_data()

In [17]:
# Lets see what our output looks like
# NOTE(review): train_data / test_data are rebound by the later TabularDataset.splits
# cell. This cell's execution count (17) is higher than that cell's (13), so the
# recorded TypeError presumably comes from running it after the rebinding - confirm.
# Run it right after get_pandas_data() while the names still hold dataframes.
print(train_data.head(10))
print(test_data.head(10))

TypeError: ignored

## Converting the dataset into processable format

In [10]:
def tokenize_en(text):
    """
    Splits an English string with the spaCy tokenizer and returns the token texts as a list of strings
    """
    token_texts = []
    for token in spacy_en.tokenizer(text):
        token_texts.append(token.text)
    return token_texts

### A `Field` describes how a column is tokenized and converted to token ids; `SRC` handles the input text and `TRG` handles the answer

In [12]:
# The source and target fields use identical settings (spaCy tokenization, lower-casing,
# <sos>/<eos> markers) but must be two separate Field objects, one per column.
SRC, TRG = (
    Field(tokenize=tokenize_en, init_token='<sos>', eos_token='<eos>', lower=True)
    for _ in range(2)
)

# CSV column name -> (attribute name on each example, Field that processes it)
fields = {
    'InputData': ('q', SRC),
    'Answer': ('t', TRG),
}

In [13]:
# Parse both CSVs (relative to the working directory) using the column mapping in `fields`
train_data, test_data = TabularDataset.splits(
    fields=fields,
    format='csv',
    path='',
    train='train_data.csv',
    test='test_data.csv',
)

In [14]:
# Both vocabularies are built from the training split only, with a minimum token
# frequency of 2 and a size cap of 10000 (source) / 5000 (target)
TRG.build_vocab(train_data, max_size=5000, min_freq=2)
SRC.build_vocab(train_data, max_size=10000, min_freq=2)

In [15]:
BATCH_SIZE = 24

# One iterator per split, with batches created directly on `device`.
# NOTE(review): no sort_key is passed, so batches are presumably not grouped by
# length - confirm this is intended given how long the contexts are.
train_iterator, test_iterator = BucketIterator.splits(
    (train_data, test_data),
    device=device,
    sort=False,
    batch_size=BATCH_SIZE,
)

# Modelling

## Attention Mechanism 

The model has three main components: an encoder, a decoder and an attention block. A Seq2Seq block of code then connects all of them into a single sequence-to-sequence model.

## Encoder

This block takes input tokens drawn from the source vocabulary. Its hyperparameters are the vocabulary size, the embedding dimension and the hidden dimension; the embedding is trained along with the rest of the model. It is a bi-directional GRU block, so it returns both the per-token outputs and a hidden state.

## Decoder 

This block takes inputs from the encoder and the attention block. Unlike the encoder, it uses a unidirectional GRU block, with a linear layer attached to it.

## Attention block

This block takes the encoder outputs and the hidden state of the decoder, and generates a similarity score between them, which helps the decoder focus on a particular section of the input rather than all of it.

In [None]:
class Encoder(nn.Module):
    """
    Bidirectional GRU encoder that also produces the decoder's initial hidden state.

    Args:
        input_dim: size of the source vocabulary (number of embedding rows)
        embedding_dim: size of each token embedding
        enc_hidden_dim: hidden size of each GRU direction
        dec_hid_dim: hidden size the final encoder state is projected to
        dropout_size: dropout probability applied to the embeddings
    """

    def __init__(self, input_dim, embedding_dim, enc_hidden_dim, dec_hid_dim, dropout_size):
        super().__init__()

        # Embedding Hyperparameters
        # num_embeddings = Size of your input of your vocab
        # embedding dim = Size of your embeddings dimension
        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=embedding_dim)

        # Dropout is applied to the embeddings before they reach the RNN
        self.dropout = nn.Dropout(dropout_size)
        self.rnn = nn.GRU(input_size=embedding_dim, hidden_size=enc_hidden_dim, bidirectional=True)
        # enc_hidden_dim * 2 because the forward and backward states are concatenated
        self.fc = nn.Linear(enc_hidden_dim * 2, dec_hid_dim)

    def forward(self, input_src):
        """
        Encodes a batch of source sequences.

        Args:
            input_src: token indices of shape [src_len, batch_size]

        Returns:
            outputs: [src_len, batch_size, enc_hidden_dim * 2], per-token GRU outputs
            hidden: [batch_size, dec_hid_dim], tanh-projected final states
        """
        # embedded_data = [src_len, batch_size, embedding_dim]
        embedded_data = self.dropout(self.embedding(input_src))

        # outputs = [src_len, batch_size, enc_hidden_dim * num_directions]
        # hidden = [n_layers * num_directions, batch_size, enc_hidden_dim]
        outputs, hidden = self.rnn(embedded_data)

        # hidden[-2] / hidden[-1] are the last layer's forward / backward final states;
        # concatenate them along the feature axis before projecting
        final_states = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        hidden = torch.tanh(self.fc(final_states))

        # hidden = [batch_size, dec_hid_dim]
        return outputs, hidden


In [None]:
class Decoder(nn.Module):
    """
    Unidirectional, single-layer GRU decoder with an output projection layer.

    Args:
        output_dim: size of the target vocabulary
        embedding_dim: size of each token embedding
        enc_hidden_dim: hidden size of each encoder direction (the encoder is bidirectional)
        dec_hidden_dim: hidden size of the decoder GRU
        dropout_size: dropout probability applied to the embeddings
        attention: attention module, stored as self.attention
    """

    def __init__(self, output_dim, embedding_dim, enc_hidden_dim, dec_hidden_dim, dropout_size, attention):
        super().__init__()

        self.output_dim = output_dim
        # Keep a reference so the attention module is registered with the decoder
        self.attention = attention

        self.embedding = nn.Embedding(num_embeddings=output_dim, embedding_dim=embedding_dim)
        self.dropout = nn.Dropout(dropout_size)
        # nn.GRU names its depth argument num_layers (n_layers raises a TypeError)
        self.rnn = nn.GRU(input_size=embedding_dim, hidden_size=dec_hidden_dim, num_layers=1, bidirectional=False)
        # The linear layer is sized for the concatenation of the embedding, the attention
        # context (enc_hidden_dim * 2) and the GRU output
        self.linear = nn.Linear(embedding_dim + enc_hidden_dim * 2 + dec_hidden_dim, output_dim)

    def forward(self, output_trg):
        """
        Embeds the target tokens and runs them through the GRU.

        Args:
            output_trg: target token indices; assumed [trg_len, batch_size] because the
                GRU is not batch_first - TODO confirm against the caller

        Returns:
            output: GRU outputs, [trg_len, batch_size, dec_hidden_dim]
            hidden: final GRU state, [1, batch_size, dec_hidden_dim]
        """
        embedded_data = self.dropout(self.embedding(output_trg))
        output, hidden = self.rnn(embedded_data)

        # TODO: self.attention and self.linear are not used yet; wiring them in needs the
        # previous hidden state and the encoder outputs, which this signature does not receive.
        return output, hidden


In [None]:
class Attention(nn.Module):
    """
    Additive attention: scores each encoder position against the decoder hidden state.

    Args:
        enc_hidden_dim: hidden size of each encoder direction (outputs carry enc_hidden_dim * 2)
        dec_hidden_dim: hidden size of the decoder state
    """

    def __init__(self, enc_hidden_dim, dec_hidden_dim):
        super().__init__()

        # The score comes from a small feed-forward network over the concatenated
        # decoder state and encoder output, reduced to one scalar per position by v
        self.attn = nn.Linear((enc_hidden_dim * 2) + dec_hidden_dim, dec_hidden_dim)
        self.v = nn.Linear(dec_hidden_dim, 1, bias=False)

    def forward(self, hidden, encoder_outputs):
        """
        Args:
            hidden: decoder state, [batch_size, dec_hidden_dim]
            encoder_outputs: [src_len, batch_size, enc_hidden_dim * 2]

        Returns:
            Attention weights of shape [batch_size, src_len]; each row sums to 1.
        """
        src_len = encoder_outputs.shape[0]

        # Repeat the decoder state once per source position: [batch_size, src_len, dec_hidden_dim]
        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
        # Put the batch first to line up with hidden: [batch_size, src_len, enc_hidden_dim * 2]
        encoder_outputs = encoder_outputs.permute(1, 0, 2)

        # torch.cat takes the tensors as one sequence; dim belongs to cat, not to self.attn
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))
        # energy = [batch_size, src_len, dec_hidden_dim]

        # Collapse each position to a scalar score, then normalise over the source length
        attention = self.v(energy).squeeze(2)
        return F.softmax(attention, dim=1)

