<a href="https://colab.research.google.com/github/nrajmalwar/END2.0/blob/main/Session_07/Part2_Seq2Seq_Datasets/Question_Answer_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Check GPU

In [1]:
!nvidia-smi

Thu Jun 24 14:06:07 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P8    28W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Import Libraries

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim

from torchtext.legacy.data import Field, BucketIterator,TabularDataset

import spacy
import numpy as np
import pandas as pd
import glob

import random
import math
import time

SEED = 1234

# Seed the Python, NumPy and PyTorch (CPU + CUDA) generators so that weight
# initialisation and the teacher-forcing coin flips are repeatable.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)  # every visible GPU, not only the current one

# Ask cuDNN for deterministic kernels and explicitly switch off its auto-tuner,
# which may otherwise pick a different algorithm from one run to the next.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Use GPU as device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Device: ", device)

Device:  cuda


In [3]:
%%bash
# Download the model under its full package name: this is the name spacy.load() is given later, and the `en` shortcut is not accepted by spaCy v3+.
python -m spacy download en_core_web_sm

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.7/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.7/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


# Data Preprocessing

In [4]:
!wget http://www.cs.cmu.edu/~ark/QA-data/data/Question_Answer_Dataset_v1.2.tar.gz

--2021-06-24 14:06:19--  http://www.cs.cmu.edu/~ark/QA-data/data/Question_Answer_Dataset_v1.2.tar.gz
Resolving www.cs.cmu.edu (www.cs.cmu.edu)... 128.2.42.95
Connecting to www.cs.cmu.edu (www.cs.cmu.edu)|128.2.42.95|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8254496 (7.9M) [application/x-gzip]
Saving to: ‘Question_Answer_Dataset_v1.2.tar.gz’


2021-06-24 14:06:52 (247 KB/s) - ‘Question_Answer_Dataset_v1.2.tar.gz’ saved [8254496/8254496]



In [5]:
!ls

Question_Answer_Dataset_v1.2.tar.gz  sample_data


In [6]:
# extract files
!tar -xvzf Question_Answer_Dataset_v1.2.tar.gz

Question_Answer_Dataset_v1.2/
Question_Answer_Dataset_v1.2/S08/
Question_Answer_Dataset_v1.2/S08/question_answer_pairs.txt
Question_Answer_Dataset_v1.2/S08/data/
Question_Answer_Dataset_v1.2/S08/data/set4/
Question_Answer_Dataset_v1.2/S08/data/set4/a6.txt.clean
Question_Answer_Dataset_v1.2/S08/data/set4/a3.txt.clean
Question_Answer_Dataset_v1.2/S08/data/set4/a3.txt
Question_Answer_Dataset_v1.2/S08/data/set4/a5.txt
Question_Answer_Dataset_v1.2/S08/data/set4/a4o.htm
Question_Answer_Dataset_v1.2/S08/data/set4/a3.htm
Question_Answer_Dataset_v1.2/S08/data/set4/a9.htm
Question_Answer_Dataset_v1.2/S08/data/set4/a2.txt
Question_Answer_Dataset_v1.2/S08/data/set4/a9.txt.clean
Question_Answer_Dataset_v1.2/S08/data/set4/a4.htm
Question_Answer_Dataset_v1.2/S08/data/set4/a4.txt
Question_Answer_Dataset_v1.2/S08/data/set4/a4.txt.clean
Question_Answer_Dataset_v1.2/S08/data/set4/a2.htm
Question_Answer_Dataset_v1.2/S08/data/set4/a7o.htm
Question_Answer_Dataset_v1.2/S08/data/set4/a6.txt
Question_Answer_Da

In [7]:
# One tab-separated question/answer file per folder (S08, S09, S10)
path1 = 'Question_Answer_Dataset_v1.2/S08/question_answer_pairs.txt'
path2 = 'Question_Answer_Dataset_v1.2/S09/question_answer_pairs.txt'
path3 = 'Question_Answer_Dataset_v1.2/S10/question_answer_pairs.txt'

all_files = [path1, path2, path3]
df = []  # collects one DataFrame per file, in the order listed above

# Load every file, report its origin and size, and keep the frame for merging
for filename in all_files:
  data = pd.read_csv(
      filename,
      sep='\t',
      header=0,
      index_col=None,
      encoding='ISO-8859-1',
  )
  print(f'Filename: {filename}', f'Shape: {data.shape}\n', sep='\n')
  df.append(data)

Filename: Question_Answer_Dataset_v1.2/S08/question_answer_pairs.txt
Shape: (1715, 6)

Filename: Question_Answer_Dataset_v1.2/S09/question_answer_pairs.txt
Shape: (825, 6)

Filename: Question_Answer_Dataset_v1.2/S10/question_answer_pairs.txt
Shape: (1458, 6)



In [8]:
# Merge all data: stack the per-file frames row-wise. ignore_index=True
# discards each file's own row labels and renumbers the merged rows from 0.
data_all = pd.concat(df, axis=0, ignore_index=True)
data_all.shape

(3998, 6)

In [9]:
# Print 5 random samples
data_all.sample(5)

Unnamed: 0,ArticleTitle,Question,Answer,DifficultyFromQuestioner,DifficultyFromAnswerer,ArticleFile
268,Calvin_Coolidge,What Retrieved on May?,,,,data/set3/a9
2944,Drum,Is the drum a member of the percussion group?,yes,easy,easy,data/set2/a4
3555,Nikola_Tesla,Was Nikola Tesla a vegetarian?,Yes,hard,easy,data/set4/a3
1619,Uruguay,Is uruguay's landscape mountainous?,not really?,hard,too hard,data/set2/a9
3716,San_Francisco,How large is the population of San Francisco?,San Francisco has an estimated population of 8...,medium,medium,data/set3/a8


## Remove null or NaN records from Question and Answer

In [10]:
# Identify NaN values
data_all.isnull().sum()

ArticleTitle                  0
Question                     37
Answer                      576
DifficultyFromQuestioner    955
DifficultyFromAnswerer      580
ArticleFile                   2
dtype: int64

In [11]:
# Drop rows where Question or Answer is missing
# NOTE: inplace=True mutates data_all itself, so the unfiltered merged frame
# is no longer available under this name once the cell has run.
data_all.dropna(subset=['Question', 'Answer'], inplace=True)
# Re-count missing values to confirm both columns are now complete
data_all.isnull().sum()

ArticleTitle                  0
Question                      0
Answer                        0
DifficultyFromQuestioner    688
DifficultyFromAnswerer        5
ArticleFile                   2
dtype: int64

In [12]:
# save as .tsv file
# index=False keeps the DataFrame's row labels out of the file; without it they
# are written as an extra, unnamed leading column that nothing downstream reads.
data_all.to_csv("Final_data.tsv", sep='\t', index=False)

In [13]:
# spaCy English pipeline; only its tokenizer is used by tokenize_en
spacy_en = spacy.load('en_core_web_sm')

def tokenize_en(text):
    """
    Break an English string into tokens with the spaCy tokenizer and
    return the token strings as a list, in their original order.
    """
    tokens = []
    for token in spacy_en.tokenizer(text):
        tokens.append(token.text)
    return tokens

In [14]:
data_all.sample(5)

Unnamed: 0,ArticleTitle,Question,Answer,DifficultyFromQuestioner,DifficultyFromAnswerer,ArticleFile
395,Egypt,Is Egypt the most populated country in Africa?,no,easy,medium,data/set2/a6
865,James_Monroe,"What dwindled and eventually died out, startin...",The Federalist Party,,easy,data/set3/a2
3504,Montreal,What is the name of the largest church in Mont...,The largest church in Montreal is named Saint ...,medium,hard,data/set3/a7
2616,Amedeo_Avogadro,Is Amedeo Avogadro Italian?,Yes,hard,easy,data/set4/a8
3275,Korean_language,How many verb paradigms are there in Korean?,There are seven verb paradigms or speech level...,easy,easy,data/set5/a6


## Field Variables

In [15]:
# Questions (source) and answers (target) are processed the same way:
# spaCy tokenisation, lower-casing, and <sos>/<eos> markers around each sequence.
SRC = Field(lower=True, tokenize=tokenize_en, init_token='<sos>', eos_token='<eos>')
TRG = Field(lower=True, tokenize=tokenize_en, init_token='<sos>', eos_token='<eos>')

# Map each .tsv column name to (attribute name on the example, Field that processes it)
fields = dict(Question=('src', SRC), Answer=('trg', TRG))

## Create Dataset

In [16]:
# Parse the saved .tsv using the column -> Field mapping defined in `fields`
qa_data = TabularDataset(fields=fields, format='tsv', path='Final_data.tsv')

In [17]:
print("Example of the dataset:", vars(qa_data.examples[1868]))

Example of the dataset: {'src': ['did', 'newton', 'reject', 'the', 'church', "'s", 'doctrine', 'of', 'the', 'trinity', '?'], 'trg': ['newton', 'may', 'have', 'rejected', 'the', 'church', "'s", 'doctrine', 'of', 'the', 'trinity', '.']}


## Train/Test Split

In [18]:
# Train-test split: 70% of the examples for training, 30% for testing
train_data, test_data = qa_data.split(split_ratio=[0.7, 0.3])

# Report how many examples landed in each split
print(f"Number of training examples: {len(train_data.examples)}")
print(f"Number of testing examples: {len(test_data.examples)}")

Number of training examples: 2395
Number of testing examples: 1027


In [19]:
print("Example of training set:", vars(train_data.examples[0]))

Example of training set: {'src': ['how', 'does', 'poverty', 'in', 'san', 'francisco', 'compare', 'to', 'the', 'nation', '-', 'wide', 'average', '?'], 'trg': ['san', 'francisco', "'s", 'poverty', 'rate', 'is', 'lower', 'than', 'the', 'national', 'average', '.']}


In [20]:
print("Example of testing set:", vars(test_data.examples[0]))

Example of testing set: {'src': ['where', 'is', 'the', 'most', 'densely', 'populated', 'part', 'of', 'canada', '?'], 'trg': ['the', 'most', 'densely', 'populated', 'part', 'of', 'the', 'country', 'is', 'the', 'quebec', 'city', '-', 'windsor', 'corridor', 'along', 'the', 'great', 'lakes', 'and', 'saint', 'lawrence', 'river', 'in', 'the', 'southeast', '.']}


## Build Vocab

In [21]:
# Build both vocabularies from the training split only, so no test-set token
# influences them; min_freq=2 sets the minimum count a token needs to be included.
SRC.build_vocab(train_data, min_freq = 2)
TRG.build_vocab(train_data, min_freq = 2)

# Source = questions, target = answers (both English)
print(f"Unique tokens in source (question) vocabulary: {len(SRC.vocab)}")
print(f"Unique tokens in target (answer) vocabulary: {len(TRG.vocab)}")

Unique tokens in source (de) vocabulary: 1979
Unique tokens in target (en) vocabulary: 1280


## Create iterator variables

In [22]:
BATCH_SIZE = 128

# Batches are built (and sorted internally) by question length, measured in tokens,
# and every batch tensor is placed on `device`.
train_iterator, test_iterator = BucketIterator.splits(
    (train_data, test_data),
    device=device,
    batch_size=BATCH_SIZE,
    sort_within_batch=True,
    sort_key=lambda example: len(example.src),
)

# Seq2Seq Architecture

In [23]:
class Encoder(nn.Module):
    """Multi-layer LSTM encoder that condenses a source sequence into its final states.

    Token indices are embedded, passed through dropout and fed to an LSTM;
    only the LSTM's final hidden and cell states are returned.

    Args:
        input_dim: number of rows in the embedding table (source vocabulary size).
        emb_dim: size of each embedding vector.
        hid_dim: size of the LSTM hidden state.
        n_layers: number of stacked LSTM layers.
        dropout: dropout probability, used on the embeddings and between LSTM layers.
    """

    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()

        # Kept as attributes so other modules can inspect the encoder's shape
        self.hid_dim, self.n_layers = hid_dim, n_layers

        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=emb_dim)
        self.rnn = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hid_dim,
            num_layers=n_layers,
            dropout=dropout,
        )
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, src):
        """Encode a batch of source sequences.

        Args:
            src: token indices, shape [src len, batch size].

        Returns:
            (hidden, cell), each of shape [n layers, batch size, hid dim].
        """
        # [src len, batch size] -> [src len, batch size, emb dim]
        embedded = self.embedding(src)
        embedded = self.dropout(embedded)

        # The per-step outputs of the top layer are not needed, only the final states
        _, final_state = self.rnn(embedded)
        hidden, cell = final_state

        return hidden, cell

In [24]:
class Decoder(nn.Module):
    """Multi-layer LSTM decoder that scores the next target token, one step per call.

    Args:
        output_dim: target vocabulary size (embedding rows and number of output scores).
        emb_dim: size of each embedding vector.
        hid_dim: size of the LSTM hidden state.
        n_layers: number of stacked LSTM layers.
        dropout: dropout probability, used on the embeddings and between LSTM layers.
    """

    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()

        self.output_dim = output_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers

        self.embedding = nn.Embedding(num_embeddings=output_dim, embedding_dim=emb_dim)
        self.rnn = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hid_dim,
            num_layers=n_layers,
            dropout=dropout,
        )
        self.fc_out = nn.Linear(in_features=hid_dim, out_features=output_dim)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, input, hidden, cell):
        """Run a single decoding step.

        Args:
            input: current token indices, shape [batch size].
            hidden: previous hidden state, shape [n layers, batch size, hid dim].
            cell: previous cell state, shape [n layers, batch size, hid dim].

        Returns:
            (prediction, hidden, cell): unnormalised scores of shape
            [batch size, output dim] plus the updated LSTM states.
        """
        # The LSTM expects a sequence axis, so add one of length 1: [1, batch size]
        step_tokens = input.unsqueeze(0)

        # [1, batch size, emb dim]
        embedded = self.dropout(self.embedding(step_tokens))

        # output: [1, batch size, hid dim]; states keep their [n layers, batch size, hid dim] shape
        output, state = self.rnn(embedded, (hidden, cell))
        hidden, cell = state

        # Remove the length-1 sequence axis before projecting to vocabulary scores
        prediction = self.fc_out(output.squeeze(0))

        return prediction, hidden, cell

In [25]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes a target sequence step by step.

    The encoder's final hidden and cell states initialise the decoder, which is
    then run once per target position.

    Args:
        encoder: module returning (hidden, cell) for a source batch; must expose
            `hid_dim` and `n_layers`.
        decoder: module mapping (input, hidden, cell) to (prediction, hidden, cell);
            must expose `hid_dim`, `n_layers` and `output_dim`.
        device: device on which the output buffer is allocated.

    Raises:
        AssertionError: if encoder and decoder disagree on `hid_dim` or `n_layers`.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
        # The encoder states are handed to the decoder unchanged, so shapes must match
        assert encoder.hid_dim == decoder.hid_dim, \
            "Hidden dimensions of encoder and decoder must be equal!"
        assert encoder.n_layers == decoder.n_layers, \
            "Encoder and decoder must have equal number of layers!"
        
    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        """Decode `trg.shape[0]` steps conditioned on `src`.

        Args:
            src: source token indices, shape [src len, batch size].
            trg: target token indices, shape [trg len, batch size]; row 0 holds
                the first decoder input.
            teacher_forcing_ratio: probability, drawn once per step for the whole
                batch, of feeding the ground-truth token instead of the model's
                own top prediction (e.g. 0.75 -> ground truth 75% of the time).

        Returns:
            Tensor of shape [trg len, batch size, output dim]. Position 0 is never
            written and stays all zeros.
        """
        trg_len, batch_size = trg.shape[0], trg.shape[1]
        trg_vocab_size = self.decoder.output_dim
        
        # Allocate the output buffer directly on the target device instead of
        # creating it on the CPU and copying it over.
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size, device=self.device)
        
        # Final encoder states become the decoder's initial states
        hidden, cell = self.encoder(src)
        
        # First decoder input is row 0 of trg (the <sos> tokens)
        input = trg[0,:]
        
        for t in range(1, trg_len):
            
            # One decoder step: scores for position t plus updated states
            output, hidden, cell = self.decoder(input, hidden, cell)
            
            outputs[t] = output
            
            # Single coin flip per step, shared by every sequence in the batch
            teacher_force = random.random() < teacher_forcing_ratio
            
            # Highest-scoring token for each sequence
            top1 = output.argmax(1) 
            
            # Next input: ground truth when teacher forcing, otherwise the prediction
            input = trg[t] if teacher_force else top1
        
        return outputs

# Model instance

In [26]:
# Vocabulary sizes determine the embedding tables and the output layer width
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)

# Model hyperparameters
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

enc = Encoder(
    input_dim=INPUT_DIM,
    emb_dim=ENC_EMB_DIM,
    hid_dim=HID_DIM,
    n_layers=N_LAYERS,
    dropout=ENC_DROPOUT,
)
dec = Decoder(
    output_dim=OUTPUT_DIM,
    emb_dim=DEC_EMB_DIM,
    hid_dim=HID_DIM,
    n_layers=N_LAYERS,
    dropout=DEC_DROPOUT,
)

# Assemble the full model and move its parameters to the selected device
model = Seq2Seq(encoder=enc, decoder=dec, device=device).to(device)

In [27]:
def init_weights(m):
    """Overwrite every parameter reachable from module `m` with draws from U(-0.08, 0.08)."""
    for weight in m.parameters():
        nn.init.uniform_(weight.data, a=-0.08, b=0.08)
        
# Module.apply calls init_weights on every submodule and then on the model itself.
# init_weights already walks parameters recursively, so nested parameters are
# re-drawn on each of those calls; all draws come from the same U(-0.08, 0.08) range.
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(1979, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(1280, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (fc_out): Linear(in_features=512, out_features=1280, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [28]:
def count_parameters(model):
    """Return the total number of elements across the model's parameters that require gradients."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 8,847,360 trainable parameters


# Optimizer and Loss Function

In [29]:
# Adam over all model parameters; no learning rate is passed, so the optimizer's defaults apply
optimizer = optim.Adam(model.parameters())

# Index of the padding token in the target vocabulary
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]

# ignore_index keeps padded target positions out of the loss
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

# Train and Evaluation Functions

In [30]:
def train(model, iterator, optimizer, criterion, clip):
    """Run one optimisation pass over `iterator` and return the mean per-batch loss.

    Args:
        model: callable as model(src, trg) returning scores of shape
            [trg len, batch size, output dim]; switched to training mode here.
        iterator: sized iterable of batches exposing `.src` and `.trg`.
        optimizer: optimizer stepped once per batch.
        criterion: loss taking (scores [N, output dim], targets [N]).
        clip: maximum gradient norm applied before each optimizer step.

    Returns:
        Sum of the batch losses divided by len(iterator).
    """
    model.train()

    running_loss = 0

    for batch in iterator:
        source, target = batch.src, batch.trg

        optimizer.zero_grad()

        # scores: [trg len, batch size, output dim]
        scores = model(source, target)
        vocab_size = scores.shape[-1]

        # Drop position 0 from both tensors, then flatten so that every row is
        # one token: [(trg len - 1) * batch size, output dim] vs [(trg len - 1) * batch size]
        flat_scores = scores[1:].view(-1, vocab_size)
        flat_target = target[1:].view(-1)

        loss = criterion(flat_scores, flat_target)
        loss.backward()

        # Rescale gradients whose overall norm exceeds `clip`
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        running_loss += loss.item()

    return running_loss / len(iterator)

In [31]:
def evaluate(model, iterator, criterion):
    """Compute the mean per-batch loss over `iterator` without updating the model.

    Args:
        model: callable as model(src, trg, teacher_forcing_ratio) returning scores
            of shape [trg len, batch size, output dim]; switched to eval mode here.
        iterator: sized iterable of batches exposing `.src` and `.trg`.
        criterion: loss taking (scores [N, output dim], targets [N]).

    Returns:
        Sum of the batch losses divided by len(iterator).
    """
    model.eval()

    running_loss = 0

    # No gradients are needed for evaluation
    with torch.no_grad():
        for batch in iterator:
            source, target = batch.src, batch.trg

            # Third argument 0 turns teacher forcing off
            scores = model(source, target, 0)
            vocab_size = scores.shape[-1]

            # Drop position 0 from both tensors, then flatten to one row per token:
            # [(trg len - 1) * batch size, output dim] vs [(trg len - 1) * batch size]
            flat_scores = scores[1:].view(-1, vocab_size)
            flat_target = target[1:].view(-1)

            running_loss += criterion(flat_scores, flat_target).item()

    return running_loss / len(iterator)

In [32]:
def epoch_time(start_time, end_time):
    """Split the interval between two timestamps (seconds) into whole minutes and leftover whole seconds.

    Returns:
        (minutes, seconds) as a pair of ints, both truncated.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - 60 * whole_minutes)
    return whole_minutes, leftover_seconds

# Model Training

In [33]:
N_EPOCHS = 10
CLIP = 1  # maximum gradient norm handed to train()

# Lowest test loss seen so far; starts at +inf so the first epoch always checkpoints
best_test_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    test_loss = evaluate(model, test_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # Save the weights whenever the loss on test_iterator improves.
    # NOTE(review): the test split doubles as the validation set here, so any
    # test score later computed from this checkpoint is optimistically biased;
    # selecting the checkpoint on a separate validation split would avoid that.
    if test_loss < best_test_loss:
        best_test_loss = test_loss
        torch.save(model.state_dict(), 'qa1-model.pt')
    
    # Perplexity is reported as exp(loss)
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Test. Loss: {test_loss:.3f} |  Test. PPL: {math.exp(test_loss):7.3f}')

Epoch: 01 | Time: 0m 6s
	Train Loss: 5.203 | Train PPL: 181.762
	 Test. Loss: 3.512 |  Test. PPL:  33.522
Epoch: 02 | Time: 0m 5s
	Train Loss: 4.407 | Train PPL:  82.061
	 Test. Loss: 3.494 |  Test. PPL:  32.932
Epoch: 03 | Time: 0m 5s
	Train Loss: 4.311 | Train PPL:  74.526
	 Test. Loss: 3.479 |  Test. PPL:  32.411
Epoch: 04 | Time: 0m 5s
	Train Loss: 4.247 | Train PPL:  69.867
	 Test. Loss: 3.460 |  Test. PPL:  31.826
Epoch: 05 | Time: 0m 5s
	Train Loss: 4.195 | Train PPL:  66.372
	 Test. Loss: 3.428 |  Test. PPL:  30.814
Epoch: 06 | Time: 0m 5s
	Train Loss: 4.118 | Train PPL:  61.448
	 Test. Loss: 3.459 |  Test. PPL:  31.783
Epoch: 07 | Time: 0m 5s
	Train Loss: 4.025 | Train PPL:  55.956
	 Test. Loss: 3.301 |  Test. PPL:  27.128
Epoch: 08 | Time: 0m 5s
	Train Loss: 3.989 | Train PPL:  53.994
	 Test. Loss: 3.303 |  Test. PPL:  27.189
Epoch: 09 | Time: 0m 5s
	Train Loss: 3.902 | Train PPL:  49.488
	 Test. Loss: 3.339 |  Test. PPL:  28.201
Epoch: 10 | Time: 0m 6s
	Train Loss: 3.868 | T

# Model Evaluation

In [47]:
# Restore the weights saved at the epoch with the lowest test loss.
# map_location remaps the stored tensors onto the current `device`, so a
# checkpoint written on a GPU runtime can still be loaded on a CPU-only one.
model.load_state_dict(torch.load('qa1-model.pt', map_location=device))

test_loss = evaluate(model, test_iterator, criterion)

# Perplexity is reported as exp(loss)
print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

| Test Loss: 3.260 | Test PPL:  26.052 |
