# 2023 CITS4012 Assignment
*Make sure you change the file name with your student id.*

# Readme
*If there is something to be noted for the marker, please mention here.* 

*If you are planning to implement a program with Object Oriented Programming style, please check the bottom of the this ipynb file*

# 1.DataSet Processing
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title) 

In [81]:
# Necessary Library Imports
import pandas as pd
import numpy as np
import nltk
import re
import torch
import time
import torch.nn.functional as F

from torch import nn
from nltk.tag import pos_tag
from nltk import word_tokenize

nltk.download('punkt')

# Read in data files
test_df = pd.read_csv("WikiQA-test.tsv", sep="\t")
train_df = pd.read_csv("WikiQA-train.tsv", sep="\t")

# TODO: ask about Word Match Feature: check whether the word appears in the question by using decapitalisation or lemmatization - how can we extract that for the question inputs
# TODO: ask about NER tagging 3g and 4g as (3, g) and (4, g)
# TODO: solve if word not in GloVe model

# notes: 

# spacy word tokenizer can be used for the case of tagging 3g and 4g as (3, g) and (4, g), however it is far less
# efficient - the data wrangling function takes about 210 seconds to run instead of 5 seconds when using spacy tokenize

# phrases like 1570–1638 get converted to 15701638 after punctuation removal 

# it would be better to predict which sentence is the answer to the question since that is how the data ia stored - sentences are tagged as the answer
# our model removes the concept of sentences by removing full stops and representing the whole document as a continuous string of tokens
# an improvement would be to keep the sentences and extract which sentence contains the answer

# improvement could be to remove articles (a, an, the)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\New\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


**Dictionary for Contraction Removal**

In [2]:
contraction_dict = {"ID": "identify", "Im": "i am", "im": "i am", "Dont": "do not", "dont": "do not", "ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", 
                    "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", 
                    "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", 
                    "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", 
                    "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", 
                    "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", 
                    "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", 
                    "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", 
                    "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have",
                    "o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", 
                    "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", 
                    "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have",
                    "so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", 
                    "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", 
                    "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", 
                    "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have", 
                    "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", 
                    "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", 
                    "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", 
                    "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", 
                    "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",
                    "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have",
                    "diff'rent": "different", "3g": "3 g", "4g": "4 g"}


def replace_contractions(sentence):
    words = sentence.split()
    replaced_words = []
    for word in words:
        if word in contraction_dict:
            replaced_words.append(contraction_dict[word])
        else:
            replaced_words.append(word)
    replaced_sentence = " ".join(replaced_words).lower()
    return replaced_sentence


def clean(string):
    '''
    Removes puncutation and replaces contractions from a string

    string: string to clean

    return: cleaned string
    '''
    clean_string = re.sub(r'[^\w\s\']', '', string)
    clean_string = replace_contractions(clean_string)
    clean_string = re.sub(r'[^\w\s]', '', string)
    return clean_string

**Vocab Handling**

In [3]:
def create_vocab_list(dfs:list):
    '''
    Creates index2word and word2index dictionaries for encoding from inputted dataframes. Vocab list 
    contains words from dataframe sentences and questions.
    
    dfs: list of dataframes from dataset.

    returns: word2index and index2word dictionaries
    '''
    
    words = set(['[PAD]', '[OOV]'])

    for df in dfs:
        unique_sentences = list(df['Sentence'].unique())
        for sentence in unique_sentences:
            tokens = word_tokenize(clean(sentence.lower()))
            for token in tokens:
                words.add(token)

        unique_questions = list(df['Question'].unique())
        for question in unique_questions:
            tokens = word_tokenize(clean(question.lower()))
            for token in tokens:
                words.add(token)

    word2index = {w: i+2 for i, w in enumerate(words)}
    word2index['[PAD]'] = 1  
    word2index['[OOV]'] = 0  

    index2word = {index:word for word,index in word2index.items()}

    return word2index, index2word

word2index, index2word = create_vocab_list([train_df, test_df])

**Input Encoding**

In [4]:
def encode(tokens, word2index):
    '''
    Converts tokens into respective idsby mapping ach words using word2index. 

    tokens: list of tokens
    word2index: word to index mapping

    returns: list of mapped indexes
    '''
    
    ids = [word2index[word] for word in tokens]

    return ids

**Data Wrangling Function**

In [5]:
# Read in data files
test_data = pd.read_csv("WikiQA-test.tsv", sep="\t")
train_data = pd.read_csv("WikiQA-train.tsv", sep="\t")

def wrangle_data(df):
    """Handles data and splits dataframe into question, document and answer set

    df: dataframe to split

    return: question, document, and answer dataframe
    """

    # Initialise variables containing all Questions, Documents and Answers
    questions = []
    documents = []
    answers = {}

    # Get unique questions from the dataframe
    unique_questions = df['Question'].drop_duplicates()

    # Iterate through questions dataframe to form the Questions data
    for row in unique_questions:
        
        # Remove punctuation (except apostrophes for contraction removal)
        cleaned_row = re.sub(r'[^\w\s\']','', row)

        # Replace contracted words
        cleaned_row = replace_contractions(cleaned_row)

        # Tokenise the question
        question_tokens = word_tokenize(cleaned_row.lower())

        # Append the question to the list of questions
        questions.append(question_tokens)

    # Iterate through dataframe and form Documents and Answer data
    for index, row in df.iterrows():

        # Initalise current document pairs list and previous doc number (only for first iteration)
        if index == 0:
            previous_doc_num = row['DocumentID']
            current_doc_pairs = []

        # If the current doc number is different to the previous doc number, then a new  
        # document has been found so append the current document pairs to the list of documents     
        # and reinitilise the current document pairs list
        current_doc_num = row['DocumentID']
        if current_doc_num != previous_doc_num:
            documents.append(current_doc_pairs)
            current_doc_pairs = []
        
        # Replace contracted words
        cleaned_row = replace_contractions(row['Sentence'])

        # Remove punctuation
        cleaned_row = re.sub(r'[^\w\s]','', cleaned_row)

        # Tokenise the current row's sentence
        sent_tokens = word_tokenize(cleaned_row.lower())

        # Assign token types according to 'Label' field (1 means the corresponding sentence
        # is the answer/one of the answers)
        # Token types:
        #   0: token a part of the answer
        #   1: start token of the answer
        #   2: inner token of the answer
        #   3: end token of the answer
        if row['Label']:
            # Answer to question is found
            token_types = np.full(shape=len(sent_tokens), fill_value=2)
            token_types[0] = 1
            token_types[-1] = 3

            # Add the tokenised answer to the answers dictionary. If an answer has already been found.
            # add the tokens to the current answer.
            if row['QuestionID'] in answers:
                answers[row['QuestionID']] += sent_tokens 
            else:
                answers[row['QuestionID']] = sent_tokens
        else:
            # Answer not found 
            token_types = np.zeros(len(sent_tokens), dtype=int)

        # Combine token and corresponding types and append to current document list
        token_type_pairs = list(zip(sent_tokens, token_types))
        current_doc_pairs.append(token_type_pairs)

        # Update previous document number 
        previous_doc_num = current_doc_num

        # If the iterated row is the final row in the dataframe, append the current document
        # to the documents array
        if index == len(df)-1:
            documents.append(current_doc_pairs)

    return questions, documents, answers

train_questions, train_documents, train_answers = wrangle_data(train_data)
test_questions, test_documents, test_answers = wrangle_data(test_data)


In [6]:
def wrangle_data(df):
    """
    Handles data wrangling and returns a dataframe giving each question the corresponding document and
    answer span set. Encodes the words into their index of the vocab list as well.

    df: dataframe to wrangle

    return: dataframe containing questions with the corresponding document and answer span set (encoded).
    """

    # Initialise variable containing Answer Spans (questions can have 0, 1, or multiple answers)
    answer_spans = [[0,0]]

    # Intialise df object that will be returned after wrangling
    wrangled_df = pd.DataFrame(columns=['question_id', 'question', 'document', 'answer span'])


    # Iterate through dataframe and form Documents and Answer Span data
    for index, row in df.iterrows():
        # print(index)

        # Initalise current document tokens list, length and previous doc number (only for first iteration)
        if index == 0:
            previous_doc_num = row['DocumentID']
            current_doc_length = 0
            current_doc_tokens = []


        # If the current doc number is different to the previous doc number, then a new  
        # document has been found 
        current_doc_num = row['DocumentID']
        if current_doc_num != previous_doc_num:

            # Remove punctuation from question and replace contractions
            cleaned_qn = clean(previous_qn)

            # Tokenize the question 
            qn_tokens = word_tokenize(cleaned_qn.lower())
                
            for span in answer_spans:
                # Create a new row with multiple values
                # print(current_doc_tokens)
                new_row = {'question_id': previous_qn_id, 
                           'question': [encode(qn_tokens, word2index)],
                           'document':[encode(current_doc_tokens, word2index)],
                           'answer span': [span]}
                
                # Convert the new row to a DataFrame
                new_row_df = pd.DataFrame(new_row)

                # Append the new row DataFrame to the original DataFrame
                wrangled_df = pd.concat([wrangled_df, new_row_df], ignore_index=True)

            # Reinitialise document tokens list, current document length, and answer spans
            current_doc_tokens = []
            current_doc_length = 0
            answer_spans = [[0,0]]


        # Remove punctuation from document sentence and replace contracted words from document sentence 
        cleaned_sent = clean(row['Sentence'])

        # Tokenise the current row's sentence and append to the document tokens list
        sent_tokens = word_tokenize(cleaned_sent.lower())
        current_doc_tokens.extend(sent_tokens)


        if row['Label']:
            # Answer to question is found

            # Find the span of the answer
            span_start = current_doc_length
            span_end = span_start + len(sent_tokens)

            # Add answer span to current document's answer spans
            if answer_spans[0] == [0,0]:
                answer_spans[0] = [span_start, span_end]
            else:
                answer_spans.append([span_start, span_end])
            
            # Increase length of current document
            current_doc_length += len(sent_tokens)

        else:
            # Answer not found 

            # Increase length of current document
            current_doc_length += len(sent_tokens)

        # Update previous document number and question
        previous_doc_num = current_doc_num
        previous_qn_id = row['QuestionID']
        previous_qn = row['Question']

        # If the iterated row is the final row in the dataframe, add the final question and 
        # document to the wrangled df
        if index == len(df)-1:

            # Remove punctuation from question and replaces contractions
            cleaned_qn = clean(previous_qn)

            # Tokenize the question 
            qn_tokens = word_tokenize(cleaned_qn.lower())

            for span in answer_spans:
                # Create a new row with multiple values
                new_row = {'question_id': previous_qn_id,
                           'question': [encode(qn_tokens, word2index)],
                           'document': [encode(current_doc_tokens, word2index)],
                           'answer span': [span]}

                # Convert the new row to a DataFrame
                new_row_df = pd.DataFrame(new_row)

                # Append the new row DataFrame to the original DataFrame
                wrangled_df = pd.concat([wrangled_df, new_row_df], ignore_index=True)

    return wrangled_df

wrangled_train_data = wrangle_data(train_df)
wrangled_test_data = wrangle_data(test_df)

In [7]:
wrangled_train_data

Unnamed: 0,question_id,question,document,answer span
0,Q1,"[19590, 3319, 9670, 2790, 35440]","[31392, 24850, 18324, 9670, 25305, 5351, 26229...","[24, 37]"
1,Q2,"[19590, 3319, 29900, 28422, 262, 29900, 12456,...","[14805, 20959, 28991, 6210, 28748, 31392, 2337...","[0, 0]"
2,Q5,"[19590, 6601, 32946, 6612, 8379]","[32946, 6612, 28748, 31392, 31397, 34404, 2012...","[0, 0]"
3,Q6,"[19590, 3450, 28748, 29900, 19847, 26605, 2854...","[14805, 29900, 30864, 16320, 29900, 16032, 262...","[0, 0]"
4,Q7,"[19590, 31392, 27763, 31216, 39100, 28674, 195...","[29900, 27763, 24076, 7518, 28748, 31392, 3710...","[0, 0]"
...,...,...,...,...
2279,Q3042,"[24617, 17311, 32583, 11304, 15858]","[32583, 5313, 32603, 32583, 11304, 5313, 28748...","[76, 94]"
2280,Q3043,"[34154, 28748, 18284, 25929, 2739]","[18284, 1782, 2739, 14805, 29900, 2403, 7394, ...","[7, 42]"
2281,Q3043,"[34154, 28748, 18284, 25929, 2739]","[18284, 1782, 2739, 14805, 29900, 2403, 7394, ...","[42, 70]"
2282,Q3044,"[34154, 28748, 29900, 37041, 6935, 262, 16356]","[20626, 18291, 6108, 6935, 38051, 17510, 16848...","[0, 0]"


**Data Loader**

In [117]:
import spacy
nlp = spacy.load("en_core_web_sm")

class DataLoader:
    '''
    -Divides the dataframe in batches.
    -Pads the contexts and questions dynamically for each batch by padding 
     the examples to the maximum-length sequence in that batch.
    -Calculates masks for context and question.
    -Calculates spans for contexts.
    '''
    
    def __init__(self, data, batch_size):
        
        self.batch_size = batch_size
        data = [data[i:i+self.batch_size] for i in range(0, len(data), self.batch_size)]
        self.data = data
    
    def get_span(self, text):
        
        text = nlp(text, disable=['parser','tagger','ner'])
        span = [(w.idx, w.idx+len(w.text)) for w in text]

        return span

    def __len__(self):
        return len(self.data)
    
    def __iter__(self):
        '''
        Creates batches of data and yields them.
        
        Each yield comprises of:
        :padded_context: padded tensor of contexts for each batch 
        :padded_question: padded tensor of questions for each batch 
        :context_mask & question_mask: zero-mask for question and context
        :answer_span: start and end index wrt context_ids
        :context_text,answer_text: used while validation to calculate metrics
        :context_spans: spans of context text
        :ids: question_ids used in evaluation
        '''
        
        for batch in self.data:

            # print(batch)
                            
            # spans = []
            context_text = []
            # answer_text = []
            
            max_context_len = max([len(document) for document in batch['document']])
            padded_context = torch.LongTensor(len(batch), max_context_len).fill_(1)
            # print(max_context_len)
            
            for ctx in batch['document']:
                context_text.append(ctx)
                # spans.append(self.get_span(ctx))
            
            # for ans in batch.answer:
            #     answer_text.append(ans)
                
            for i, ctx in enumerate(batch['document']):
                # print(i, len(ctx), ctx)
                padded_context[i, :len(ctx)] = torch.LongTensor(ctx)

            # print(padded_context)
            
            max_question_len = max([len(ques) for ques in batch['question']])
            padded_question = torch.LongTensor(len(batch), max_question_len).fill_(1)
            
            for i, ques in enumerate(batch['question']):
                padded_question[i,: len(ques)] = torch.LongTensor(ques)
                
            
            answer_span = torch.LongTensor(list(batch['answer span']))
            context_mask = torch.eq(padded_context, 1)
            question_mask = torch.eq(padded_question, 1)
            
            qn_ids = list(batch['question_id'])
             
            yield (padded_context, padded_question, context_mask, 
                   question_mask, answer_span, context_text, qn_ids)
            
            # yield (padded_context, padded_question, context_mask, 
            #        question_mask, label, context_text, answer_text, ids)

In [124]:
train_loader = DataLoader(wrangled_train_data, 32)
test_loader = DataLoader(wrangled_test_data, 32)

In [10]:
a = next(iter(test_loader))

In [123]:
for batch in test_loader:
    print(batch[4][0])

tensor([110, 150])
tensor([25, 45])
tensor([0, 0])
tensor([0, 0])
tensor([0, 0])
tensor([124, 148])
tensor([239, 276])
tensor([0, 0])
tensor([0, 0])
tensor([0, 0])
tensor([ 0, 23])
tensor([0, 0])
tensor([0, 0])
tensor([0, 0])
tensor([16, 46])
tensor([0, 0])
tensor([ 0, 21])
tensor([ 0, 29])
tensor([0, 0])
tensor([ 0, 35])
tensor([0, 0])
tensor([ 9, 43])


In [12]:
print(a[4][0])

tensor([110, 150])


In [13]:
import spacy
nlp = spacy.load("en_core_web_sm")

total = 0
for qn in train_questions:
    total += len(qn)
print(total)
train_questions

docs = []
total_2 = 0

for sentence in test_questions:
    text = " ".join(sentence)
    doc = nlp(text)
    docs.append(doc)

for i in range(len(test_questions)):
    # for x in doc:
    #     total_2 += 1
    #     print(total_2, x)
    if len(test_questions[i]) != len(docs[i]):
        print(test_questions[i], len(test_questions[i]))
        print([x for x in docs[i]], len(docs[i]))
    # total_2 += len(train_questions[i])
    # print(total_2)

14725


In [14]:
# for debugging - find all labels equal to 1
train_data.loc[train_data['Label'] == 1]

Unnamed: 0,QuestionID,Question,DocumentID,DocumentTitle,SentenceID,Sentence,Label
3,Q1,how are glacier caves formed?,D1,Glacier cave,D1-3,A glacier cave is a cave formed within the ice...,1
75,Q16,how much is 1 tablespoon of water,D16,Tablespoon,D16-0,This tablespoon has a capacity of about 15 mL.,1
83,Q16,how much is 1 tablespoon of water,D16,Tablespoon,D16-8,In the USA one tablespoon (measurement unit) i...,1
84,Q16,how much is 1 tablespoon of water,D16,Tablespoon,D16-9,In Australia one tablespoon (measurement unit)...,1
98,Q17,how much are the harry potter movies worth,D17,Harry Potter,D17-13,The series also originated much tie-in merchan...,1
...,...,...,...,...,...,...,...
20292,Q3037,What is an economic feature?,D2802,Economics,D2802-9,"At the turn of the 21st century, the expanding...",1
20307,Q3039,what is the average american income,D1876,Household income in the United States,D1876-6,"U.S. median household income fell from $51,144...",1
20325,Q3042,When was Apple Computer founded,D2806,Apple Inc.,D2806-3,"The company was founded on April 1, 1976, and ...",1
20335,Q3043,what is section eight housing,D2807,Section 8 (housing),D2807-1,"Section 8 of the Housing Act of 1937 (), often...",1


# 2.QA Model Implementation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title) 

**TFIDF (Term Frequency Inverse Document Frequency)**

In [15]:
def calculate_df(tokenized_data):
    """ Calculate term frequencies for a given list of list of tokens

    tokenized_data: tokenized dataset

    return: dictionary of frequencies for each term in the dataset
    """
    DF = {}

    for token_vector in tokenized_data:
        # get each unique word in the doc and count the number of occurences in the document
        for term in np.unique(token_vector):
            try:
                DF[term] += 1
            except:
                DF[term] = 1

    # print scores in descending order
    sorted_dict = sorted(DF.items(), key = lambda x : x[1], reverse = True)
    #print(sorted_dict)
    
    return DF

In [16]:
# calculate TFIDF scores

from collections import Counter
import math

def calculate_tf_idf(tokenized_data):

    tf_idf = {}

    # total number of documents
    n = len(tokenized_data)

    # calculate Document Frequencies
    DF = calculate_df(tokenized_data)

    doc_id = 0
    # get each token vector
    for token_vector in tokenized_data:
        # initialise counter for the vector
        counter = Counter(token_vector)
        #calculate total number of words in the doc
        total_num_words = len(token_vector)

        # get each unique word in the doc
        for term in np.unique(token_vector):

            # calculate Term Frequency
            tf = counter[term]/total_num_words

            # calculate Document Frequency
            df = DF[term]

            # calculate Inverse Document Frequency
            idf = math.log(n/(df+1))+1

            # calcaulte TF-IDF
            tf_idf[doc_id, term] = tf*idf

        doc_id += 1

    return tf_idf

tf_idfs = calculate_tf_idf(train_questions)
tf_idfs


{(0, 'are'): 0.643935192442334,
 (0, 'caves'): 1.5929216181149841,
 (0, 'formed'): 1.315662745891006,
 (0, 'glacier'): 1.5929216181149841,
 (0, 'how'): 0.531268078634743,
 (1, 'a'): 0.21464506414744466,
 (1, 'and'): 0.29300502627290326,
 (1, 'are'): 0.21464506414744466,
 (1, 'circular'): 0.5309738727049946,
 (1, 'directions'): 0.5309738727049946,
 (1, 'force'): 0.4474563414719701,
 (1, 'how'): 0.17708935954491434,
 (1, 'in'): 0.17529845006027778,
 (1, 'motion'): 0.484764060667665,
 (1, 'of'): 0.18878367758362766,
 (1, 'related'): 0.484764060667665,
 (1, 'the'): 0.2693996319800544,
 (1, 'vectors'): 0.5309738727049946,
 (1, 'velocity'): 0.5309738727049946,
 (2, 'apollo'): 1.5929216181149841,
 (2, 'creed'): 1.5929216181149841,
 (2, 'did'): 0.6985938594422703,
 (2, 'die'): 1.11334256355531,
 (2, 'how'): 0.531268078634743,
 (3, 'federal'): 0.838980640259944,
 (3, 'for'): 0.484828972252917,
 (3, 'how'): 0.3320425491467144,
 (3, 'is'): 0.2446280201983818,
 (3, 'judges'): 0.995576011321865,
 (

**POS Tagging Dictionary**

In [17]:
from nltk.tag import pos_tag_sents
nltk.download('averaged_perceptron_tagger')


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\New\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [18]:

pos2index = {}
pos2index['NN'] = 0
index = 1

# Iterate through words in corpus and create POS tag lookup table
for word in word2index.keys():

    pos = pos_tag([word])[0][-1]

    if pos not in pos2index:
        pos2index[pos] = index
        index += 1


In [19]:
pos2index

{'NN': 0,
 'NNS': 1,
 'RB': 2,
 'JJ': 3,
 'VBN': 4,
 'CD': 5,
 'IN': 6,
 'VBG': 7,
 'VB': 8,
 'VBD': 9,
 'PRP': 10,
 'LS': 11,
 'JJS': 12,
 'DT': 13,
 'CC': 14,
 'VBP': 15,
 'VBZ': 16,
 'FW': 17,
 'TO': 18,
 'WP': 19,
 'RBR': 20,
 'MD': 21,
 'WRB': 22,
 'JJR': 23,
 'PRP$': 24,
 'WDT': 25,
 'WP$': 26}

In [20]:
tensor = torch.tensor([0,0])
print(tensor.unsqueeze(-1))

tensor([[0],
        [0]])


In [21]:
total = 0
for sent in pos_tag_sents(train_questions):
    total += len(sent)

print(total)
pos_tag_sents(train_questions)

14725


[[('how', 'WRB'),
  ('are', 'VBP'),
  ('glacier', 'JJ'),
  ('caves', 'NNS'),
  ('formed', 'VBD')],
 [('how', 'WRB'),
  ('are', 'VBP'),
  ('the', 'DT'),
  ('directions', 'NNS'),
  ('of', 'IN'),
  ('the', 'DT'),
  ('velocity', 'NN'),
  ('and', 'CC'),
  ('force', 'NN'),
  ('vectors', 'NNS'),
  ('related', 'VBN'),
  ('in', 'IN'),
  ('a', 'DT'),
  ('circular', 'JJ'),
  ('motion', 'NN')],
 [('how', 'WRB'),
  ('did', 'VBD'),
  ('apollo', 'VB'),
  ('creed', 'NN'),
  ('die', 'NN')],
 [('how', 'WRB'),
  ('long', 'JJ'),
  ('is', 'VBZ'),
  ('the', 'DT'),
  ('term', 'NN'),
  ('for', 'IN'),
  ('federal', 'JJ'),
  ('judges', 'NNS')],
 [('how', 'WRB'),
  ('a', 'DT'),
  ('beretta', 'NN'),
  ('model', 'NN'),
  ('21', 'CD'),
  ('pistols', 'NNS'),
  ('magazines', 'NNS'),
  ('works', 'NNS')],
 [('how', 'WRB'), ('a', 'DT'), ('vul', 'NN'), ('works', 'NN')],
 [('how', 'WRB'),
  ('an', 'DT'),
  ('outdoor', 'JJ'),
  ('wood', 'NN'),
  ('boiler', 'NN'),
  ('works', 'NNS')],
 [('how', 'WRB'),
  ('big', 'JJ'),
  ('

In [22]:
from nltk.tag import pos_tag
pos_tag([''])[0]

('', 'NN')

**Named Entity Recognition**

In [23]:
# ! python -m spacy validate
import spacy
nlp = spacy.load("en_core_web_sm")

# article = nlp(list(train_data['Question']))
# print(article)

In [24]:
nlp('conceptual')[0].ent_type_

''

In [25]:
ner2index = {}
index = 0

# Iterate through words in corpus and create NER tag lookup table
for word in word2index.keys():

    ner = nlp(word)[0].ent_type_

    if ner not in ner2index:
        ner2index[ner] = index
        index += 1

In [26]:
ner2index

{'': 0,
 'ORG': 1,
 'PERSON': 2,
 'TIME': 3,
 'CARDINAL': 4,
 'DATE': 5,
 'GPE': 6,
 'NORP': 7,
 'LANGUAGE': 8,
 'FAC': 9,
 'ORDINAL': 10,
 'LAW': 11,
 'PRODUCT': 12,
 'LOC': 13,
 'QUANTITY': 14,
 'EVENT': 15,
 'WORK_OF_ART': 16}

In [27]:
def ner_tag_sents(data):

    docs = []
    total = 0
    total_2 = 0

    for sentence in data:
        text = " ".join(sentence)
        doc = nlp(text)
        docs.append(doc)

    # for doc in docs:
    #     for entity in doc.ents:
    #         print(entity.text, entity.label_)

    # print(docs)

    # for doc in docs:
    #     print([(x, x.ent_iob_, x.ent_type_) for x in doc])
    #     total += 1
    #     print(total)
    #     total_2 += len(doc)
    #     print(total_2)


ner_tag_sents(train_questions)
# print(len(ner_tags))
# for sent in ner_tags:
#     print(sent)
#     total +=1
#     print(total)

**GloVe/Word2VeC word embedding**

In [28]:
import gensim.downloader as api

# model = api.load("word2vec-google-news-300")
embed_model = api.load("glove-wiki-gigaword-50")

In [29]:
def create_word_embedding(model):
    '''
    Creates a weight matrix of the words that are common in the GloVe vocab and
    the dataset's vocab. Initializes OOV words with a zero vector.
    '''
    weights_matrix = np.zeros((len(word2index)+2, 50))
    words_found = 0
    for i, word in enumerate(word2index.keys()):
        print(i+2, word)
        try:
            weights_matrix[i+2] = model[word]
            words_found += 1
        except:
            pass
        
    return weights_matrix, words_found

In [35]:
def create_input_embedding(embed_model, pos=False, ner=False):
    '''
    Creates a weight matrix of the words that are common in the GloVe vocab and
    the dataset's vocab. Initializes OOV words with a zero vector.
    Also includes embeddings for NER and POS tags.
    '''
    word_dim = embed_model.vector_size
    print(pos, ner)

    if ner:
        ner_dim = len(ner2index)
    else:
        ner_dim = 0

    if pos:
        pos_dim = len(pos2index)
    else:
        pos_dim = 0

    weights_matrix = np.zeros((len(word2index) + 2, word_dim + ner_dim + pos_dim))

    words_found = 0
    for i, word in enumerate(word2index.keys()):
        
        # print(model[word])
        # print(nlp(word)[0].ent_type_)
        # print(pos_tag([word])[0][-1])
        # try:

        if word in embed_model.key_to_index:
            input_embedding = embed_model[word]
            
            if ner:
                ner_tag_embedding = np.zeros(ner_dim)
                if nlp(word)[0].ent_type_ in ner2index:
                    ner_tag_embedding[ner2index[nlp(word)[0].ent_type_]] = 1.0
                # input_embedding = np.concatenate(input_embedding, ner_tag_embedding)
                input_embedding = np.hstack((input_embedding, ner_tag_embedding))

            if pos:
                pos_tag_embedding = np.zeros(pos_dim)
                if pos_tag([word])[0][-1] in pos2index:
                    pos_tag_embedding[pos2index[pos_tag([word])[0][-1]]] = 1.0
                input_embedding = np.hstack((input_embedding, pos_tag_embedding))

            # combined_embedding = np.concatenate((word_embedding, ner_tag_embedding, pos_tag_embedding))
            # print(combined_embedding)
            weights_matrix[i + 2] = input_embedding
            words_found += 1
        # except:
        #     pass

    return weights_matrix, words_found

In [31]:
embed_model['alkalitreated']

KeyError: "Key 'alkalitreated' not present"

In [32]:
embed_model.key_to_index

{'the': 0,
 ',': 1,
 '.': 2,
 'of': 3,
 'to': 4,
 'and': 5,
 'in': 6,
 'a': 7,
 '"': 8,
 "'s": 9,
 'for': 10,
 '-': 11,
 'that': 12,
 'on': 13,
 'is': 14,
 'was': 15,
 'said': 16,
 'with': 17,
 'he': 18,
 'as': 19,
 'it': 20,
 'by': 21,
 'at': 22,
 '(': 23,
 ')': 24,
 'from': 25,
 'his': 26,
 "''": 27,
 '``': 28,
 'an': 29,
 'be': 30,
 'has': 31,
 'are': 32,
 'have': 33,
 'but': 34,
 'were': 35,
 'not': 36,
 'this': 37,
 'who': 38,
 'they': 39,
 'had': 40,
 'i': 41,
 'which': 42,
 'will': 43,
 'their': 44,
 ':': 45,
 'or': 46,
 'its': 47,
 'one': 48,
 'after': 49,
 'new': 50,
 'been': 51,
 'also': 52,
 'we': 53,
 'would': 54,
 'two': 55,
 'more': 56,
 "'": 57,
 'first': 58,
 'about': 59,
 'up': 60,
 'when': 61,
 'year': 62,
 'there': 63,
 'all': 64,
 '--': 65,
 'out': 66,
 'she': 67,
 'other': 68,
 'people': 69,
 "n't": 70,
 'her': 71,
 'percent': 72,
 'than': 73,
 'over': 74,
 'into': 75,
 'last': 76,
 'some': 77,
 'government': 78,
 'time': 79,
 '$': 80,
 'you': 81,
 'years': 82,
 'i

In [33]:
def create_input_embedding(embed_model, features:list):
    '''
    Creates a weight matrix of the words that are common in the GloVe vocab and
    the dataset's vocab. Initializes OOV words with a zero vector.
    '''
    weights_matrix = []
    words_found = 0

    input_embedding = []

    input_embedding.append(np.zeros(50))

    if 'pos' in features:
        input_embedding.append('NN')

    if 'ner' in features:
        input_embedding.append('')

    weights_matrix.append(input_embedding)

    for i, word in enumerate(word2index.keys()):

        input_embedding = []

        print(i+2, word)
        if word in embed_model.key_to_index:
            input_embedding.append(embed_model[word])
            words_found += 1
        else:
            input_embedding.append(np.zeros(50))

        if 'pos' in features:
            input_embedding.append(pos_tag([word])[0][-1])

        if 'ner' in features:
            input_embedding.append(nlp(word)[0].ent_type_)

        # print(len(inp))
        weights_matrix.append(input_embedding)
        
    return weights_matrix, words_found

In [36]:
weights_matrix, words_found = create_input_embedding(embed_model, pos = True, ner = False)
# weights_matrix, words_found = create_input_embedding(embed_model, ['pos', 'ner'])
# weights_matrix, words_found = create_word_embedding(embed_model)

True False


In [37]:
words_found

33204

In [38]:
word2index['the']

29900

In [39]:
if 'yes' in embed_model.key_to_index:
    print('yes')

yes


In [40]:
embed_model['the']

array([ 4.1800e-01,  2.4968e-01, -4.1242e-01,  1.2170e-01,  3.4527e-01,
       -4.4457e-02, -4.9688e-01, -1.7862e-01, -6.6023e-04, -6.5660e-01,
        2.7843e-01, -1.4767e-01, -5.5677e-01,  1.4658e-01, -9.5095e-03,
        1.1658e-02,  1.0204e-01, -1.2792e-01, -8.4430e-01, -1.2181e-01,
       -1.6801e-02, -3.3279e-01, -1.5520e-01, -2.3131e-01, -1.9181e-01,
       -1.8823e+00, -7.6746e-01,  9.9051e-02, -4.2125e-01, -1.9526e-01,
        4.0071e+00, -1.8594e-01, -5.2287e-01, -3.1681e-01,  5.9213e-04,
        7.4449e-03,  1.7778e-01, -1.5897e-01,  1.2041e-02, -5.4223e-02,
       -2.9871e-01, -1.5749e-01, -3.4758e-01, -4.5637e-02, -4.4251e-01,
        1.8785e-01,  2.7849e-03, -1.8411e-01, -1.1514e-01, -7.8581e-01],
      dtype=float32)

In [41]:
print(weights_matrix.shape)

(39552, 77)


In [42]:
print(weights_matrix[31499])

[-0.72614998  0.10151    -0.19085    -0.79188001  0.20247     0.45497999
  0.052936   -1.63709998  0.51247001 -1.38759995 -0.60711998  1.26730001
 -0.198      -1.07550001 -0.37321001  0.55396003  0.047274    0.76911002
  0.34404001 -0.51498002  0.50577003  0.68787998  0.058905   -1.21309996
 -0.04167     0.30160999  0.34602001  0.72343999 -0.55550998 -0.21025001
 -1.07219994  0.2705      0.84670001  0.25354999 -0.36588001  0.08543
 -0.51634997 -0.52293998  1.24510002 -0.90722001 -0.054584   -0.18237001
  0.15669    -0.02888     0.42333999  0.32572001  0.20883    -0.92233002
 -0.57977003  0.067128    0.          1.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.        ]


In [43]:
np.save('weights.matrix', weights_matrix)

In [44]:
len(word2index)

39550

**Concatenate Word Embeddings and Features**

In [45]:
def concat_inputs(data):

    embeddings = []

    for index, sentence in enumerate(data):
        sent_embedding = []

        # Iterate through tokens
        for token in sentence:
            token_embedding = []

            # If the word is in the word embedding model's vocabulary then append its embedding,
            # otherwise an OOV vector of [0.0] * embedding dimension is appended (for simplicity)
            if token in embed_model.key_to_index:
                token_embedding.append([embed_model[token]])
            else:
                token_embedding.append(np.zeros(50))

            # Look up and append tf_idf score to the embedding using the question number and word as the key 
            token_embedding.append(tf_idfs[(index, token)])

            # Append the pos tag to the input embedding
            token_embedding.append(pos_tag([token])[0][-1])

            # Append the token embedding to the sentence embedding
            sent_embedding.append(token_embedding)

        # Append the sentence embedding to the list of all embeddings
        embeddings.append(sent_embedding)
    
    return np.array(embeddings, dtype=object)

In [46]:
train_qn_inputs = concat_inputs(train_questions)
print(train_qn_inputs[0][0])

[[array([ 6.8938e-01, -1.0644e-01,  1.7083e-01, -3.7583e-01,  7.5170e-01,
        7.8149e-04, -5.3102e-01, -1.9903e-01, -1.4419e-01,  1.2748e-01,
       -2.8038e-01,  7.0723e-01, -5.4100e-01,  1.9625e-01,  9.6635e-01,
        6.0519e-01,  4.0918e-01, -3.1612e-02,  5.3900e-01, -8.7086e-01,
       -2.0912e-01,  5.6853e-01,  6.5983e-01,  1.4583e-01,  1.0112e+00,
       -2.0736e+00, -1.1242e+00,  5.9662e-04,  7.0332e-01, -8.2608e-01,
        3.4445e+00,  3.2984e-01, -3.5324e-01, -1.0335e+00, -1.4753e-01,
       -1.4874e-01, -4.1246e-01,  3.3489e-01,  1.9841e-01, -2.5478e-01,
       -4.7193e-01,  6.6701e-02,  3.2777e-01,  6.8781e-01,  3.6428e-01,
        2.1522e-01,  1.6494e-01,  4.1761e-01, -2.2504e-01,  6.1412e-01],
      dtype=float32)], 0.531268078634743, 'WRB']


In [None]:
train_doc_inputs = concat_inputs(train_documents)
print(train_doc_inputs[0])

### 2.1 - Layers

**Feature Embedding layer**

In [101]:
class AlignQuestionEmbedding(nn.Module):
    
    def __init__(self, input_dim):        
        
        super().__init__()
        
        self.linear = nn.Linear(input_dim, input_dim)
        
        self.relu = nn.ReLU()
        
    def forward(self, context, question, question_mask):
        
        # context = [bs, ctx_len, emb_dim]
        # question = [bs, qtn_len, emb_dim]
        # question_mask = [bs, qtn_len]
    
        ctx_ = self.linear(context)
        ctx_ = self.relu(ctx_)
        # ctx_ = [bs, ctx_len, emb_dim]
        
        qtn_ = self.linear(question)
        qtn_ = self.relu(qtn_)
        # qtn_ = [bs, qtn_len, emb_dim]
        
        qtn_transpose = qtn_.permute(0,2,1)
        # qtn_transpose = [bs, emb_dim, qtn_len]
        
        align_scores = torch.bmm(ctx_, qtn_transpose)
        # align_scores = [bs, ctx_len, qtn_len]
        
        qtn_mask = question_mask.unsqueeze(1).expand(align_scores.size())
        # qtn_mask = [bs, 1, qtn_len] => [bs, ctx_len, qtn_len]
        
        # Fills elements of self tensor(align_scores) with value(-float(inf)) where mask is True. 
        # The shape of mask must be broadcastable with the shape of the underlying tensor.
        align_scores = align_scores.masked_fill(qtn_mask == 1, -float('inf'))
        # align_scores = [bs, ctx_len, qtn_len]
        
        align_scores_flat = align_scores.view(-1, question.size(1))
        # align_scores = [bs*ctx_len, qtn_len]
        
        alpha = F.softmax(align_scores_flat, dim=1)
        alpha = alpha.view(-1, context.shape[1], question.shape[1])
        # alpha = [bs, ctx_len, qtn_len]
        
        align_embedding = torch.bmm(alpha, question)
        # align = [bs, ctx_len, emb_dim]
        
        return align_embedding

**Bi-LSTM Layer**

In [102]:
class StackedBiLSTM(nn.Module):
    
    def __init__(self, input_dim, hidden_dim, num_layers, dropout):
        
        super().__init__()
        
        self.dropout = dropout
        
        self.num_layers = num_layers
        
        self.lstms = nn.ModuleList()
        
        for i in range(self.num_layers):
            
            if i == 0:
                input_dim = input_dim 
            else:
                input_dim = hidden_dim * 2
            
            self.lstms.append(nn.LSTM(input_dim, hidden_dim,
                                      batch_first=True, bidirectional=True))
           
    
    def forward(self, x):
        # x = [bs, seq_len, feature_dim]

        outputs = [x]
        for i in range(self.num_layers):

            lstm_input = outputs[-1]
            lstm_out = F.dropout(lstm_input, p=self.dropout)
            lstm_out, (hidden, cell) = self.lstms[i](lstm_input)
           
            outputs.append(lstm_out)

    
        output = torch.cat(outputs[1:], dim=2)
        # [bs, seq_len, num_layers*num_dir*hidden_dim]
        
        output = F.dropout(output, p=self.dropout)
      
        return output

**Linear Attention Layer**

In [103]:
class LinearAttentionLayer(nn.Module):
    
    def __init__(self, input_dim):
        
        super().__init__()
        
        self.linear = nn.Linear(input_dim, 1)
        
    def forward(self, question, question_mask):
        
        # question = [bs, qtn_len, input_dim] = [bs, qtn_len, bi_lstm_hid_dim]
        # question_mask = [bs,  qtn_len]
        
        qtn = question.view(-1, question.shape[-1])
        # qtn = [bs*qtn_len, hid_dim]
        
        attn_scores = self.linear(qtn)
        # attn_scores = [bs*qtn_len, 1]
        
        attn_scores = attn_scores.view(question.shape[0], question.shape[1])
        # attn_scores = [bs, qtn_len]
        
        attn_scores = attn_scores.masked_fill(question_mask == 1, -float('inf'))
        
        alpha = F.softmax(attn_scores, dim=1)
        # alpha = [bs, qtn_len]
        
        return alpha
        

def weighted_average(x, weights):
    # x = [bs, len, dim]
    # weights = [bs, len]
    
    weights = weights.unsqueeze(1)
    # weights = [bs, 1, len]
    
    w = weights.bmm(x).squeeze(1)
    # w = [bs, 1, dim] => [bs, dim]
    
    return w

**Attention Layer**

In [190]:
class AttentionLayer(nn.Module):
    
    def __init__(self, context_dim, question_dim, attn_method):
        
        super().__init__()
        
        self.linear = nn.Linear(question_dim, context_dim)

        self.attn_method = attn_method
        
    def forward(self, context, question, context_mask):
        
        # context = [bs, ctx_len, ctx_hid_dim] = [bs, ctx_len, hid_dim*6] = [bs, ctx_len, 768]
        # question = [bs, qtn_hid_dim] = [bs, qtn_len, 768]
        # context_mask = [bs, ctx_len]
        
        # print(self.attn_method)

        if self.attn_method == 'bilinear':
            qtn_proj = self.linear(question)
            qtn_proj = qtn_proj.unsqueeze(2)
            # scores = context.bmm(qtn_proj)
            scores = context.bmm(qtn_proj)
        # qtn_proj = [bs, ctx_hid_dim]
        elif self.attn_method == 'cosine sim':
            qtn_norm = F.normalize(question, dim=-1)  # Normalize the question vectors
            ctx_norm = F.normalize(context, dim=-1)  # Normalize the context vectors
            scores = ctx_norm.bmm(qtn_norm.unsqueeze(2))
        elif self.attn_method == 'scaled dot product':
        # qtn_proj = [bs, ctx_hid_dim, 1]
            scores = context.bmm(question.unsqueeze(2))
        # scores = [bs, ctx_len, 1]
            d_k = question.size(-1)  # Dimension of the query/key vectors
            scores = scores / np.sqrt(d_k)  # Apply scaling by square root of the dimension
        
        scores = scores.squeeze(2)
        # scores = [bs, ctx_len]
        
        scores = scores.masked_fill(context_mask == 1, -float('inf'))
        
        # alpha = nn.LogSoftmax(scores, dim=1)
        # alpha = [bs, ctx_len]
        
        return scores

### 2.2 - Model

In [156]:
class QA_model(nn.Module):
    
    def __init__(self, hidden_dim, embedding_dim, num_layers, num_directions, dropout, device, attn_method):
        
        super().__init__()
        
        self.device = device
        
        #self.embedding = self.get_glove_embedding()
        
        self.context_bilstm = StackedBiLSTM(embedding_dim * 2, hidden_dim, num_layers, dropout)
        
        self.question_bilstm = StackedBiLSTM(embedding_dim, hidden_dim, num_layers, dropout)
        
        self.glove_embedding = self.get_glove_embedding()
        
        def tune_embedding(grad, words=1000):
            grad[words:] = 0
            return grad
        
        self.glove_embedding.weight.register_hook(tune_embedding)
        
        self.align_embedding = AlignQuestionEmbedding(embedding_dim)
        
        self.linear_attn_question = LinearAttentionLayer(hidden_dim*num_layers*num_directions) 

        self.attn_start = AttentionLayer(hidden_dim*num_layers*num_directions, 
                                                          hidden_dim*num_layers*num_directions, attn_method)
        
        self.attn_end = AttentionLayer(hidden_dim*num_layers*num_directions,
                                                        hidden_dim*num_layers*num_directions, attn_method)
        
        self.dropout = nn.Dropout(dropout)

        
    def get_glove_embedding(self):
        
        weights_matrix = np.load('weights.matrix.npy')
        num_embeddings, embedding_dim = weights_matrix.shape
        print(num_embeddings, embedding_dim)
        embedding = nn.Embedding.from_pretrained(torch.FloatTensor(weights_matrix).to(self.device),freeze=False)

        return embedding
    
    
    def forward(self, context, question, context_mask, question_mask):
       
        # context = [bs, len_c]
        # question = [bs, len_q]
        # context_mask = [bs, len_c]
        # question_mask = [bs, len_q]
        
        
        ctx_embed = self.glove_embedding(context)
        # ctx_embed = [bs, len_c, emb_dim]
        
        ques_embed = self.glove_embedding(question)
        # ques_embed = [bs, len_q, emb_dim]
        

        ctx_embed = self.dropout(ctx_embed)
     
        ques_embed = self.dropout(ques_embed)
             
        align_embed = self.align_embedding(ctx_embed, ques_embed, question_mask)
        # align_embed = [bs, len_c, emb_dim]  
        
        ctx_bilstm_input = torch.cat([ctx_embed, align_embed], dim=2)
        # ctx_bilstm_input = [bs, len_c, emb_dim*2]
                
        ctx_outputs = self.context_bilstm(ctx_bilstm_input)
        # ctx_outputs = [bs, len_c, hid_dim*layers*dir] = [bs, len_c, hid_dim*6]
       
        qtn_outputs = self.question_bilstm(ques_embed)
        # qtn_outputs = [bs, len_q, hid_dim*6]
    
        qtn_weights = self.linear_attn_question(qtn_outputs, question_mask)
        # qtn_weights = [bs, len_q]
            
        qtn_weighted = weighted_average(qtn_outputs, qtn_weights)
        # qtn_weighted = [bs, hid_dim*6]
        
        start_scores = self.attn_start(ctx_outputs, qtn_weighted, context_mask)
        # start_scores = [bs, len_c]
         
        end_scores = self.attn_end(ctx_outputs, qtn_weighted, context_mask)
        # end_scores = [bs, len_c]
        
      
        return start_scores, end_scores

**Training**

In [197]:
device = torch.device("cpu")
print(device)
HIDDEN_DIM = 4
EMB_DIM = 77
NUM_LAYERS = 3
NUM_DIRECTIONS = 2
DROPOUT = 0 # vastly different predictions are made each test if dropout is used
attn_method = 'cosine sim'

model = QA_model(HIDDEN_DIM,
                       EMB_DIM, 
                       NUM_LAYERS, 
                       NUM_DIRECTIONS, 
                       DROPOUT, 
                       device,
                       attn_method).to(device)

cpu
39552 77


In [107]:
optimizer = torch.optim.Adamax(model.parameters())

In [108]:
def count_parameters(model):
    '''Returns the number of trainable parameters in the model.'''
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 3,062,303 trainable parameters


In [109]:
def train(model, train_dataset):
    '''
    Trains the model.
    '''
    
    print("Starting training ........")
    
    train_loss = 0.
    batch_count = 0
    
    # put the model in training mode
    model.train()
    
    # iterate through training data
    for batch in train_dataset:

        print(batch_count)

        if batch_count % 500 == 0:
            print(f"Starting batch: {batch_count}")
        batch_count += 1

        context, question, context_mask, question_mask, answer_span, ctx, ids = batch
        # context, question, context_mask, question_mask, label, ctx, ans, ids = batch
        # print(context)
        # print(question)
        # print(context_mask)
        # print(question_mask)
        # print(answer_span)
        
        # place the tensors on GPU
        context, context_mask, question, question_mask, answer_span = context.to(device), context_mask.to(device),\
                                    question.to(device), question_mask.to(device), answer_span.to(device)
        
        # forward pass, get the predictions
        preds = model(context, question, context_mask, question_mask)

        p1, p2 = preds
        # print(p1,p2)
        
        # separate labels for start and end position
        y1, y2 = answer_span[:,0], answer_span[:,1]
        # print(answer_span, ids)
        # print(y1,y2)
        # print(y1, y2)
        # print(p2,y2)
        # for i in range(len(p1[0])):
        #     print(i)
        #     print(max(p1[i]))
        #     print(p1[y1[i]])
        #     break

        # calculate loss
        loss = F.cross_entropy(p1, y1) + F.cross_entropy(p2, y2)
        
        # backward pass, calculates the gradients
        loss.backward()
        
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), 10)
        
        # update the gradients
        optimizer.step()
        
        # zero the gradients to prevent them from accumulating
        optimizer.zero_grad()

        train_loss += loss.item()
    

    return train_loss/len(train_dataset)

**Validation**

In [187]:
def valid(model, valid_dataset):
    '''
    Performs validation.
    '''
    
    print("Starting validation .........")
   
    valid_loss = 0.

    batch_count = 0
    
    f1, em = 0., 0.
    
    # puts the model in eval mode. Turns off dropout
    model.eval()
    
    predictions = {}
    
    for batch in valid_dataset:

        if batch_count % 500 == 0:
            print(f"Starting batch {batch_count}")
        batch_count += 1

        context, question, context_mask, question_mask, answer_span, context_text, ids = batch
        # context, question, context_mask, question_mask, label, context_text, answers, ids = batch

        context, context_mask, question, question_mask, answer_span = context.to(device), context_mask.to(device),\
                                    question.to(device), question_mask.to(device), answer_span.to(device)

        with torch.no_grad():

            preds = model(context, question, context_mask, question_mask)

            p1, p2 = preds

            y1, y2 = answer_span[:,0], answer_span[:,1]

            # cross_entropy function combines the softmax operation with the computation of the cross-entropy loss in a single step
            loss = F.cross_entropy(p1, y1) + F.cross_entropy(p2, y2)

            valid_loss += loss.item()

            
            # get the start and end index positions from the model preds
            
            batch_size, c_len = p1.size()
            ls = nn.LogSoftmax(dim=1)
            # mask = torch.ones(c_len, c_len).tril(-1).to(device)  # Lower triangular matrix
            # mask = mask.fill_(float('-inf'))  # Set all elements to negative infinity
            # mask = mask.unsqueeze(0).expand(batch_size, -1, -1)  # Expand mask for each sample in the batch
            mask = (torch.ones(c_len, c_len) * float('-inf')).to(device).tril(-1).unsqueeze(0).expand(batch_size, -1, -1)
            
            # score = (p1.unsqueeze(2) + p2.unsqueeze(1)) + mask
            score = (ls(p1).unsqueeze(2) + ls(p2).unsqueeze(1)) + mask
            score, s_idx = score.max(dim=1)
            score, e_idx = score.max(dim=1)
            # s_scores, s_idx = score.max(dim=1)
            # e_scores, e_idx = score.topk(k=1, dim=1)  # Retrieve the top 1 scores and indices along the second dimension
            # e_idx = e_idx.squeeze(1)
            print(s_idx, e_idx)
            s_idx = torch.gather(s_idx, 1, e_idx.view(-1, 1)).squeeze()
            # e_idx = torch.gather(e_idx, 1, s_idx.view(-1, 1)).squeeze()
            print(s_idx, e_idx)
            # stack predictions
            for i in range(batch_size):
                id = ids[i]
                # print(id, [p1, p2])
                # pred = context[i][s_idx[i]:e_idx[i]+1]

                # pred = ' '.join([index2word[idx.item()] for idx in pred])
                predictions[id] = [s_idx[i].item(), e_idx[i].item()]
                # predictions[id] = [p1, p2]
            
    print(predictions)
    # em, f1 = evaluate(predictions)            
    # return valid_loss/len(valid_dataset), em, f1
                

**Evaluation**

In [111]:
def normalize_answer(s):
    '''
    Performs a series of cleaning steps on the ground truth and 
    predicted answer.
    '''
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))


def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    '''
    Returns maximum value of metrics for predicition by model against
    multiple ground truths.
    
    :param func metric_fn: can be 'exact_match_score' or 'f1_score'
    :param str prediction: predicted answer span by the model
    :param list ground_truths: list of ground truths against which
                               metrics are calculated. Maximum values of 
                               metrics are chosen.
                            
    
    '''
    scores_for_ground_truths = []
    for ground_truth in ground_truths:
        score = metric_fn(prediction, ground_truth)
        scores_for_ground_truths.append(score)
        
    return max(scores_for_ground_truths)


def f1_score(prediction, ground_truth):
    '''
    Returns f1 score of two strings.
    '''
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1


def exact_match_score(prediction, ground_truth):
    '''
    Returns exact_match_score of two strings.
    '''
    return (normalize_answer(prediction) == normalize_answer(ground_truth))


def epoch_time(start_time, end_time):
    '''
    Helper function to record epoch time.
    '''
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

def evaluate(predictions):
    '''
    Gets a dictionary of predictions with question_id as key
    and prediction as value. The validation dataset has multiple 
    answers for a single question. Hence we compare our prediction
    with all the answers and choose the one that gives us
    the maximum metric (em or f1). 
    This method first parses the JSON file, gets all the answers
    for a given id and then passes the list of answers and the 
    predictions to calculate em, f1.
    
    
    :param dict predictions
    Returns
    : exact_match: 1 if the prediction and ground truth 
      match exactly, 0 otherwise.
    : f1_score: 
    '''
    with open('./data/squad_dev.json','r',encoding='utf-8') as f:
        dataset = json.load(f)
        
    dataset = dataset['data']
    f1 = exact_match = total = 0
    for article in dataset:
        for paragraph in article['paragraphs']:
            for qa in paragraph['qas']:
                total += 1
                if qa['id'] not in predictions:
                    continue
                
                ground_truths = list(map(lambda x: x['text'], qa['answers']))
                
                prediction = predictions[qa['id']]
                
                exact_match += metric_max_over_ground_truths(
                    exact_match_score, prediction, ground_truths)
                
                f1 += metric_max_over_ground_truths(
                    f1_score, prediction, ground_truths)
                
    
    exact_match = 100.0 * exact_match / total
    f1 = 100.0 * f1 / total
    
    return exact_match, f1



# 3.Model Testing
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title) 

In [198]:

train_losses = []
valid_losses = []
ems = []
f1s = []
epochs = 5

for epoch in range(epochs):
    print(f"Epoch {epoch+1}")
    
    start_time = time.time()
    
    train_loss = train(model, train_loader)
    valid_loss, em, f1 = valid(model, test_loader)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    train_losses.append(train_loss)
    valid_losses.append(valid_loss)
    ems.append(em)
    f1s.append(f1)
    
    print(f"Epoch train loss : {train_loss}| Time: {epoch_mins}m {epoch_secs}s")
    print(f"Epoch valid loss: {valid_loss}")
    print(f"Epoch EM: {em}")
    print(f"Epoch F1: {f1}")
    print("====================================================================================")
    

Epoch 1
Starting training ........
0
Starting batch: 0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
Starting validation .........
Starting batch 0
tensor([[0, 0, 2,  ..., 0, 0, 0],
        [0, 1, 2,  ..., 0, 0, 0],
        [0, 1, 2,  ..., 0, 0, 0],
        ...,
        [0, 1, 1,  ..., 0, 0, 0],
        [0, 1, 1,  ..., 0, 0, 0],
        [0, 1, 1,  ..., 0, 0, 0]]) tensor([127, 159,  87,  44, 308, 308, 308, 308, 279, 105,  32, 127, 103,  13,
         48, 124,  64,  26,  61, 255, 156, 261, 111, 111,  17, 158,   1,  32,
         34,  30,  35,  35])
tensor([127, 159,  87,  44, 308, 308, 308, 308, 279, 105,  32, 127, 103,  13,
         48, 124,  64,  26,  61, 255, 156, 261, 111, 111,  17, 158,   1,  32,
         34,  30,  35,  35]) tensor([127, 159,  87,  44, 308, 308, 308, 308, 279, 105,  32, 127, 103,  13,
         48, 124,  64,  26, 

TypeError: cannot unpack non-iterable NoneType object

In [None]:
import os

os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

### 3.1. Input Embedding Ablation Study

(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title) 



### 3.2. Attention Ablation Study
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title) 

### 3.3. Hyper Parameter Testing
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title) 