# 2023 CITS4012 Assignment
*Make sure you change the file name with your student id.*

# Readme
*If there is something to be noted for the marker, please mention here.* 

*If you are planning to implement a program with Object Oriented Programming style, please check the bottom of the this ipynb file*

# 1.DataSet Processing
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title) 

In [82]:
# Necessary Library Imports
import pandas as pd
import numpy as np
import nltk
import re
import torch
import time
import torch.nn.functional as F

from torch import nn
from nltk.tag import pos_tag
from nltk import word_tokenize

nltk.download('punkt')

# Read in data files
test_df = pd.read_csv("WikiQA-test.tsv", sep="\t")
train_df = pd.read_csv("WikiQA-train.tsv", sep="\t")

# TODO: ask about Word Match Feature: check whether the word appears in the question by using decapitalisation or lemmatization - how can we extract that for the question inputs
# TODO: ask about NER tagging 3g and 4g as (3, g) and (4, g)
# TODO: solve if word not in GloVe model

# notes: 

# spacy word tokenizer can be used for the case of tagging 3g and 4g as (3, g) and (4, g), however it is far less
# efficient - the data wrangling function takes about 210 seconds to run instead of 5 seconds when using spacy tokenize

# phrases like 1570–1638 get converted to 15701638 after punctuation removal 

# it would be better to predict which sentence is the answer to the question since that is how the data ia stored - sentences are tagged as the answer
# our model removes the concept of sentences by removing full stops and representing the whole document as a continuous string of tokens
# an improvement would be to keep the sentences and extract which sentence contains the answer

# improvement could be to remove articles (a, an, the)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\New\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


**Dictionary for Contraction Removal**

In [83]:
contraction_dict = {"ID": "identify", "Im": "i am", "im": "i am", "Dont": "do not", "dont": "do not", "ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", 
                    "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", 
                    "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", 
                    "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", 
                    "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", 
                    "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", 
                    "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", 
                    "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", 
                    "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have",
                    "o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", 
                    "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", 
                    "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have",
                    "so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", 
                    "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", 
                    "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", 
                    "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have", 
                    "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", 
                    "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", 
                    "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", 
                    "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", 
                    "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",
                    "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have",
                    "diff'rent": "different", "3g": "3 g", "4g": "4 g"}


def replace_contractions(sentence):
    words = sentence.split()
    replaced_words = []
    for word in words:
        if word in contraction_dict:
            replaced_words.append(contraction_dict[word])
        else:
            replaced_words.append(word)
    replaced_sentence = " ".join(replaced_words).lower()
    return replaced_sentence


def clean(string):
    '''
    Removes puncutation and replaces contractions from a string

    string: string to clean

    return: cleaned string
    '''
    clean_string = re.sub(r'[^\w\s\']', '', string)
    clean_string = replace_contractions(clean_string)
    clean_string = re.sub(r'[^\w\s]', '', string)
    return clean_string

**Vocab Handling**

In [84]:
def create_vocab_list(dfs:list):
    '''
    Creates index2word and word2index dictionaries for encoding from inputted dataframes. Vocab list 
    contains words from dataframe sentences and questions.
    
    dfs: list of dataframes from dataset.

    returns: word2index and index2word dictionaries
    '''
    
    words = set(['[PAD]', '[OOV]'])

    for df in dfs:
        unique_sentences = list(df['Sentence'].unique())
        for sentence in unique_sentences:
            tokens = word_tokenize(clean(sentence.lower()))
            for token in tokens:
                words.add(token)

        unique_questions = list(df['Question'].unique())
        for question in unique_questions:
            tokens = word_tokenize(clean(question.lower()))
            for token in tokens:
                words.add(token)

    word2index = {w: i+2 for i, w in enumerate(words)}
    word2index['[PAD]'] = 1  
    word2index['[OOV]'] = 0  

    index2word = {index:word for word,index in word2index.items()}

    return word2index, index2word

word2index, index2word = create_vocab_list([train_df, test_df])

**Input Encoding**

In [85]:
def encode(tokens, word2index):
    '''
    Converts tokens into respective idsby mapping ach words using word2index. 

    tokens: list of tokens
    word2index: word to index mapping

    returns: list of mapped indexes
    '''
    
    ids = [word2index[word] for word in tokens]

    return ids

**Data Wrangling Function**

In [86]:
# Read in data files
test_data = pd.read_csv("WikiQA-test.tsv", sep="\t")
train_data = pd.read_csv("WikiQA-train.tsv", sep="\t")

def wrangle_data(df):
    """Handles data and splits dataframe into question, document and answer set

    df: dataframe to split

    return: question, document, and answer dataframe
    """

    # Initialise variables containing all Questions, Documents and Answers
    questions = []
    documents = []
    answers = {}

    # Get unique questions from the dataframe
    unique_questions = df['Question'].drop_duplicates()

    # Iterate through questions dataframe to form the Questions data
    for row in unique_questions:
        
        # Remove punctuation (except apostrophes for contraction removal)
        cleaned_row = re.sub(r'[^\w\s\']','', row)

        # Replace contracted words
        cleaned_row = replace_contractions(cleaned_row)

        # Tokenise the question
        question_tokens = word_tokenize(cleaned_row.lower())

        # Append the question to the list of questions
        questions.append(question_tokens)

    # Iterate through dataframe and form Documents and Answer data
    for index, row in df.iterrows():

        # Initalise current document pairs list and previous doc number (only for first iteration)
        if index == 0:
            previous_doc_num = row['DocumentID']
            current_doc_pairs = []

        # If the current doc number is different to the previous doc number, then a new  
        # document has been found so append the current document pairs to the list of documents     
        # and reinitilise the current document pairs list
        current_doc_num = row['DocumentID']
        if current_doc_num != previous_doc_num:
            documents.append(current_doc_pairs)
            current_doc_pairs = []
        
        # Replace contracted words
        cleaned_row = replace_contractions(row['Sentence'])

        # Remove punctuation
        cleaned_row = re.sub(r'[^\w\s]','', cleaned_row)

        # Tokenise the current row's sentence
        sent_tokens = word_tokenize(cleaned_row.lower())

        # Assign token types according to 'Label' field (1 means the corresponding sentence
        # is the answer/one of the answers)
        # Token types:
        #   0: token a part of the answer
        #   1: start token of the answer
        #   2: inner token of the answer
        #   3: end token of the answer
        if row['Label']:
            # Answer to question is found
            token_types = np.full(shape=len(sent_tokens), fill_value=2)
            token_types[0] = 1
            token_types[-1] = 3

            # Add the tokenised answer to the answers dictionary. If an answer has already been found.
            # add the tokens to the current answer.
            if row['QuestionID'] in answers:
                answers[row['QuestionID']] += sent_tokens 
            else:
                answers[row['QuestionID']] = sent_tokens
        else:
            # Answer not found 
            token_types = np.zeros(len(sent_tokens), dtype=int)

        # Combine token and corresponding types and append to current document list
        token_type_pairs = list(zip(sent_tokens, token_types))
        current_doc_pairs.append(token_type_pairs)

        # Update previous document number 
        previous_doc_num = current_doc_num

        # If the iterated row is the final row in the dataframe, append the current document
        # to the documents array
        if index == len(df)-1:
            documents.append(current_doc_pairs)

    return questions, documents, answers

train_questions, train_documents, train_answers = wrangle_data(train_data)
test_questions, test_documents, test_answers = wrangle_data(test_data)


In [87]:
def wrangle_data(df):
    """
    Handles data wrangling and returns a dataframe giving each question the corresponding document and
    answer span set. Encodes the words into their index of the vocab list as well.

    Args:
        df (pandas dataframe): dataframe to wrangle

    Returns: 
        dataframe containing questions with the corresponding document and answer span set (encoded).
    """

    # Initialise variable containing Answer Spans (questions can have 0, 1, or multiple answers)
    answer_spans = [[0,0]]

    # Intialise df object that will be returned after wrangling
    wrangled_df = pd.DataFrame(columns=['question_id', 'question', 'document', 'answer span'])


    # Iterate through dataframe and form Documents and Answer Span data
    for index, row in df.iterrows():
        # print(index)

        # Initalise current document tokens list, length and previous doc number (only for first iteration)
        if index == 0:
            previous_doc_num = row['DocumentID']
            current_doc_length = 0
            current_doc_tokens = []


        # If the current doc number is different to the previous doc number, then a new  
        # document has been found 
        current_doc_num = row['DocumentID']
        if current_doc_num != previous_doc_num:

            # Remove punctuation from question and replace contractions
            cleaned_qn = clean(previous_qn)

            # Tokenize the question 
            qn_tokens = word_tokenize(cleaned_qn.lower())
                
            for span in answer_spans:
                # Create a new row with multiple values
                # print(current_doc_tokens)
                new_row = {'question_id': previous_qn_id, 
                           'question': [encode(qn_tokens, word2index)],
                           'document':[encode(current_doc_tokens, word2index)],
                           'answer span': [span]}
                
                # Convert the new row to a DataFrame
                new_row_df = pd.DataFrame(new_row)

                # Append the new row DataFrame to the original DataFrame
                wrangled_df = pd.concat([wrangled_df, new_row_df], ignore_index=True)

            # Reinitialise document tokens list, current document length, and answer spans
            current_doc_tokens = []
            current_doc_length = 0
            answer_spans = [[0,0]]


        # Remove punctuation from document sentence and replace contracted words from document sentence 
        cleaned_sent = clean(row['Sentence'])

        # Tokenise the current row's sentence and append to the document tokens list
        sent_tokens = word_tokenize(cleaned_sent.lower())
        current_doc_tokens.extend(sent_tokens)


        if row['Label']:
            # Answer to question is found

            # Find the span of the answer
            span_start = current_doc_length
            span_end = span_start + len(sent_tokens)

            # Add answer span to current document's answer spans
            if answer_spans[0] == [0,0]:
                answer_spans[0] = [span_start, span_end]
            else:
                answer_spans.append([span_start, span_end])
            
            # Increase length of current document
            current_doc_length += len(sent_tokens)

        else:
            # Answer not found 

            # Increase length of current document
            current_doc_length += len(sent_tokens)

        # Update previous document number and question
        previous_doc_num = current_doc_num
        previous_qn_id = row['QuestionID']
        previous_qn = row['Question']

        # If the iterated row is the final row in the dataframe, add the final question and 
        # document to the wrangled df
        if index == len(df)-1:

            # Remove punctuation from question and replaces contractions
            cleaned_qn = clean(previous_qn)

            # Tokenize the question 
            qn_tokens = word_tokenize(cleaned_qn.lower())

            for span in answer_spans:
                # Create a new row with multiple values
                new_row = {'question_id': previous_qn_id,
                           'question': [encode(qn_tokens, word2index)],
                           'document': [encode(current_doc_tokens, word2index)],
                           'answer span': [span]}

                # Convert the new row to a DataFrame
                new_row_df = pd.DataFrame(new_row)

                # Append the new row DataFrame to the original DataFrame
                wrangled_df = pd.concat([wrangled_df, new_row_df], ignore_index=True)

    return wrangled_df

wrangled_train_data = wrangle_data(train_df)
wrangled_test_data = wrangle_data(test_df)


In [157]:
# Have a look at the wrangled data
wrangled_train_data

Unnamed: 0,question_id,question,document,answer span
0,Q1,"[12365, 15581, 18831, 36162, 14077]","[5820, 17206, 17579, 18831, 33492, 6930, 6430,...","[24, 37]"
1,Q2,"[12365, 15581, 36119, 36206, 10682, 36119, 231...","[13851, 22737, 21061, 17360, 14472, 5820, 1866...","[0, 0]"
2,Q5,"[12365, 38089, 11551, 1859, 3856]","[11551, 1859, 14472, 5820, 20933, 17017, 36213...","[0, 0]"
3,Q6,"[12365, 32302, 14472, 36119, 5478, 20328, 1640...","[13851, 36119, 10231, 7959, 36119, 11658, 1068...","[0, 0]"
4,Q7,"[12365, 5820, 14435, 25246, 4683, 39107, 19458...","[36119, 14435, 30015, 17267, 14472, 5820, 3245...","[0, 0]"
...,...,...,...,...
2279,Q3042,"[39440, 35936, 21824, 35709, 1739]","[21824, 1529, 28091, 21824, 35709, 1529, 14472...","[76, 94]"
2280,Q3043,"[38123, 14472, 32742, 10182, 30157]","[32742, 17732, 30157, 13851, 36119, 17278, 248...","[7, 42]"
2281,Q3043,"[38123, 14472, 32742, 10182, 30157]","[32742, 17732, 30157, 13851, 36119, 17278, 248...","[42, 70]"
2282,Q3044,"[38123, 14472, 36119, 695, 9313, 10682, 25140]","[37906, 380, 32898, 9313, 26246, 6488, 3431, 1...","[0, 0]"


**Data Loader**

A class for loading the during training and testing is created

In [91]:
import spacy
nlp = spacy.load("en_core_web_sm")

class DataLoader:
    '''
    -Divides the dataframe in batches.
    -Pads the contexts and questions dynamically for each batch by padding 
     the examples to the maximum-length sequence in that batch.
    -Calculates masks for context and question.
    '''
    
    def __init__(self, data, batch_size):
        
        self.batch_size = batch_size
        data = [data[i:i+self.batch_size] for i in range(0, len(data), self.batch_size)]
        self.data = data

    def __len__(self):
        return len(self.data)
    
    def __iter__(self):
        '''
        Creates batches of data and yields them.
        
        Each yield comprises of:
        :padded_context: padded tensor of contexts for each batch 
        :padded_question: padded tensor of questions for each batch 
        :context_mask & question_mask: zero-mask for question and context
        :answer_span: start and end index wrt context_ids
        :context_text,answer_text: used while validation to calculate metrics
        :ids: question_ids used in evaluation
        '''
        
        for batch in self.data:

            context_text = []
            
            max_context_len = max([len(document) for document in batch['document']])
            padded_context = torch.LongTensor(len(batch), max_context_len).fill_(1)
            
            for ctx in batch['document']:
                context_text.append(ctx)
                
            for i, ctx in enumerate(batch['document']):
                padded_context[i, :len(ctx)] = torch.LongTensor(ctx)
            
            max_question_len = max([len(ques) for ques in batch['question']])
            padded_question = torch.LongTensor(len(batch), max_question_len).fill_(1)
            
            for i, ques in enumerate(batch['question']):
                padded_question[i,: len(ques)] = torch.LongTensor(ques)
                
            
            answer_span = torch.LongTensor(list(batch['answer span']))
            context_mask = torch.eq(padded_context, 1)
            question_mask = torch.eq(padded_question, 1)
            
            qn_ids = list(batch['question_id'])
             
            yield (padded_context, padded_question, context_mask, 
                   question_mask, answer_span, context_text, qn_ids)

In [150]:
train_loader = DataLoader(wrangled_train_data, 32)
test_loader = DataLoader(wrangled_test_data, 32)

In [96]:
import spacy
nlp = spacy.load("en_core_web_sm")

total = 0
for qn in train_questions:
    total += len(qn)
print(total)
train_questions

docs = []
total_2 = 0

for sentence in test_questions:
    text = " ".join(sentence)
    doc = nlp(text)
    docs.append(doc)

for i in range(len(test_questions)):
    # for x in doc:
    #     total_2 += 1
    #     print(total_2, x)
    if len(test_questions[i]) != len(docs[i]):
        print(test_questions[i], len(test_questions[i]))
        print([x for x in docs[i]], len(docs[i]))
    # total_2 += len(train_questions[i])
    # print(total_2)

14725


In [97]:
# for debugging - find all labels equal to 1
train_data.loc[train_data['Label'] == 1]

Unnamed: 0,QuestionID,Question,DocumentID,DocumentTitle,SentenceID,Sentence,Label
3,Q1,how are glacier caves formed?,D1,Glacier cave,D1-3,A glacier cave is a cave formed within the ice...,1
75,Q16,how much is 1 tablespoon of water,D16,Tablespoon,D16-0,This tablespoon has a capacity of about 15 mL.,1
83,Q16,how much is 1 tablespoon of water,D16,Tablespoon,D16-8,In the USA one tablespoon (measurement unit) i...,1
84,Q16,how much is 1 tablespoon of water,D16,Tablespoon,D16-9,In Australia one tablespoon (measurement unit)...,1
98,Q17,how much are the harry potter movies worth,D17,Harry Potter,D17-13,The series also originated much tie-in merchan...,1
...,...,...,...,...,...,...,...
20292,Q3037,What is an economic feature?,D2802,Economics,D2802-9,"At the turn of the 21st century, the expanding...",1
20307,Q3039,what is the average american income,D1876,Household income in the United States,D1876-6,"U.S. median household income fell from $51,144...",1
20325,Q3042,When was Apple Computer founded,D2806,Apple Inc.,D2806-3,"The company was founded on April 1, 1976, and ...",1
20335,Q3043,what is section eight housing,D2807,Section 8 (housing),D2807-1,"Section 8 of the Housing Act of 1937 (), often...",1


# 2.QA Model Implementation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title) 

**TFIDF (Term Frequency Inverse Document Frequency)**

In [98]:
def calculate_df(tokenized_data):
    """ Calculate term frequencies for a given list of list of tokens

    tokenized_data: tokenized dataset

    return: dictionary of frequencies for each term in the dataset
    """
    DF = {}

    for token_vector in tokenized_data:
        # get each unique word in the doc and count the number of occurences in the document
        for term in np.unique(token_vector):
            try:
                DF[term] += 1
            except:
                DF[term] = 1

    # print scores in descending order
    sorted_dict = sorted(DF.items(), key = lambda x : x[1], reverse = True)
    #print(sorted_dict)
    
    return DF

In [None]:
# calculate TFIDF scores

from collections import Counter
import math

def calculate_tf_idf(tokenized_data):

    tf_idf = {}

    # total number of documents
    n = len(tokenized_data)

    # calculate Document Frequencies
    DF = calculate_df(tokenized_data)

    doc_id = 0
    # get each token vector
    for token_vector in tokenized_data:
        # initialise counter for the vector
        counter = Counter(token_vector)
        #calculate total number of words in the doc
        total_num_words = len(token_vector)

        # get each unique word in the doc
        for term in np.unique(token_vector):

            # calculate Term Frequency
            tf = counter[term]/total_num_words

            # calculate Document Frequency
            df = DF[term]

            # calculate Inverse Document Frequency
            idf = math.log(n/(df+1))+1

            # calcaulte TF-IDF
            tf_idf[doc_id, term] = tf*idf

        doc_id += 1

    return tf_idf

tf_idfs = calculate_tf_idf(train_questions)
tf_idfs


**POS Tagging Dictionary**

In [100]:
from nltk.tag import pos_tag_sents
nltk.download('averaged_perceptron_tagger')


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\New\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [101]:
pos2index = {}
pos2index['NN'] = 0
index = 1

# Iterate through words in corpus and create POS tag lookup table
for word in word2index.keys():

    pos = pos_tag([word])[0][-1]

    if pos not in pos2index:
        pos2index[pos] = index
        index += 1


In [None]:
pos2index

**Named Entity Recognition**

In [106]:
# ! python -m spacy validate
import spacy
nlp = spacy.load("en_core_web_sm")

In [108]:
ner2index = {}
index = 0

# Iterate through words in corpus and create NER tag lookup table
for word in word2index.keys():

    ner = nlp(word)[0].ent_type_

    if ner not in ner2index:
        ner2index[ner] = index
        index += 1

In [None]:
ner2index

**GloVe/Word2VeC word embedding**

In [111]:
import gensim.downloader as api

# model = api.load("word2vec-google-news-300")
embed_model = api.load("glove-wiki-gigaword-50")

In [119]:
def create_input_embedding(embed_model, pos=False, ner=False):
    '''
    Creates a weight matrix of the words that are common in the GloVe vocab and
    the dataset's vocab. Initializes OOV words with a zero vector.
    Also includes embeddings for NER and POS tags.
    '''
    word_dim = embed_model.vector_size
    print(pos, ner)

    if ner:
        ner_dim = len(ner2index)
    else:
        ner_dim = 0

    if pos:
        pos_dim = len(pos2index)
    else:
        pos_dim = 0

    weights_matrix = np.zeros((len(word2index) + 2, word_dim + ner_dim + pos_dim))

    words_found = 0
    for i, word in enumerate(word2index.keys()):
        

        if word in embed_model.key_to_index:
            input_embedding = embed_model[word]
            
            if ner:
                ner_tag_embedding = np.zeros(ner_dim)
                if nlp(word)[0].ent_type_ in ner2index:
                    ner_tag_embedding[ner2index[nlp(word)[0].ent_type_]] = 1.0
                input_embedding = np.hstack((input_embedding, ner_tag_embedding))

            if pos:
                pos_tag_embedding = np.zeros(pos_dim)
                if pos_tag([word])[0][-1] in pos2index:
                    pos_tag_embedding[pos2index[pos_tag([word])[0][-1]]] = 1.0
                input_embedding = np.hstack((input_embedding, pos_tag_embedding))

            weights_matrix[i + 2] = input_embedding
            words_found += 1

    return weights_matrix, words_found

In [116]:
embed_model.key_to_index

{'the': 0,
 ',': 1,
 '.': 2,
 'of': 3,
 'to': 4,
 'and': 5,
 'in': 6,
 'a': 7,
 '"': 8,
 "'s": 9,
 'for': 10,
 '-': 11,
 'that': 12,
 'on': 13,
 'is': 14,
 'was': 15,
 'said': 16,
 'with': 17,
 'he': 18,
 'as': 19,
 'it': 20,
 'by': 21,
 'at': 22,
 '(': 23,
 ')': 24,
 'from': 25,
 'his': 26,
 "''": 27,
 '``': 28,
 'an': 29,
 'be': 30,
 'has': 31,
 'are': 32,
 'have': 33,
 'but': 34,
 'were': 35,
 'not': 36,
 'this': 37,
 'who': 38,
 'they': 39,
 'had': 40,
 'i': 41,
 'which': 42,
 'will': 43,
 'their': 44,
 ':': 45,
 'or': 46,
 'its': 47,
 'one': 48,
 'after': 49,
 'new': 50,
 'been': 51,
 'also': 52,
 'we': 53,
 'would': 54,
 'two': 55,
 'more': 56,
 "'": 57,
 'first': 58,
 'about': 59,
 'up': 60,
 'when': 61,
 'year': 62,
 'there': 63,
 'all': 64,
 '--': 65,
 'out': 66,
 'she': 67,
 'other': 68,
 'people': 69,
 "n't": 70,
 'her': 71,
 'percent': 72,
 'than': 73,
 'over': 74,
 'into': 75,
 'last': 76,
 'some': 77,
 'government': 78,
 'time': 79,
 '$': 80,
 'you': 81,
 'years': 82,
 'i

In [120]:
weights_matrix, words_found = create_input_embedding(embed_model, pos = True, ner = False)

True False


In [121]:
words_found

33204

In [122]:
word2index['the']

36119

In [123]:
if 'yes' in embed_model.key_to_index:
    print('yes')

yes


In [124]:
embed_model['the']

array([ 4.1800e-01,  2.4968e-01, -4.1242e-01,  1.2170e-01,  3.4527e-01,
       -4.4457e-02, -4.9688e-01, -1.7862e-01, -6.6023e-04, -6.5660e-01,
        2.7843e-01, -1.4767e-01, -5.5677e-01,  1.4658e-01, -9.5095e-03,
        1.1658e-02,  1.0204e-01, -1.2792e-01, -8.4430e-01, -1.2181e-01,
       -1.6801e-02, -3.3279e-01, -1.5520e-01, -2.3131e-01, -1.9181e-01,
       -1.8823e+00, -7.6746e-01,  9.9051e-02, -4.2125e-01, -1.9526e-01,
        4.0071e+00, -1.8594e-01, -5.2287e-01, -3.1681e-01,  5.9213e-04,
        7.4449e-03,  1.7778e-01, -1.5897e-01,  1.2041e-02, -5.4223e-02,
       -2.9871e-01, -1.5749e-01, -3.4758e-01, -4.5637e-02, -4.4251e-01,
        1.8785e-01,  2.7849e-03, -1.8411e-01, -1.1514e-01, -7.8581e-01],
      dtype=float32)

In [125]:
print(weights_matrix.shape)

(39552, 77)


In [126]:
print(weights_matrix[31499])

[-0.25933999 -0.07604     0.37358001 -1.03009999  0.21642999  0.85126001
 -0.60244    -0.31900001 -0.33676001  0.60354    -0.10049    -1.13909996
  0.43123001  0.0043515   0.72029001 -0.41692999 -0.42166999  0.27452001
 -0.077404   -0.057161    0.13315     1.3369      0.56550002  1.5079
  0.049057   -0.72298002  0.082693   -0.61088002  0.26642999 -0.53430003
  1.66630006  0.93687999  0.12852    -0.88971001  0.53053999  0.33317
 -0.23241    -0.63954002 -0.021537   -0.026442   -0.57050002 -0.017097
  0.78815001  0.58354998  1.44850004 -0.53219998  0.047523   -0.65527999
  0.37983    -0.075898    0.          0.          0.          1.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.        ]


In [127]:
np.save('weights.matrix', weights_matrix)

In [128]:
len(word2index)

39550

**Concatenate Word Embeddings and Features** (unused)

In [129]:
def concat_inputs(data):

    embeddings = []

    for index, sentence in enumerate(data):
        sent_embedding = []

        # Iterate through tokens
        for token in sentence:
            token_embedding = []

            # If the word is in the word embedding model's vocabulary then append its embedding,
            # otherwise an OOV vector of [0.0] * embedding dimension is appended (for simplicity)
            if token in embed_model.key_to_index:
                token_embedding.append([embed_model[token]])
            else:
                token_embedding.append(np.zeros(50))

            # Look up and append tf_idf score to the embedding using the question number and word as the key 
            token_embedding.append(tf_idfs[(index, token)])

            # Append the pos tag to the input embedding
            token_embedding.append(pos_tag([token])[0][-1])

            # Append the token embedding to the sentence embedding
            sent_embedding.append(token_embedding)

        # Append the sentence embedding to the list of all embeddings
        embeddings.append(sent_embedding)
    
    return np.array(embeddings, dtype=object)

### 2.1 - Layers

**Feature Embedding layer**

In [132]:
class AlignQuestionEmbedding(nn.Module):
    
    def __init__(self, input_dim):        
        
        super().__init__()
        
        self.linear = nn.Linear(input_dim, input_dim)
        
        self.relu = nn.ReLU()
        
    def forward(self, context, question, question_mask):
        
        # context = [bs, ctx_len, emb_dim]
        # question = [bs, qtn_len, emb_dim]
        # question_mask = [bs, qtn_len]
    
        ctx_ = self.linear(context)
        ctx_ = self.relu(ctx_)
        # ctx_ = [bs, ctx_len, emb_dim]
        
        qtn_ = self.linear(question)
        qtn_ = self.relu(qtn_)
        # qtn_ = [bs, qtn_len, emb_dim]
        
        qtn_transpose = qtn_.permute(0,2,1)
        # qtn_transpose = [bs, emb_dim, qtn_len]
        
        align_scores = torch.bmm(ctx_, qtn_transpose)
        # align_scores = [bs, ctx_len, qtn_len]
        
        qtn_mask = question_mask.unsqueeze(1).expand(align_scores.size())
        # qtn_mask = [bs, 1, qtn_len] => [bs, ctx_len, qtn_len]
        
        # Fills elements of self tensor(align_scores) with value(-float(inf)) where mask is True. 
        # The shape of mask must be broadcastable with the shape of the underlying tensor.
        align_scores = align_scores.masked_fill(qtn_mask == 1, -float('inf'))
        # align_scores = [bs, ctx_len, qtn_len]
        
        align_scores_flat = align_scores.view(-1, question.size(1))
        # align_scores = [bs*ctx_len, qtn_len]
        
        alpha = F.softmax(align_scores_flat, dim=1)
        alpha = alpha.view(-1, context.shape[1], question.shape[1])
        # alpha = [bs, ctx_len, qtn_len]
        
        align_embedding = torch.bmm(alpha, question)
        # align = [bs, ctx_len, emb_dim]
        
        return align_embedding

**Bi-LSTM Layer**

In [133]:
class StackedBiLSTM(nn.Module):
    
    def __init__(self, input_dim, hidden_dim, num_layers, dropout):
        
        super().__init__()
        
        self.dropout = dropout
        
        self.num_layers = num_layers
        
        self.lstms = nn.ModuleList()
        
        for i in range(self.num_layers):
            
            if i == 0:
                input_dim = input_dim 
            else:
                input_dim = hidden_dim * 2
            
            self.lstms.append(nn.LSTM(input_dim, hidden_dim,
                                      batch_first=True, bidirectional=True))
           
    
    def forward(self, x):
        # x = [bs, seq_len, feature_dim]

        outputs = [x]
        for i in range(self.num_layers):

            lstm_input = outputs[-1]
            lstm_out = F.dropout(lstm_input, p=self.dropout)
            lstm_out, (hidden, cell) = self.lstms[i](lstm_input)
           
            outputs.append(lstm_out)

    
        output = torch.cat(outputs[1:], dim=2)
        # [bs, seq_len, num_layers*num_dir*hidden_dim]
        
        output = F.dropout(output, p=self.dropout)
      
        return output

**Linear Attention Layer**

In [134]:
class LinearAttentionLayer(nn.Module):
    
    def __init__(self, input_dim):
        
        super().__init__()
        
        self.linear = nn.Linear(input_dim, 1)
        
    def forward(self, question, question_mask):
        
        # question = [bs, qtn_len, input_dim] = [bs, qtn_len, bi_lstm_hid_dim]
        # question_mask = [bs,  qtn_len]
        
        qtn = question.view(-1, question.shape[-1])
        # qtn = [bs*qtn_len, hid_dim]
        
        attn_scores = self.linear(qtn)
        # attn_scores = [bs*qtn_len, 1]
        
        attn_scores = attn_scores.view(question.shape[0], question.shape[1])
        # attn_scores = [bs, qtn_len]
        
        attn_scores = attn_scores.masked_fill(question_mask == 1, -float('inf'))
        
        alpha = F.softmax(attn_scores, dim=1)
        # alpha = [bs, qtn_len]
        
        return alpha
        

def weighted_average(x, weights):
    # x = [bs, len, dim]
    # weights = [bs, len]
    
    weights = weights.unsqueeze(1)
    # weights = [bs, 1, len]
    
    w = weights.bmm(x).squeeze(1)
    # w = [bs, 1, dim] => [bs, dim]
    
    return w

**Attention Layer**

In [135]:
class AttentionLayer(nn.Module):
    
    def __init__(self, context_dim, question_dim, attn_method):
        
        super().__init__()
        
        self.linear = nn.Linear(question_dim, context_dim)

        self.attn_method = attn_method
        
    def forward(self, context, question, context_mask):
        
        # context = [bs, ctx_len, ctx_hid_dim] = [bs, ctx_len, hid_dim*6] = [bs, ctx_len, 768]
        # question = [bs, qtn_hid_dim] = [bs, qtn_len, 768]
        # context_mask = [bs, ctx_len]
        
        # print(self.attn_method)

        if self.attn_method == 'bilinear':
            qtn_proj = self.linear(question)
            qtn_proj = qtn_proj.unsqueeze(2)
            # scores = context.bmm(qtn_proj)
            scores = context.bmm(qtn_proj)
        # qtn_proj = [bs, ctx_hid_dim]
        elif self.attn_method == 'cosine sim':
            qtn_norm = F.normalize(question, dim=-1)  # Normalize the question vectors
            ctx_norm = F.normalize(context, dim=-1)  # Normalize the context vectors
            scores = ctx_norm.bmm(qtn_norm.unsqueeze(2))
        elif self.attn_method == 'scaled dot product':
        # qtn_proj = [bs, ctx_hid_dim, 1]
            scores = context.bmm(question.unsqueeze(2))
        # scores = [bs, ctx_len, 1]
            d_k = question.size(-1)  # Dimension of the query/key vectors
            scores = scores / np.sqrt(d_k)  # Apply scaling by square root of the dimension
        
        scores = scores.squeeze(2)
        # scores = [bs, ctx_len]
        
        scores = scores.masked_fill(context_mask == 1, -float('inf'))
        
        # alpha = nn.LogSoftmax(scores, dim=1)
        # alpha = [bs, ctx_len]
        
        return scores

### 2.2 - Model

In [136]:
class QA_model(nn.Module):
    
    def __init__(self, hidden_dim, embedding_dim, num_layers, num_directions, dropout, device, attn_method):
        
        super().__init__()
        
        self.device = device
        
        #self.embedding = self.get_glove_embedding()
        
        self.context_bilstm = StackedBiLSTM(embedding_dim * 2, hidden_dim, num_layers, dropout)
        
        self.question_bilstm = StackedBiLSTM(embedding_dim, hidden_dim, num_layers, dropout)
        
        self.glove_embedding = self.get_glove_embedding()
        
        def tune_embedding(grad, words=1000):
            grad[words:] = 0
            return grad
        
        self.glove_embedding.weight.register_hook(tune_embedding)
        
        self.align_embedding = AlignQuestionEmbedding(embedding_dim)
        
        self.linear_attn_question = LinearAttentionLayer(hidden_dim*num_layers*num_directions) 

        self.attn_start = AttentionLayer(hidden_dim*num_layers*num_directions, 
                                                          hidden_dim*num_layers*num_directions, attn_method)
        
        self.attn_end = AttentionLayer(hidden_dim*num_layers*num_directions,
                                                        hidden_dim*num_layers*num_directions, attn_method)
        
        self.dropout = nn.Dropout(dropout)

        
    def get_glove_embedding(self):
        
        weights_matrix = np.load('weights.matrix.npy')
        num_embeddings, embedding_dim = weights_matrix.shape
        print(num_embeddings, embedding_dim)
        embedding = nn.Embedding.from_pretrained(torch.FloatTensor(weights_matrix).to(self.device),freeze=False)

        return embedding
    
    
    def forward(self, context, question, context_mask, question_mask):
       
        # context = [bs, len_c]
        # question = [bs, len_q]
        # context_mask = [bs, len_c]
        # question_mask = [bs, len_q]
        
        
        ctx_embed = self.glove_embedding(context)
        # ctx_embed = [bs, len_c, emb_dim]
        
        ques_embed = self.glove_embedding(question)
        # ques_embed = [bs, len_q, emb_dim]
        

        ctx_embed = self.dropout(ctx_embed)
     
        ques_embed = self.dropout(ques_embed)
             
        align_embed = self.align_embedding(ctx_embed, ques_embed, question_mask)
        # align_embed = [bs, len_c, emb_dim]  
        
        ctx_bilstm_input = torch.cat([ctx_embed, align_embed], dim=2)
        # ctx_bilstm_input = [bs, len_c, emb_dim*2]
                
        ctx_outputs = self.context_bilstm(ctx_bilstm_input)
        # ctx_outputs = [bs, len_c, hid_dim*layers*dir] = [bs, len_c, hid_dim*6]
       
        qtn_outputs = self.question_bilstm(ques_embed)
        # qtn_outputs = [bs, len_q, hid_dim*6]
    
        qtn_weights = self.linear_attn_question(qtn_outputs, question_mask)
        # qtn_weights = [bs, len_q]
            
        qtn_weighted = weighted_average(qtn_outputs, qtn_weights)
        # qtn_weighted = [bs, hid_dim*6]
        
        start_scores = self.attn_start(ctx_outputs, qtn_weighted, context_mask)
        # start_scores = [bs, len_c]
         
        end_scores = self.attn_end(ctx_outputs, qtn_weighted, context_mask)
        # end_scores = [bs, len_c]
        
      
        return start_scores, end_scores

**Training**

In [155]:
device = torch.device("cpu")
print(device)
HIDDEN_DIM = 4
EMB_DIM = 77
NUM_LAYERS = 3
NUM_DIRECTIONS = 2
DROPOUT = 0 # vastly different predictions are made each test if dropout is used
attn_method = 'bilinear'

model = QA_model(HIDDEN_DIM,
                       EMB_DIM, 
                       NUM_LAYERS, 
                       NUM_DIRECTIONS, 
                       DROPOUT, 
                       device,
                       attn_method).to(device)

cpu
39552 77


In [138]:
optimizer = torch.optim.Adamax(model.parameters())

In [139]:
def count_parameters(model):
    '''Returns the number of trainable parameters in the model.'''
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 3,062,303 trainable parameters


In [140]:
def train(model, train_dataset):
    '''
    Trains the model.
    '''
    
    print("Starting training ........")
    
    train_loss = 0.
    batch_count = 0
    
    # put the model in training mode
    model.train()
    
    # iterate through training data
    for batch in train_dataset:

        # print(batch_count)

        if batch_count % 500 == 0:
            print(f"Starting batch: {batch_count}")
        batch_count += 1

        context, question, context_mask, question_mask, answer_span, ctx, ids = batch
        # context, question, context_mask, question_mask, label, ctx, ans, ids = batch
        # print(context)
        # print(question)
        # print(context_mask)
        # print(question_mask)
        # print(answer_span)
        
        # place the tensors on GPU/CPU
        context, context_mask, question, question_mask, answer_span = context.to(device), context_mask.to(device),\
                                    question.to(device), question_mask.to(device), answer_span.to(device)
        
        # forward pass, get the predictions
        preds = model(context, question, context_mask, question_mask)

        p1, p2 = preds
        # print(p1,p2)
        
        # separate labels for start and end position
        y1, y2 = answer_span[:,0], answer_span[:,1]
        # print(answer_span, ids)
        # print(y1,y2)
        # print(y1, y2)
        # print(p2,y2)
        # for i in range(len(p1[0])):
        #     print(i)
        #     print(max(p1[i]))
        #     print(p1[y1[i]])
        #     break

        # calculate loss
        loss = F.cross_entropy(p1, y1) + F.cross_entropy(p2, y2)
        
        # backward pass, calculates the gradients
        loss.backward()
        
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), 10)
        
        # update the gradients
        optimizer.step()
        
        # zero the gradients to prevent them from accumulating
        optimizer.zero_grad()

        train_loss += loss.item()
    

    return train_loss/len(train_dataset)

**Evaluation**

In [141]:
def epoch_time(start_time, end_time):
    '''
    Helper function to record epoch time.
    '''
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

def calculate_precision(predicted_span, ground_truth_span):
    """
    Calculate precision for a predicted span given the ground truth span.

    Args:
        predicted_span (list): Predicted span [start_token, end_token]
        ground_truth_span (list): Ground truth span [start_token, end_token]

    Returns:
        float: Precision score
    """
    # Calculate the number of tokens in the predicted span that are in the ground truth span
    predicted_tokens = set(range(predicted_span[0], predicted_span[1] + 1))
    ground_truth_tokens = set(range(ground_truth_span[0], ground_truth_span[1] + 1))
    intersection = predicted_tokens.intersection(ground_truth_tokens)
    
    # Calculate the precision score
    precision = len(intersection) / len(predicted_tokens)
    
    return precision

def calculate_recall(predicted_span, ground_truth_span):
    """
    Calculate recall for a predicted span given the ground truth span.

    Args:
        predicted_span (list): Predicted span [start_token, end_token]
        ground_truth_span (list): Ground truth span [start_token, end_token]

    Returns:
        float: Recall score
    """
    
    # Calculate the number of tokens in the ground truth span that are in the predicted span
    predicted_tokens = set(range(predicted_span[0], predicted_span[1] + 1))
    ground_truth_tokens = set(range(ground_truth_span[0], ground_truth_span[1] + 1))
    intersection = predicted_tokens.intersection(ground_truth_tokens)
    
    # Calculate the recall score
    recall = len(intersection) / len(ground_truth_tokens)
    
    return recall

def calculate_f1(precision, recall):
    """
    Calculate F1 score given precision and recall.

    Args:
        precision (float): Precision score
        recall (float): Recall score

    Returns:
        float: F1 score
    """
    if precision == 0.0 or recall == 0.0:
        return 0.0
    
    f1 = (2 * precision * recall) / (precision + recall)
    return f1

def evaluate(predicted_spans, ground_truth_spans):
    """
    Calculate precision, recall, and F1 score for predicted spans compared to ground truth spans.

    Args:
        predicted_spans (dict): Predicted spans with question IDs as keys and lists of predicted spans as values
        ground_truth_spans (dict): Ground truth spans with question IDs as keys and lists of ground truth spans as values

    Returns:
        tuple: Average precision, recall, and F1 score
    """
    total_precision = 0.0
    total_recall = 0.0
    total_f1 = 0.0
    
    for question_id, predicted_span in predicted_spans.items():
        ground_truth_span = ground_truth_spans[question_id]
        
        best_precision = 0.0
        best_recall = 0.0
        best_f1 = 0.0
        
        # Iterate over each ground truth span
        for gt_span in ground_truth_span:
            # Calculate precision, recall, and F1 score
            precision = calculate_precision(predicted_span, gt_span)
            recall = calculate_recall(predicted_span, gt_span)
            f1 = calculate_f1(precision, recall)
            
            # Update best metrics if necessary
            if precision > best_precision:
                best_precision = precision
            if recall > best_recall:
                best_recall = recall
            if f1 > best_f1:
                best_f1 = f1
        
        # Add the best metrics for this question to the totals
        total_precision += best_precision
        total_recall += best_recall
        total_f1 += best_f1
    
    # Calculate average precision, recall, and F1 score
    avg_precision = total_precision / len(predicted_spans)
    avg_recall = total_recall / len(predicted_spans)
    avg_f1 = total_f1 / len(predicted_spans)
    
    return avg_precision, avg_recall, avg_f1


**Validation**

Create dictionary containing answer spans of the test data for evaluation

In [149]:
answer_span_dict = {}

# Iterate over the rows of the DataFrame
for _, row in wrangled_test_data.iterrows():
    question_id = row['question_id']
    answer_span = row['answer span']
    
    # If the question ID is already present in the dictionary, append the answer span
    if question_id in answer_span_dict:
        answer_span_dict[question_id].append(answer_span)
    # If the question ID is not present, create a new list with the answer span
    else:
        answer_span_dict[question_id] = [answer_span]

print(answer_span_dict)

{'Q0': [[110, 150]], 'Q3': [[0, 0]], 'Q4': [[58, 78]], 'Q20': [[86, 100]], 'Q33': [[15, 50], [50, 63], [63, 104], [104, 150]], 'Q47': [[0, 0]], 'Q49': [[0, 0]], 'Q54': [[0, 0]], 'Q57': [[0, 0]], 'Q59': [[80, 89]], 'Q64': [[117, 156]], 'Q68': [[0, 0]], 'Q69': [[0, 0]], 'Q72': [[0, 0]], 'Q79': [[0, 0]], 'Q81': [[0, 0]], 'Q86': [[0, 0]], 'Q88': [[0, 0]], 'Q91': [[0, 0]], 'Q102': [[0, 36], [36, 73]], 'Q105': [[10, 27]], 'Q113': [[207, 231]], 'Q115': [[0, 0]], 'Q117': [[85, 97]], 'Q123': [[0, 0]], 'Q131': [[0, 0]], 'Q132': [[32, 61], [61, 70]], 'Q144': [[25, 45]], 'Q146': [[47, 62]], 'Q147': [[1, 32], [42, 55]], 'Q149': [[0, 0]], 'Q152': [[0, 0]], 'Q153': [[0, 0]], 'Q157': [[0, 0]], 'Q165': [[12, 54]], 'Q182': [[0, 0]], 'Q186': [[0, 0]], 'Q194': [[0, 0]], 'Q196': [[0, 0]], 'Q198': [[0, 0]], 'Q202': [[0, 0]], 'Q205': [[0, 0]], 'Q216': [[0, 0]], 'Q221': [[0, 17], [17, 57]], 'Q222': [[0, 0]], 'Q224': [[0, 0]], 'Q232': [[0, 0]], 'Q236': [[0, 0]], 'Q242': [[0, 28], [28, 68], [68, 95]], 'Q243': [

In [154]:
def valid(model, valid_dataset):
    '''
    Performs validation.
    '''
    
    print("Starting validation .........")
   
    valid_loss = 0.

    batch_count = 0
    
    # puts the model in eval mode. Turns off dropout
    model.eval()
    
    predictions = {}
    
    for batch in valid_dataset:

        if batch_count % 500 == 0:
            print(f"Starting batch {batch_count}")
        batch_count += 1

        context, question, context_mask, question_mask, answer_span, context_text, ids = batch
        # context, question, context_mask, question_mask, label, context_text, answers, ids = batch

        context, context_mask, question, question_mask, answer_span = context.to(device), context_mask.to(device),\
                                    question.to(device), question_mask.to(device), answer_span.to(device)

        with torch.no_grad():

            preds = model(context, question, context_mask, question_mask)

            p1, p2 = preds

            y1, y2 = answer_span[:,0], answer_span[:,1]
            # print(y1, y2)

            # cross_entropy function combines the softmax operation with the computation of the cross-entropy loss in a single step
            loss = F.cross_entropy(p1, y1) + F.cross_entropy(p2, y2)

            valid_loss += loss.item()

            
            # get the start and end index positions from the model preds
            
            batch_size, c_len = p1.size()
            ls = nn.LogSoftmax(dim=1)
            # mask = torch.ones(c_len, c_len).tril(-1).to(device)  # Lower triangular matrix
            # mask = mask.fill_(float('-inf'))  # Set all elements to negative infinity
            # mask = mask.unsqueeze(0).expand(batch_size, -1, -1)  # Expand mask for each sample in the batch
            mask = (torch.ones(c_len, c_len) * float('-inf')).to(device).tril(-1).unsqueeze(0).expand(batch_size, -1, -1)
            
            # score = (p1.unsqueeze(2) + p2.unsqueeze(1)) + mask
            score = (ls(p1).unsqueeze(2) + ls(p2).unsqueeze(1)) + mask
            score, s_idx = score.max(dim=1)
            score, e_idx = score.max(dim=1)
            # s_scores, s_idx = score.max(dim=1)
            # e_scores, e_idx = score.topk(k=1, dim=1)  # Retrieve the top 1 scores and indices along the second dimension
            # e_idx = e_idx.squeeze(1)
            # print(s_idx, e_idx)
            s_idx = torch.gather(s_idx, 1, e_idx.view(-1, 1)).squeeze()
            # e_idx = torch.gather(e_idx, 1, s_idx.view(-1, 1)).squeeze()
            # print(s_idx, e_idx)
            # stack predictions
            for i in range(batch_size):
                id = ids[i]
                # print(id, [p1, p2])
                # pred = context[i][s_idx[i]:e_idx[i]+1]

                # pred = ' '.join([index2word[idx.item()] for idx in pred])
                predictions[id] = [s_idx[i].item(), e_idx[i].item()]
                # predictions[id] = [p1, p2]
            
    print(predictions)
    precision, recall, f1 = evaluate(predictions, answer_span_dict)            
    return valid_loss/len(valid_dataset), precision, recall, f1
                

In [None]:
def valid(model, valid_dataset):
    '''
    Performs validation.
    '''
    
    print("Starting validation .........")
   
    valid_loss = 0.

    batch_count = 0
    
    f1, em = 0., 0.
    
    # puts the model in eval mode. Turns off dropout
    model.eval()
    
    predictions = {}
    
    for batch in valid_dataset:

        if batch_count % 500 == 0:
            print(f"Starting batch {batch_count}")
        batch_count += 1

        context, question, context_mask, question_mask, answer_span, context_text, ids = batch
        # context, question, context_mask, question_mask, label, context_text, answers, ids = batch

        context, context_mask, question, question_mask, answer_span = context.to(device), context_mask.to(device),\
                                    question.to(device), question_mask.to(device), answer_span.to(device)

        with torch.no_grad():

            preds = model(context, question, context_mask, question_mask)

            p1, p2 = preds

            y1, y2 = answer_span[:,0], answer_span[:,1]

            # cross_entropy function combines the softmax operation with the computation of the cross-entropy loss in a single step
            loss = F.cross_entropy(p1, y1) + F.cross_entropy(p2, y2)

            valid_loss += loss.item()

            
            # get the start and end index positions from the model preds
            
            batch_size, c_len = p1.size()
            ls = nn.LogSoftmax(dim=1)
            # mask = torch.ones(c_len, c_len).tril(-1).to(device)  # Lower triangular matrix
            # mask = mask.fill_(float('-inf'))  # Set all elements to negative infinity
            # mask = mask.unsqueeze(0).expand(batch_size, -1, -1)  # Expand mask for each sample in the batch
            mask = (torch.ones(c_len, c_len) * float('-inf')).to(device).tril(-1).unsqueeze(0).expand(batch_size, -1, -1)
            
            # score = (p1.unsqueeze(2) + p2.unsqueeze(1)) + mask
            score = (ls(p1).unsqueeze(2) + ls(p2).unsqueeze(1)) + mask
            score, s_idx = score.max(dim=1)
            score, e_idx = score.max(dim=1)
            # s_scores, s_idx = score.max(dim=1)
            # e_scores, e_idx = score.topk(k=1, dim=1)  # Retrieve the top 1 scores and indices along the second dimension
            # e_idx = e_idx.squeeze(1)
            print(s_idx, e_idx)
            s_idx = torch.gather(s_idx, 1, e_idx.view(-1, 1)).squeeze()
            # e_idx = torch.gather(e_idx, 1, s_idx.view(-1, 1)).squeeze()
            print(s_idx, e_idx)
            # stack predictions
            for i in range(batch_size):
                id = ids[i]
                # print(id, [p1, p2])
                # pred = context[i][s_idx[i]:e_idx[i]+1]

                # pred = ' '.join([index2word[idx.item()] for idx in pred])
                predictions[id] = [s_idx[i].item(), e_idx[i].item()]
                # predictions[id] = [p1, p2]
            
    print(predictions)
    # em, f1 = evaluate(predictions)            
    # return valid_loss/len(valid_dataset), em, f1
                

# 3.Model Testing
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title) 

In [156]:

train_losses = []
valid_losses = []
ems = []
f1s = []
epochs = 5

for epoch in range(epochs):
    print(f"Epoch {epoch+1}")
    
    start_time = time.time()
    
    train_loss = train(model, train_loader)
    valid_loss, precision, recall, f1 = valid(model, test_loader)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    print(f"Epoch train loss : {train_loss}| Time: {epoch_mins}m {epoch_secs}s")
    print(f"Epoch valid loss: {valid_loss}")
    print(f"Epoch Precision: {precision}")
    print(f"Epoch Recall: {recall}")
    print(f"Epoch F1: {f1}")
    print("====================================================================================")
    

Epoch 1
Starting training ........
Starting batch: 0
Starting validation .........
Starting batch 0
{'Q0': [97, 127], 'Q3': [149, 187], 'Q4': [25, 66], 'Q20': [31, 37], 'Q33': [67, 267], 'Q47': [217, 227], 'Q49': [159, 265], 'Q54': [21, 50], 'Q57': [131, 159], 'Q59': [32, 109], 'Q64': [165, 251], 'Q68': [291, 326], 'Q69': [135, 168], 'Q72': [56, 56], 'Q79': [10, 19], 'Q81': [160, 197], 'Q86': [111, 217], 'Q88': [49, 57], 'Q91': [33, 301], 'Q102': [64, 69], 'Q105': [25, 26], 'Q113': [291, 291], 'Q115': [14, 30], 'Q117': [24, 187], 'Q123': [2, 2], 'Q131': [27, 303], 'Q132': [62, 79], 'Q144': [0, 0], 'Q146': [92, 112], 'Q147': [125, 145], 'Q149': [225, 277], 'Q152': [22, 50], 'Q153': [42, 107], 'Q157': [40, 40], 'Q165': [0, 0], 'Q182': [0, 121], 'Q186': [38, 88], 'Q194': [17, 130], 'Q196': [0, 5], 'Q198': [0, 58], 'Q202': [75, 78], 'Q205': [133, 137], 'Q216': [79, 159], 'Q221': [0, 59], 'Q222': [81, 401], 'Q224': [162, 310], 'Q232': [72, 267], 'Q236': [239, 245], 'Q242': [22, 23], 'Q243':

In [145]:
import os

os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

### 3.1. Input Embedding Ablation Study

(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title) 



### 3.2. Attention Ablation Study
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title) 

### 3.3. Hyper Parameter Testing
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title) 