# 2023 CITS4012 Assignment
*Make sure you change the file name with your student id.*

# Readme
*If there is something to be noted for the marker, please mention here.* 

*If you are planning to implement a program with Object Oriented Programming style, please check the bottom of the this ipynb file*

# 1.DataSet Processing
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title) 

In [1]:
# Necessary Library Imports
import pandas as pd
import numpy as np
import nltk
import re
import torch
nltk.download('punkt')
from nltk import word_tokenize

# Read in data files
test_df = pd.read_csv("WikiQA-test.tsv", sep="\t")
train_df = pd.read_csv("WikiQA-train.tsv", sep="\t")

# TODO: ask about Word Match Feature: check whether the word appears in the question by using decapitalisation or lemmatization - how can we extract that for the question inputs
# TODO: ask about NER tagging 3g and 4g as (3, g) and (4, g)
# TODO: solve if word not in GloVe model

# notes: 

# spacy word tokenizer can be used for the case of tagging 3g and 4g as (3, g) and (4, g), however it is far less
# efficient - the data wrangling function takes about 210 seconds to run instead of 5 seconds when using NLTKs tokenize

# phrases like 1570–1638 get converted to 15701638 after punctuation removal 

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\New\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


**Dictionary for Contraction Removal**

In [2]:
contraction_dict = {"ID": "identify", "Im": "i am", "im": "i am", "Dont": "do not", "dont": "do not", "ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", 
                    "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", 
                    "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", 
                    "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", 
                    "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", 
                    "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", 
                    "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", 
                    "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", 
                    "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have",
                    "o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", 
                    "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", 
                    "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have",
                    "so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", 
                    "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", 
                    "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", 
                    "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have", 
                    "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", 
                    "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", 
                    "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", 
                    "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", 
                    "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",
                    "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have",
                    "diff'rent": "different"}


def replace_contractions(sentence):
    words = sentence.split()
    replaced_words = []
    for word in words:
        if word in contraction_dict:
            replaced_words.append(contraction_dict[word])
        else:
            replaced_words.append(word)
    replaced_sentence = " ".join(replaced_words).lower()
    return replaced_sentence


def clean(string):
    '''
    Removes puncutation and replaces contractions from a string

    string: string to clean

    return: cleaned string
    '''
    clean_string = re.sub(r'[^\w\s\']', '', string)
    clean_string = replace_contractions(clean_string)
    clean_string = re.sub(r'[^\w\s]', '', string)
    return clean_string

**Vocab Handling**

In [3]:
def create_vocab_list(dfs:list):
    '''
    Creates index2word and word2index dictionaries for encoding from inputted dataframes. Vocab list 
    contains words from dataframe sentences and questions.
    
    dfs: list of dataframes from dataset.

    returns: word2index and index2word dictionaries
    '''
    
    words = set(['[PAD]', '[OOV]'])

    for df in dfs:
        unique_sentences = list(df['Sentence'].unique())
        for sentence in unique_sentences:
            tokens = word_tokenize(clean(sentence.lower()))
            for token in tokens:
                words.add(token)

        unique_questions = list(df['Question'].unique())
        for question in unique_questions:
            tokens = word_tokenize(clean(question.lower()))
            for token in tokens:
                words.add(token)

    word2index = {w: i+2 for i, w in enumerate(words)}
    word2index['[PAD]'] = 1  
    word2index['[OOV]'] = 0  

    index2word = {index:word for word,index in word2index.items()}

    return word2index, index2word

word2index, index2word = create_vocab_list([train_df, test_df])

**Input Encoding**

In [4]:
def encode(tokens, word2index):
    '''
    Converts tokens into respective idsby mapping ach words using word2index. 

    tokens: list of tokens
    word2index: word to index mapping

    returns: list of mapped indexes
    '''
    
    ids = [word2index[word] for word in tokens]

    return ids

**Data Wrangling Function**

In [5]:
# Read in data files
test_data = pd.read_csv("WikiQA-test.tsv", sep="\t")
train_data = pd.read_csv("WikiQA-train.tsv", sep="\t")

def wrangle_data(df):
    """Handles data and splits dataframe into question, document and answer set

    df: dataframe to split

    return: question, document, and answer dataframe
    """

    # Initialise variables containing all Questions, Documents and Answers
    questions = []
    documents = []
    answers = {}

    # Get unique questions from the dataframe
    unique_questions = df['Question'].drop_duplicates()

    # Iterate through questions dataframe to form the Questions data
    for row in unique_questions:
        
        # Remove punctuation (except apostrophes for contraction removal)
        cleaned_row = re.sub(r'[^\w\s\']','', row)

        # Replace contracted words
        cleaned_row = replace_contractions(cleaned_row)

        # Tokenise the question
        question_tokens = word_tokenize(cleaned_row.lower())

        # Append the question to the list of questions
        questions.append(question_tokens)

    # Iterate through dataframe and form Documents and Answer data
    for index, row in df.iterrows():

        # Initalise current document pairs list and previous doc number (only for first iteration)
        if index == 0:
            previous_doc_num = row['DocumentID']
            current_doc_pairs = []

        # If the current doc number is different to the previous doc number, then a new  
        # document has been found so append the current document pairs to the list of documents     
        # and reinitilise the current document pairs list
        current_doc_num = row['DocumentID']
        if current_doc_num != previous_doc_num:
            documents.append(current_doc_pairs)
            current_doc_pairs = []
        
        # Replace contracted words
        cleaned_row = replace_contractions(row['Sentence'])

        # Remove punctuation
        cleaned_row = re.sub(r'[^\w\s]','', cleaned_row)

        # Tokenise the current row's sentence
        sent_tokens = word_tokenize(cleaned_row.lower())

        # Assign token types according to 'Label' field (1 means the corresponding sentence
        # is the answer/one of the answers)
        # Token types:
        #   0: token a part of the answer
        #   1: start token of the answer
        #   2: inner token of the answer
        #   3: end token of the answer
        if row['Label']:
            # Answer to question is found
            token_types = np.full(shape=len(sent_tokens), fill_value=2)
            token_types[0] = 1
            token_types[-1] = 3

            # Add the tokenised answer to the answers dictionary. If an answer has already been found.
            # add the tokens to the current answer.
            if row['QuestionID'] in answers:
                answers[row['QuestionID']] += sent_tokens 
            else:
                answers[row['QuestionID']] = sent_tokens
        else:
            # Answer not found 
            token_types = np.zeros(len(sent_tokens), dtype=int)

        # Combine token and corresponding types and append to current document list
        token_type_pairs = list(zip(sent_tokens, token_types))
        current_doc_pairs.append(token_type_pairs)

        # Update previous document number 
        previous_doc_num = current_doc_num

        # If the iterated row is the final row in the dataframe, append the current document
        # to the documents array
        if index == len(df)-1:
            documents.append(current_doc_pairs)

    return questions, documents, answers

train_questions, train_documents, train_answers = wrangle_data(train_data)
test_questions, test_documents, test_answers = wrangle_data(test_data)


In [21]:
def wrangle_data(df):
    """
    Handles data wrangling and returns a dataframe giving each question the corresponding document and
    answer span set. Encodes the words into their index of the vocab list as well.

    df: dataframe to wrangle

    return: dataframe containing questions with the corresponding document and answer span set (encoded).
    """

    # Initialise variable containing Answer Spans (questions can have 0, 1, or multiple answers)
    answer_spans = [[-1,-1]]

    # Intialise df object that will be returned after wrangling
    wrangled_df = pd.DataFrame(columns=['question', 'document', 'answer span'])


    # Iterate through dataframe and form Documents and Answer Span data
    for index, row in df.iterrows():
        # print(index)

        # Initalise current document tokens list, length and previous doc number (only for first iteration)
        if index == 0:
            previous_doc_num = row['DocumentID']
            current_doc_length = 0
            current_doc_tokens = []


        # If the current doc number is different to the previous doc number, then a new  
        # document has been found 
        current_doc_num = row['DocumentID']
        if current_doc_num != previous_doc_num:

            # Remove punctuation from question and replaces contractions
            cleaned_qn = clean(previous_qn)

            # Tokenize the question 
            qn_tokens = word_tokenize(cleaned_qn.lower())
                
            for span in answer_spans:
                # Create a new row with multiple values
                # print(current_doc_tokens)
                new_row = {'question': [encode(qn_tokens, word2index)],
                           'document':[encode(current_doc_tokens, word2index)],
                           'answer span': [span]}
                
                # Convert the new row to a DataFrame
                new_row_df = pd.DataFrame(new_row)

                # Append the new row DataFrame to the original DataFrame
                wrangled_df = pd.concat([wrangled_df, new_row_df], ignore_index=True)

            # Reinitialise document tokens list, current document length, and answer spans
            current_doc_tokens = []
            current_doc_length = 0
            answer_spans = [[-1,-1]]


        # Remove punctuation from document sentence and replace contracted words from document sentence 
        cleaned_sent = clean(row['Sentence'])

        # Tokenise the current row's sentence and append to the document tokens list
        sent_tokens = word_tokenize(cleaned_sent.lower())
        current_doc_tokens.extend(sent_tokens)


        if row['Label']:
            # Answer to question is found

            # Find the span of the answer
            span_start = current_doc_length
            span_end = span_start + len(sent_tokens)

            # Add answer span to current document's answer spans
            if answer_spans[0] == [-1,-1]:
                answer_spans[0] = [span_start, span_end]
            else:
                answer_spans.append([span_start, span_end])
            
            # Increase length of current document
            current_doc_length += len(sent_tokens)

        else:
            # Answer not found 

            # Increase length of current document
            current_doc_length += len(sent_tokens)

        # Update previous document number and question
        previous_doc_num = current_doc_num
        previous_qn = row['Question']

        # If the iterated row is the final row in the dataframe, add the final question and 
        # document to the wrangled df
        if index == len(df)-1:

            # Remove punctuation from question and replaces contractions
            cleaned_qn = clean(previous_qn)

            # Tokenize the question 
            qn_tokens = word_tokenize(cleaned_qn.lower())

            for span in answer_spans:
                # Create a new row with multiple values
                new_row = {'question': [encode(qn_tokens, word2index)],
                           'document': [encode(current_doc_tokens, word2index)],
                           'answer span': [span]}

                # Convert the new row to a DataFrame
                new_row_df = pd.DataFrame(new_row)

                # Append the new row DataFrame to the original DataFrame
                wrangled_df = pd.concat([wrangled_df, new_row_df], ignore_index=True)

    return wrangled_df

wrangled_train_data = wrangle_data(train_df)
wrangled_test_data = wrangle_data(test_df)

In [22]:
wrangled_train_data

Unnamed: 0,question,document,answer span
0,"[7955, 11503, 21430, 15290, 15441]","[33078, 32265, 19296, 21430, 21519, 22055, 331...","[24, 37]"
1,"[7955, 11503, 8429, 2070, 25424, 8429, 38743, ...","[5037, 16844, 8341, 38532, 31937, 33078, 3534,...","[-1, -1]"
2,"[7955, 2101, 21464, 19141, 2786]","[21464, 19141, 31937, 33078, 9836, 27726, 2309...","[-1, -1]"
3,"[7955, 48, 31937, 8429, 5530, 9880, 32465, 16117]","[5037, 8429, 17824, 32552, 8429, 24894, 25424,...","[-1, -1]"
4,"[7955, 33078, 14207, 27060, 35171, 1528, 30185...","[8429, 14207, 7844, 38149, 31937, 33078, 34945...","[-1, -1]"
...,...,...,...
2279,"[7343, 30289, 29136, 34088, 24407]","[29136, 2002, 26593, 29136, 34088, 2002, 31937...","[76, 94]"
2280,"[34816, 31937, 26282, 5560, 3120]","[26282, 3176, 3120, 5037, 8429, 19903, 1998, 2...","[7, 42]"
2281,"[34816, 31937, 26282, 5560, 3120]","[26282, 3176, 3120, 5037, 8429, 19903, 1998, 2...","[42, 70]"
2282,"[34816, 31937, 8429, 10211, 30254, 25424, 6475]","[32558, 3111, 19445, 30254, 30773, 16226, 1289...","[-1, -1]"


**Data Loader**

In [23]:
import spacy
nlp = spacy.load("en_core_web_sm")

class DataLoader:
    '''
    -Divides the dataframe in batches.
    -Pads the contexts and questions dynamically for each batch by padding 
     the examples to the maximum-length sequence in that batch.
    -Calculates masks for context and question.
    -Calculates spans for contexts.
    '''
    
    def __init__(self, data, batch_size):
        
        self.batch_size = batch_size
        data = [data[i:i+self.batch_size] for i in range(0, len(data), self.batch_size)]
        self.data = data
    
    def get_span(self, text):
        
        text = nlp(text, disable=['parser','tagger','ner'])
        span = [(w.idx, w.idx+len(w.text)) for w in text]

        return span

    def __len__(self):
        return len(self.data)
    
    def __iter__(self):
        '''
        Creates batches of data and yields them.
        
        Each yield comprises of:
        :padded_context: padded tensor of contexts for each batch 
        :padded_question: padded tensor of questions for each batch 
        :context_mask & question_mask: zero-mask for question and context
        :label: start and end index wrt context_ids
        :context_text,answer_text: used while validation to calculate metrics
        :context_spans: spans of context text
        :ids: question_ids used in evaluation
        '''
        
        for batch in self.data:

            # print(batch)
                            
            # spans = []
            context_text = []
            # answer_text = []
            
            max_context_len = max([len(document) for document in batch['document']])
            padded_context = torch.LongTensor(len(batch), max_context_len).fill_(1)
            # print(max_context_len)
            
            for ctx in batch['document']:
                context_text.append(ctx)
                # spans.append(self.get_span(ctx))
            
            # for ans in batch.answer:
            #     answer_text.append(ans)
                
            for i, ctx in enumerate(batch['document']):
                # print(i, len(ctx), ctx)
                padded_context[i, :len(ctx)] = torch.LongTensor(ctx)

            print(padded_context)
            
            max_question_len = max([len(ques) for ques in batch['question']])
            padded_question = torch.LongTensor(len(batch), max_question_len).fill_(1)
            
            for i, ques in enumerate(batch['question']):
                padded_question[i,: len(ques)] = torch.LongTensor(ques)
                
            
            label = torch.LongTensor(list(batch['answer span']))
            context_mask = torch.eq(padded_context, 1)
            question_mask = torch.eq(padded_question, 1)
            
            # ids = list(batch.id)  
            yield (padded_context, padded_question, context_mask, 
                   question_mask, label, context_text)
            
            # yield (padded_context, padded_question, context_mask, 
            #        question_mask, label, context_text, answer_text, ids)

In [24]:
train_loader = DataLoader(wrangled_train_data, 32)

In [25]:
a = next(iter(train_loader))

tensor([[33078, 32265, 19296,  ...,     1,     1,     1],
        [ 5037, 16844,  8341,  ...,     1,     1,     1],
        [21464, 19141, 31937,  ...,     1,     1,     1],
        ...,
        [27121, 13886, 30246,  ...,     1,     1,     1],
        [ 8429, 38416, 26180,  ...,     1,     1,     1],
        [12724, 17543, 31937,  ...,     1,     1,     1]])


In [26]:
a[0].shape, a[1].shape, a[2].shape, a[3].shape, a[4].shape

(torch.Size([32, 579]),
 torch.Size([32, 15]),
 torch.Size([32, 579]),
 torch.Size([32, 15]),
 torch.Size([32, 2]))

In [27]:
print(a[2][0])

tensor([False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True, 

In [5]:
import spacy
nlp = spacy.load("en_core_web_sm")

total = 0
for qn in train_questions:
    total += len(qn)
print(total)
train_questions

docs = []
total_2 = 0

for sentence in test_questions:
    text = " ".join(sentence)
    doc = nlp(text)
    docs.append(doc)

for i in range(len(test_questions)):
    # for x in doc:
    #     total_2 += 1
    #     print(total_2, x)
    if len(test_questions[i]) != len(docs[i]):
        print(test_questions[i], len(test_questions[i]))
        print([x for x in docs[i]], len(docs[i]))
    # total_2 += len(train_questions[i])
    # print(total_2)

14724
['what', 'band', 'will', 'att', '4g', 'network', 'be', 'on'] 8
[what, band, will, att, 4, g, network, be, on] 9


In [28]:
# for debugging - find all labels equal to 1
train_data.loc[train_data['Label'] == 1]

Unnamed: 0,QuestionID,Question,DocumentID,DocumentTitle,SentenceID,Sentence,Label
3,Q1,how are glacier caves formed?,D1,Glacier cave,D1-3,A glacier cave is a cave formed within the ice...,1
75,Q16,how much is 1 tablespoon of water,D16,Tablespoon,D16-0,This tablespoon has a capacity of about 15 mL.,1
83,Q16,how much is 1 tablespoon of water,D16,Tablespoon,D16-8,In the USA one tablespoon (measurement unit) i...,1
84,Q16,how much is 1 tablespoon of water,D16,Tablespoon,D16-9,In Australia one tablespoon (measurement unit)...,1
98,Q17,how much are the harry potter movies worth,D17,Harry Potter,D17-13,The series also originated much tie-in merchan...,1
...,...,...,...,...,...,...,...
20292,Q3037,What is an economic feature?,D2802,Economics,D2802-9,"At the turn of the 21st century, the expanding...",1
20307,Q3039,what is the average american income,D1876,Household income in the United States,D1876-6,"U.S. median household income fell from $51,144...",1
20325,Q3042,When was Apple Computer founded,D2806,Apple Inc.,D2806-3,"The company was founded on April 1, 1976, and ...",1
20335,Q3043,what is section eight housing,D2807,Section 8 (housing),D2807-1,"Section 8 of the Housing Act of 1937 (), often...",1


# 2.QA Model Implementation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title) 

**GloVe/Word2VeC word embedding**

In [29]:
import gensim.downloader as api

# model = api.load("word2vec-google-news-300")
embed_model = api.load("glove-wiki-gigaword-50")

def create_word_embedding(model):
    '''
    Creates a weight matrix of the words that are common in the GloVe vocab and
    the dataset's vocab. Initializes OOV words with a zero vector.
    '''
    weights_matrix = np.zeros((len(word2index), 50))
    words_found = 0
    for i, word in enumerate(word2index.keys()):
        print(i, word)
        try:
            weights_matrix[i] = model[word]
            words_found += 1
        except:
            pass
    return weights_matrix, words_found

In [31]:
weights_matrix, words_found = create_word_embedding(embed_model)

0 15701638
1 sheltie
2 shrubs
3 hornworts
4 diacetate
5 prosecute
6 misunderstandings
7 arcadestyle
8 inferiorly
9 bits
10 origin
11 barnard
12 ytv
13 7133136
14 sandlers
15 combe
16 urolophidae
17 xr
18 commanders
19 solas
20 warkuss
21 prelude
22 stranger
23 vate
24 arlene
25 thrombocytosis
26 montreal
27 calculations
28 1140
29 nonsense
30 ipity
31 premisses
32 comparatively
33 cristoforo
34 shrine
35 hauser
36 coastline
37 architects
38 qualifying
39 spence
40 chaitins
41 captives
42 pad
43 matthean
44 cadre
45 usace
46 long
47 plums
48 privateindustry
49 interpersonal
50 ballads
51 dramatizes
52 approximating
53 abstract
54 degrade
55 stimulus
56 con
57 pays
58 worker
59 us588
60 overexercising
61 unitingcare
62 persuade
63 proposed
64 voiceover
65 lending
66 445
67 grouped
68 trotskyists
69 ashcroft
70 grainger
71 assess
72 barrelhouse
73 rbi
74 harbored
75 hédi
76 receptions
77 vacuoles
78 medication
79 kuch
80 brigades
81 microcalorimeters
82 molluscs
83 consumed
84 inspected
8

In [16]:
word2index['7133136']

15

In [20]:
print(weights_matrix[0])

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0.]


In [157]:
len(word2index)

39550

In [17]:
embed_model['7133136']

KeyError: "Key '7133136' not present"

**TFIDF (Term Frequency Inverse Document Frequency)**

In [12]:
def calculate_df(tokenized_data):
    """ Calculate term frequencies for a given list of list of tokens

    tokenized_data: tokenized dataset

    return: dictionary of frequencies for each term in the dataset
    """
    DF = {}

    for token_vector in tokenized_data:
        # get each unique word in the doc and count the number of occurences in the document
        for term in np.unique(token_vector):
            try:
                DF[term] += 1
            except:
                DF[term] = 1

    # print scores in descending order
    sorted_dict = sorted(DF.items(), key = lambda x : x[1], reverse = True)
    #print(sorted_dict)
    
    return DF

In [13]:
# calculate TFIDF scores

from collections import Counter
import math

def calculate_tf_idf(tokenized_data):

    tf_idf = {}

    # total number of documents
    n = len(tokenized_data)

    # calculate Document Frequencies
    DF = calculate_df(tokenized_data)

    doc_id = 0
    # get each token vector
    for token_vector in tokenized_data:
        # initialise counter for the vector
        counter = Counter(token_vector)
        #calculate total number of words in the doc
        total_num_words = len(token_vector)

        # get each unique word in the doc
        for term in np.unique(token_vector):

            # calculate Term Frequency
            tf = counter[term]/total_num_words

            # calculate Document Frequency
            df = DF[term]

            # calculate Inverse Document Frequency
            idf = math.log(n/(df+1))+1

            # calcaulte TF-IDF
            tf_idf[doc_id, term] = tf*idf

        doc_id += 1

    return tf_idf

tf_idfs = calculate_tf_idf(train_questions)
tf_idfs


{(0, 'are'): 0.643935192442334,
 (0, 'caves'): 1.5929216181149841,
 (0, 'formed'): 1.315662745891006,
 (0, 'glacier'): 1.5929216181149841,
 (0, 'how'): 0.531268078634743,
 (1, 'a'): 0.21464506414744466,
 (1, 'and'): 0.29300502627290326,
 (1, 'are'): 0.21464506414744466,
 (1, 'circular'): 0.5309738727049946,
 (1, 'directions'): 0.5309738727049946,
 (1, 'force'): 0.4474563414719701,
 (1, 'how'): 0.17708935954491434,
 (1, 'in'): 0.17529845006027778,
 (1, 'motion'): 0.484764060667665,
 (1, 'of'): 0.18878367758362766,
 (1, 'related'): 0.484764060667665,
 (1, 'the'): 0.2693996319800544,
 (1, 'vectors'): 0.5309738727049946,
 (1, 'velocity'): 0.5309738727049946,
 (2, 'apollo'): 1.5929216181149841,
 (2, 'creed'): 1.5929216181149841,
 (2, 'did'): 0.6985938594422703,
 (2, 'die'): 1.11334256355531,
 (2, 'how'): 0.531268078634743,
 (3, 'federal'): 0.838980640259944,
 (3, 'for'): 0.484828972252917,
 (3, 'how'): 0.3320425491467144,
 (3, 'is'): 0.2446280201983818,
 (3, 'judges'): 0.995576011321865,
 (

**POS Tagging**

In [14]:
from nltk.tag import pos_tag_sents
nltk.download('averaged_perceptron_tagger')


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\New\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [15]:
total = 0
for sent in pos_tag_sents(train_questions):
    total += len(sent)

print(total)
pos_tag_sents(train_questions)

14724


[[('how', 'WRB'),
  ('are', 'VBP'),
  ('glacier', 'JJ'),
  ('caves', 'NNS'),
  ('formed', 'VBD')],
 [('how', 'WRB'),
  ('are', 'VBP'),
  ('the', 'DT'),
  ('directions', 'NNS'),
  ('of', 'IN'),
  ('the', 'DT'),
  ('velocity', 'NN'),
  ('and', 'CC'),
  ('force', 'NN'),
  ('vectors', 'NNS'),
  ('related', 'VBN'),
  ('in', 'IN'),
  ('a', 'DT'),
  ('circular', 'JJ'),
  ('motion', 'NN')],
 [('how', 'WRB'),
  ('did', 'VBD'),
  ('apollo', 'VB'),
  ('creed', 'NN'),
  ('die', 'NN')],
 [('how', 'WRB'),
  ('long', 'JJ'),
  ('is', 'VBZ'),
  ('the', 'DT'),
  ('term', 'NN'),
  ('for', 'IN'),
  ('federal', 'JJ'),
  ('judges', 'NNS')],
 [('how', 'WRB'),
  ('a', 'DT'),
  ('beretta', 'NN'),
  ('model', 'NN'),
  ('21', 'CD'),
  ('pistols', 'NNS'),
  ('magazines', 'NNS'),
  ('works', 'NNS')],
 [('how', 'WRB'), ('a', 'DT'), ('vul', 'NN'), ('works', 'NN')],
 [('how', 'WRB'),
  ('an', 'DT'),
  ('outdoor', 'JJ'),
  ('wood', 'NN'),
  ('boiler', 'NN'),
  ('works', 'NNS')],
 [('how', 'WRB'),
  ('big', 'JJ'),
  ('

In [16]:
from nltk.tag import pos_tag
pos_tag(['how'])[0][-1]

'WRB'

**Named Entity Recognition**

In [17]:
# ! python -m spacy validate
import spacy
nlp = spacy.load("en_core_web_sm")

# article = nlp(list(train_data['Question']))
# print(article)

In [18]:
def ner_tag_sents(data):

    docs = []
    total = 0
    total_2 = 0

    for sentence in data:
        text = " ".join(sentence)
        doc = nlp(text)
        docs.append(doc)

    # for doc in docs:
    #     for entity in doc.ents:
    #         print(entity.text, entity.label_)

    # print(docs)

    # for doc in docs:
    #     print([(x, x.ent_iob_, x.ent_type_) for x in doc])
    #     total += 1
    #     print(total)
    #     total_2 += len(doc)
    #     print(total_2)


ner_tag_sents(train_questions)
# print(len(ner_tags))
# for sent in ner_tags:
#     print(sent)
#     total +=1
#     print(total)

**Concatenate Word Embeddings and Features**

In [33]:
def concat_inputs(data):

    embeddings = []

    for index, sentence in enumerate(data):
        sent_embedding = []

        # Iterate through tokens
        for token in sentence:
            token_embedding = []

            # If the word is in the word embedding model's vocabulary then append its embedding,
            # otherwise an OOV vector of [0.0] * embedding dimension is appended (for simplicity)
            if token in embed_model.key_to_index:
                token_embedding.append([embed_model[token]])
            else:
                token_embedding.append(np.zeros(50))

            # Look up and append tf_idf score to the embedding using the question number and word as the key 
            token_embedding.append(tf_idfs[(index, token)])

            # Append the pos tag to the input embedding
            token_embedding.append(pos_tag([token])[0][-1])

            # Append the token embedding to the sentence embedding
            sent_embedding.append(token_embedding)

        # Append the sentence embedding to the list of all embeddings
        embeddings.append(sent_embedding)
    
    return np.array(embeddings, dtype=object)

In [35]:
train_qn_inputs = concat_inputs(train_questions)
print(train_qn_inputs[0][0])

[[array([ 6.8938e-01, -1.0644e-01,  1.7083e-01, -3.7583e-01,  7.5170e-01,
        7.8149e-04, -5.3102e-01, -1.9903e-01, -1.4419e-01,  1.2748e-01,
       -2.8038e-01,  7.0723e-01, -5.4100e-01,  1.9625e-01,  9.6635e-01,
        6.0519e-01,  4.0918e-01, -3.1612e-02,  5.3900e-01, -8.7086e-01,
       -2.0912e-01,  5.6853e-01,  6.5983e-01,  1.4583e-01,  1.0112e+00,
       -2.0736e+00, -1.1242e+00,  5.9662e-04,  7.0332e-01, -8.2608e-01,
        3.4445e+00,  3.2984e-01, -3.5324e-01, -1.0335e+00, -1.4753e-01,
       -1.4874e-01, -4.1246e-01,  3.3489e-01,  1.9841e-01, -2.5478e-01,
       -4.7193e-01,  6.6701e-02,  3.2777e-01,  6.8781e-01,  3.6428e-01,
        2.1522e-01,  1.6494e-01,  4.1761e-01, -2.2504e-01,  6.1412e-01],
      dtype=float32)], 0.531268078634743, 'WRB']


In [32]:
train_doc_inputs = concat_inputs(train_documents)
print(train_doc_inputs[0])

TypeError: unhashable type: 'list'

# 3.Model Testing
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title) 

In [None]:
#
#

###3.1. Input Embedding Ablation Study

(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title) 



###3.2. Attention Ablation Study
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title) 

###3.3. Hyper Parameter Testing
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title) 