# 2023 CITS4012 Assignment

# Readme

**Installing Packages**

Some basic package installations are needed for some modules to be imported.
### List of packages used (version):
nltk == 3.8.1  
re == 2.2.1  
torch == 2.0.1+cu118  
numpy == 1.22.4  
pandas == 1.5.3  
en_core_web_sm == 3.5.0

**File Locations**

The default file locations are given as './Data/WikiQA-train.tsv' and './Data/WikiQA-test.tsv'. Need to make the necessary change in the file locations as per the requirements.

##### Installing Packages

In [1]:
# Installing spacy for nltk
!pip install nltk

# Installing spacy for Named Entity Tagging
!pip install spacy

# Downloading the pre-trained NLP Model for Named Entity Tagging
!python -m spacy download en_core_web_sm

Traceback (most recent call last):
  File "/Users/naufaln/opt/miniconda3/envs/cits5508-2022/lib/python3.8/runpy.py", line 185, in _run_module_as_main
    mod_name, mod_spec, code = _get_module_details(mod_name, _Error)
  File "/Users/naufaln/opt/miniconda3/envs/cits5508-2022/lib/python3.8/runpy.py", line 144, in _get_module_details
    return _get_module_details(pkg_main_name, error)
  File "/Users/naufaln/opt/miniconda3/envs/cits5508-2022/lib/python3.8/runpy.py", line 111, in _get_module_details
    __import__(pkg_name)
  File "/Users/naufaln/opt/miniconda3/envs/cits5508-2022/lib/python3.8/site-packages/spacy/__init__.py", line 6, in <module>
  File "/Users/naufaln/opt/miniconda3/envs/cits5508-2022/lib/python3.8/site-packages/spacy/errors.py", line 2, in <module>
    from .compat import Literal
  File "/Users/naufaln/opt/miniconda3/envs/cits5508-2022/lib/python3.8/site-packages/spacy/compat.py", line 3, in <module>
    from thinc.util import copy_array
  File "/Users/naufaln/opt/minic

In [2]:
# To overrie the error while installing en_core_web_sm
import os

os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'

In [3]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /Users/naufaln/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/naufaln/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /Users/naufaln/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/naufaln/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

#### Importing Libraries

In [4]:
import re
import time
import math
import torch

import numpy as np
import pandas as pd

# For data processing
import nltk
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer

# For Named Enity Tagging
import spacy
import en_core_web_sm
from collections import Counter

# For Modelling
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from numpy.linalg import norm

from sklearn.metrics import classification_report

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)



In [5]:
# Enabling GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Functions

#### Function to format the Data Frame

In [6]:
def shrinkColumns(df):
    '''Function to convert the given data to the required format'''
    new_df = pd.DataFrame(columns=['QuestionID', 'Question', 'Document', 'Answer'])

    for qid in df['QuestionID'].unique():
        # Get the first question associated with this QuestionID
        first_question = df.loc[df['QuestionID'] == qid, 'Question'].iloc[0]
        
        # Get all sentences associated with this QuestionID
        sentences = df.loc[df['QuestionID'] == qid, 'Sentence']
        concatenated_sentence = ' '.join(sentences)
        
        # Get the sentence associated with this QuestionID where the Label is 1
        answer = df.loc[(df['QuestionID'] == qid) & (df['Label'] == 1), 'Sentence']

        if not answer.empty:
            answer = answer.iloc[0]
        else:
            answer = ""
        
        # Add the QuestionID, first_question, concatenated_sentence, and answer to the new dataframe
        new_row = {'QuestionID': qid, 'Question': first_question, 'Document': concatenated_sentence, 'Answer': answer}
        new_df = pd.concat([new_df, pd.DataFrame([new_row])], ignore_index=True)

    return new_df

#### Function for labelling the document tokens

In [7]:
def generateLabels(padded_document, answer, len_org_document):
    '''The Function generates three labels: [Answer], [Not Answer] and [PAD], depending on the position of
        the answer in the document'''
    labels = ["[Not Answer]" for i in range(len(padded_document))]

    # Generationg the labels for all the documents with non emply answers
    if answer != "":
        start_index = [i for i in range(len(padded_document)-len(answer)+1) if padded_document[i:i+len(answer)] == answer]
        if start_index:
            start_index = start_index[0]
            end_index = start_index + len(answer)
            labels[start_index] = '[Answer]'
            for j in range(start_index+1, end_index):
                labels[j] = '[Answer]'
            labels[end_index-1] = '[Answer]'
            
    # labelling the padding
    for i in range(len_org_document, len(padded_document)):
        labels[i] = "[Pad]"
    return labels

#### Function for tokenising a sentance

In [8]:
def tokenize(sentance):
    '''Function for tokenising a sentance'''
    sent_text=[]
    normalized_text = []
    sent_text.extend(word_tokenize(sentance))
    
    # Removing punctuation and changing all characters to lower case
    for string in sent_text:
        tokens = re.sub(r"[^a-z0-9.]+", '', string.lower())
        normalized_text.append(tokens)

    final_text = []
    for text in normalized_text:
        if text != '':
            final_text.append(text)

    return final_text

#### Function for Tokenising a list of sentances

In [9]:
def tokenizeList(sequences):
    '''Function for tokenising a list of sentance or a document'''
    tokenized_list = []
    for seq in sequences:
        tokenized_list.append(tokenize(seq))

    return tokenized_list

#### Function for word embedding a sentance (Using Word2Vec - Skip Gram Model)

In [10]:
def word2Vec(sentance):
    '''Function to convert a sentance from words to vector (of size 50), using Skip Gram method'''
    wv_sg_model = Word2Vec([sentance], vector_size=50, window=3, min_count=1, workers=2, sg=1)

    word_2_vec = list()

    # Vectorising each word of the sentance
    for word in sentance:
        word_2_vec.append(wv_sg_model.wv[word])
    return word_2_vec


#### Function for word embedding a document

In [11]:
def word2VecDocuments(document):
    '''Function to vectorise the words of sentances in a document'''
    word_2_vec = list()
    for sentance in document:
        word_2_vec.append(word2Vec(sentance))
    return word_2_vec


#### Function to add padding to the sequences

In [12]:
def padSequences(sequences, length):
    '''Function for padding the sequences to the length supplied'''
    padded_sequences = list()

    for seq in sequences:
        num_padding = length - len(seq)
        padded_seq = seq + ['[PAD]'] * num_padding
        padded_sequences.append(padded_seq)
    
    return padded_sequences

#### Function to find the TF-IDF values

In [13]:
def tfIdf(tokens, length):
    '''Function for finding the TF-IDF Values of a word in the document'''
    tf_idf_list = list()
    DF = {}
    for term in np.unique(tokens):
        try:
            DF[term] +=1
        except:
            DF[term] =1

    tf_idf = []
    N = len(tokens) 
    doc_id = 0
    counter = Counter(tokens)
    total_num_words = len(tokens) 

    # Calculationg the TF-IDF value for the words
    for term in tokens[0:length]:
        tf = counter[term]/total_num_words
        df = DF[term]
        idf = math.log(N/(df+1))+1
        tf_idf.append(tf*idf)

    # Giving the TF-IDF values for padding as '0'
    for term in range(length,len(tokens)):
        tf_idf.append(0)

    doc_id += 1
    tf_idf_list.append(tf_idf)

    return tf_idf_list[0]

#### Function to get POS tags

In [14]:
def posTagging(tokens, length):
    '''Function for calculating the POS tags of words in the documents'''
    tagged_words = pos_tag(tokens[0:length])
    _, tags_list = zip(*tagged_words)
    tags_list = list(tags_list)

    # Tagging [PAD] as [PAD]
    for i in range(length,len(tokens)):
        tags_list.append('[PAD]')
    return tags_list

#### Function to find the Named Entity Tags

In [15]:
def nerTagging(document):
    '''Function for calculating the Named Entity tags for Words in the document '''
    NE_Tag_table = []
    tokens = []
    
    # loading pre-trained model of NER
    entity_tagging_model = en_core_web_sm.load()

    article = entity_tagging_model(document)
    sentences = [x for x in article.sents]
    for sentence in sentences:
        for word in sentence:
            NE_Tag_table.append(str(word.ent_type_))
            tokens.append(str(word).lower())
    for i in range(len(NE_Tag_table)):
        if(NE_Tag_table[i] == ''):
            NE_Tag_table[i] = "O"

    return tokens, NE_Tag_table

#### Function to get the wordnet POS tag and convert to use with lemmatizer

In [16]:
def getWordnetPos(tags):
    '''Function to get the Wordnet POS tags which is used to lemmettize the words'''
    if tags.startswith('J'):
        return 'a'  # Adjective
    elif tags.startswith('V'):
        return 'v'  # Verb
    elif tags.startswith('N'):
        return 'n'  # Noun
    elif tags.startswith('R'):
        return 'r'  # Adverb
    else:
        return 'n'

#### Function to Lemmattize the words using the POS tags

In [17]:
def lemmatization(tokens, tags):
    '''Function to lemmettize the words considering the POS tags'''
    lemmatizer = WordNetLemmatizer()
    lemmitized = [lemmatizer.lemmatize(tokens[ind], pos=getWordnetPos(tags[ind])) for ind in range(len(tokens))]  
    return lemmitized

#### Function to Preprocess the Questions list

In [18]:
def questionPreprocess(question_list):
    '''Function to pre-process the questions'''
    question_tokens = tokenizeList(question_list)

    MAX_LENGTH = max([len(s) for s in question_tokens])
    
    #length of 97% of the data
    length = round(0.7*MAX_LENGTH)

    question_tokens_padded = padSequences(question_tokens, MAX_LENGTH)
    embedded_question_list = word2VecDocuments(question_tokens_padded)
    question_batch_torch = torch.from_numpy(np.array(embedded_question_list)).float().to(device)
    return question_batch_torch, MAX_LENGTH


In [19]:
def getNERTags(document_list):
    '''Function to find the NER tags of the document'''
    document_tokens = []
    NER_tags = []

    for document in document_list:
        tokens, tags = nerTagging(document)
        document_tokens.append(tokens)
        NER_tags.append(tags)

    MAX_LENGTH = max([len(s) for s in document_tokens])

    # NER Tags for the padding
    for ind in range(len(document_list)):
        tag_len = len(NER_tags[ind])
        for i in range(tag_len,MAX_LENGTH):
            NER_tags[ind].append("[PAD]")

    return document_tokens, NER_tags
    

In [20]:
def documentPreprocessing(document_list, option):
    '''Function to pre-process the list of documents and return a torch vector'''
    
    document_tokens, ner_tags = getNERTags(document_list)

    MAX_LENGTH = max([len(s) for s in document_tokens])

    # padding documents with maximum length
    document_tokens_padded = padSequences(document_tokens, MAX_LENGTH)

    # Word Embeddings
    pos_tags = []
    lem_document_tokens = []
    tf_idf = []
    for ind in range(len(document_tokens_padded)):
        len_org_document = len(document_tokens[ind])
        # POS tagging
        tags = posTagging(document_tokens_padded[ind],len_org_document)
        pos_tags.append(tags)

        # Lemmatization
        lem_document_tokens.append(lemmatization(document_tokens_padded[ind],tags))

        # TF-IDF
        tf_idf.append(tfIdf(lem_document_tokens[ind],len_org_document))

    # Word to Vector
    embedded_document_list = word2VecDocuments(lem_document_tokens)
    embedded_pos_tags = word2VecDocuments(pos_tags)
    embedded_NER_tags = word2VecDocuments(ner_tags)

    document_vector = []
    if(option == 1):
        # Just Word2Vec
        for i in range(len(embedded_document_list)):
            embedded_document = embedded_document_list[i]
            
            token_vector =  []
            for j in range(len(embedded_document)):
                vector = []
                vector.extend(embedded_document[j])
                token_vector.append(np.array(vector))
            document_vector.append(token_vector)
    elif(option == 2):
        # word2Vec, pos tags
        for i in range(len(embedded_document_list)):
            embedded_document = embedded_document_list[i]
            embedded_pos = embedded_pos_tags[i]
            
            token_vector =  []
            for j in range(len(embedded_document)):
                vector = []
                vector.extend(embedded_document[j])
                vector.extend(embedded_pos[j])
                token_vector.append(np.array(vector))
            document_vector.append(token_vector)
    elif(option == 3):
        # word2Vec, pos tagging, NER tagging and TF-IDF
        for i in range(len(embedded_document_list)):
            embedded_document = embedded_document_list[i]
            embedded_pos = embedded_pos_tags[i]
            embedded_NER = embedded_NER_tags[i]
            tfidf = tf_idf[i]
            
            token_vector =  []
            for j in range(len(embedded_document)):
                vector = []
                vector.extend(embedded_document[j])
                vector.extend(embedded_pos[j])
                vector.extend(embedded_NER[j])
                vector.append(tfidf[j])
                token_vector.append(np.array(vector))
            document_vector.append(token_vector)

    document_vector_torch = torch.from_numpy(np.array(document_vector)).float().to(device)
    return document_tokens_padded, document_vector_torch, MAX_LENGTH

##### Helper functions for displaying the time

In [21]:
def asMinutes(s):
    '''Function for converting the seconds into minutes'''
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

def timeSince(since, percent):
    '''Sunction to find the estimated and the remaining time'''
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

# 1.DataSet Processing

##### Importing Data Sets

In [22]:
# Reading the data from the local disk
training_data = pd.read_csv('./Data/WikiQA-train.tsv', sep='\t')
test_data = pd.read_csv('./Data/WikiQA-test.tsv', sep='\t')

##### Formatting the Datasets

In [23]:
# Formatting the data as per the requirments
formatted_training_data = shrinkColumns(training_data)
formatted_test_data = shrinkColumns(test_data)

##### Question Pre-processing

In [24]:
# Getting the question list from the main dataframe
question_list = formatted_training_data["Question"]

# Converting the question list to torch vectors
question_vector_torch, MAX_LEN_Q = questionPreprocess(question_list)

##### Document Pre-processing

In [25]:
# Getting the document list from the main dataframe
document_list = formatted_training_data["Document"]

# document embedding - option values
# 1 - only word embedding
# 2 - word embedding and pos tagging
# 3 - word embedding, pos tagging, NER tagging and TF-IDF
option = 3
padded_document_tokens, document_vector_torch, MAX_LEN_D = documentPreprocessing(document_list, option)

##### Label Pre-processing

In [26]:
# Getting the list of answers from the dataframe
answer_list = formatted_training_data["Answer"]
answer_tokens = tokenizeList(answer_list)

# Generating Labels
document_labels = list()
for ind in range(len(padded_document_tokens)):
    len_org_document = len(padded_document_tokens[ind])
    document_labels.append(generateLabels(padded_document_tokens[ind], answer_tokens[ind],len_org_document))

# Word to vector
embedded_document_labels = word2VecDocuments(document_labels)

# To torch vectors
label_vector_torch = torch.from_numpy(np.array(embedded_document_labels)).float().to(device)


# 2.QA Model Implementation

### Question Summary Model

In [27]:
class question_Summary(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(question_Summary, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size

        # Bi-directional RNN
        self.rnn = nn.RNN(self.input_size, self.hidden_size, batch_first =True, bidirectional=True)

    def forward(self, input):     
        _, h_n = self.rnn(input)
        # concat the last hidden state from two direction
        hidden_out = torch.cat((h_n[0:],h_n[1:]),0)
        return hidden_out

### Document Attention Model

In [28]:
class doc_Attention(nn.Module):
    ATTN_TYPE_DOT_PRODUCT = "Dot Product"
    ATTN_TYPE_SCALE_DOT_PRODUCT = "Scale Dot Product" 
    ATTN_TYPE_COSINE_SIMILARITY = "Cosine Similarity"

    def __init__(self, hidden_size, output_size, dropout_p=0.1):
        super(doc_Attention, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, 2*self.hidden_size)
        self.out = nn.Linear(self.hidden_size*4, self.output_size)


    def cal_attention(self, hidden, question_summary, method):
        # Dot Product Attention
        if method == doc_Attention.ATTN_TYPE_DOT_PRODUCT:
            attn_weights = F.softmax(torch.bmm(hidden, question_summary.T.unsqueeze(0)),dim=-1)
            attn_output = torch.bmm(attn_weights, question_summary.unsqueeze(0))
            concat_output = torch.cat((attn_output[0], hidden[0]), 1)
        # Scaled Dot Product Attention
        elif method == doc_Attention.ATTN_TYPE_SCALE_DOT_PRODUCT:
            attn_weights = F.softmax(torch.bmm(hidden, question_summary.T.unsqueeze(0)), dim=-1)
            attn_output = torch.bmm(attn_weights, question_summary.unsqueeze(0))
            concat_output = torch.cat((attn_output[0].detach(), hidden[0].detach()), 1) / np.sqrt(hidden_size)
        # Cosine Similarity Attention
        elif method == doc_Attention.ATTN_TYPE_COSINE_SIMILARITY:
            attn_weights = F.softmax(torch.bmm(hidden, question_summary.T.unsqueeze(0)), dim=-1)
            attn_output = torch.bmm(attn_weights, question_summary.unsqueeze(0))
            concat_output = torch.cat((attn_output[0].detach(), hidden[0].detach()), 1) / (norm(attn_output[0].detach())*norm(hidden[0].detach()))


        return concat_output

    def forward(self, input, hidden, question_summary, method ):
        _, hidden = self.gru(input, hidden)

        concat_output = self.cal_attention(hidden, question_summary, method)

        output = F.softmax(self.out(concat_output), dim=1)
        return output, hidden

    def initHidden(self):
        return torch.zeros(MAX_LEN_D, 1, self.hidden_size, device=device)

#### Train Function

In [34]:
def train(input_question_tensor, input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, method = doc_Attention.ATTN_TYPE_DOT_PRODUCT):
    
    input_length = input_question_tensor.size(0)
    target_length = target_tensor.size(0)

    hidden_size = input_tensor.shape[2]

    question_summary = torch.zeros(MAX_LEN_Q, hidden_size*2, device=device)

    loss = 0    
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    for i in range(input_length):
        encoder_hidden = encoder(input_question_tensor[i])
        question_summary[i] = encoder_hidden[0,0]

    # it is for storing the hidden states of input sequence later, which will be used for calculating the attention during the decoding process
    decoder_hidden = torch.zeros(1, MAX_LEN_D, hidden_size*2, device=device)

    decoder_input = input_tensor

    # Teacher forcing: Feed the target as the next input
    for i in range(target_length):
        decoder_output, _ = decoder(decoder_input, decoder_hidden, question_summary, method)
        target = target_tensor[i]
        for j in range(len(target_tensor[i])):
            loss += criterion(decoder_output[j], target[j]) 

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [30]:
import random
def trainIters(encoder, decoder, input_tensor, question_tensor, label_tensor, n_iters, method , print_every=200, plot_every=200, learning_rate=0.002):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)

    criterion = nn.CrossEntropyLoss()

    input_question_tensor = question_tensor
    input_tensor = input_tensor
    target_tensor = label_tensor

    for iter in range(1, n_iters + 1):
        loss = train(input_question_tensor, input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, method)
        print_loss_total += loss
        plot_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                         iter, iter / n_iters * 100, print_loss_avg))

        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

##### Parameter Definitions

In [31]:
# Attention - option values
# 1 - Dot Product
# 2 - Scalar Dot Product
# 3 - Scalar Dot Product
attention_option = 1

if(attention_option == 1):
    attn_method = doc_Attention.ATTN_TYPE_DOT_PRODUCT
elif(attention_option == 2):
    attn_method = doc_Attention.ATTN_TYPE_SCALE_DOT_PRODUCT
elif(attention_option == 3):
    attn_method = doc_Attention.ATTN_TYPE_COSINE_SIMILARITY



In [36]:
input_tensor = document_vector_torch
question_tensor = question_vector_torch
label_tensor = label_vector_torch

hidden_size = input_tensor.shape[2]
learning_rate = 1e-3
input_size = question_tensor.shape[2]
output_size = question_tensor.shape[2]
n_iterations = 10

In [37]:
quest_summ_model = question_Summary(input_size, hidden_size).to(device)
doc_attn_model = doc_Attention(hidden_size, output_size, dropout_p=0.1).to(device)

trainIters(quest_summ_model, doc_attn_model, input_tensor, question_tensor, label_tensor, n_iterations, attn_method, print_every=2)

0m 6s (- 0m 27s) (2 20%) -89.4510
0m 13s (- 0m 20s) (4 40%) -90.8810
0m 20s (- 0m 13s) (6 60%) -107.9134
0m 27s (- 0m 6s) (8 80%) -115.3991
0m 33s (- 0m 0s) (10 100%) -115.4147


In [38]:
torch.save(quest_summ_model, 'question_summary_model.pt')
torch.save(doc_attn_model, 'document_attention_model.pt')

# 3.Model Testing
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title) 

#### Functions for Testing

##### Function to Evaluate the Models

In [40]:
def evaluate(encoder, decoder, questions, documents_torch, MAX_LEN_Q, MAX_LEN_D, attn_option):
    '''Function to evaluate the models'''
    encoder.eval() # Turn on the evaluation mode
    decoder.eval() # Turn on the evaluation mode

    with torch.no_grad():
        if(attn_option == 1):
            attn_method = doc_Attention.ATTN_TYPE_DOT_PRODUCT
        elif(attn_option == 2):
            attn_method = doc_Attention.ATTN_TYPE_SCALE_DOT_PRODUCT
        elif(attn_option == 3):
            attn_method = doc_Attention.ATTN_TYPE_COSINE_SIMILARITY


        input_length = MAX_LEN_Q

        question_summary = torch.zeros(MAX_LEN_Q, hidden_size*2, device=device)

        decoder_hidden = torch.zeros(1, MAX_LEN_D, hidden_size*2, device=device)

        for i in range(input_length):
            encoder_hidden = encoder(questions[i])
            question_summary[i] = encoder_hidden[0,0]

        decoder_input = documents_torch

        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden, question_summary, attn_method)
        topv, topi = decoder_output.data.topk(1) # simply adopt the predicted tag with the highest probabiity

        return topi

##### Helper function to get the sentences with Predicted words

In [41]:
def getSentances(documents, words):
    '''Function to predict the sentances based on the predicted words'''
    sentences_with_word = []

    tokenizer = nltk.sent_tokenize
    
    for ind in range(len(documents)):
        sentences = tokenizer(documents[ind])
        word = words[ind]
        
        for sentence in sentences:
            if word in sentence:
                sentences_with_word.append(sentence)
    
    return sentences_with_word

##### Function to Predict the sentances

In [42]:
def prediction(pred_ind, doc_list, ans_list):
    '''Function to predict the sentances from the main document list'''

    predicted_words = [] 

    doc_tokens = tokenizeList(doc_list)
    for i in range(len(doc_tokens)):
        document = doc_tokens[i]
        if(len(document) > pred_ind[i]):
            predicted_words.append(document[pred_ind[i].item()])
        else:
            predicted_words.append("")


    predicted_sentances = getSentances(doc_list, predicted_words)
    
    target_sentences = ans_list

    total_sentences = len(doc_list)
    correct_sentences = [""]*total_sentences

    for ind in range(len(target_sentences)):
        for j in range(len(predicted_sentances)):
            if target_sentences[ind] == predicted_sentances[j]:
                correct_sentences[ind] = predicted_sentances[j]

    #accuracy = correct_sentences / total_sentences

    return target_sentences, correct_sentences

##### Function to Print the accuracy Scores

In [43]:
def printScores(report, decimal_places = 2):
    '''Function to print the scores from the classification report'''

    accuracy = round(report['accuracy'], decimal_places)
    macro_avg_precision = round(report['macro avg']['precision'], decimal_places)
    macro_avg_recall = round(report['macro avg']['recall'], decimal_places)
    macro_avg_f1_score = round(report['macro avg']['f1-score'], decimal_places)
    weighted_avg_precision = round(report['weighted avg']['precision'], decimal_places)
    weighted_avg_recall = round(report['weighted avg']['recall'], decimal_places)
    weighted_avg_f1_score = round(report['weighted avg']['f1-score'], decimal_places)

    print(f"Accuracy: \t\t\t{accuracy}")
    print(f"Macro Average Precision: \t{macro_avg_precision}")
    print(f"Macro Average Recall: \t\t{macro_avg_recall}")
    print(f"Macro Average F1-score: \t{macro_avg_f1_score}")
    print(f"Weighted Average Precision: \t{weighted_avg_precision}")
    print(f"Weighted Average Recall: \t{weighted_avg_recall}")
    print(f"Weighted Average F1-score: \t{weighted_avg_f1_score}")

#### Pre-processing the test data

In [44]:
# Getting the question list from the main dataframe
test_question_list = formatted_test_data["Question"]

# Converting the question list to torch vectors
test_question_vector_torch, T_MAX_LEN_Q = questionPreprocess(test_question_list)

In [45]:
# Getting the document list from the main dataframe
test_document_list = formatted_test_data["Document"]

option = 3
_, test_document_vector_torch, T_MAX_LEN_D = documentPreprocessing(test_document_list, option)

# Getting the list of answers from the dataframe
test_answer_list = formatted_test_data["Answer"]

#### Model Evaluation

In [49]:
# Evaluating the Model

predicted_indexes = evaluate(quest_summ_model, doc_attn_model, test_question_vector_torch, test_document_vector_torch, T_MAX_LEN_Q, T_MAX_LEN_D, 1)
target_sentences, predicted_sentences = prediction(predicted_indexes, test_document_list, test_answer_list)
report = classification_report(target_sentences, predicted_sentences, output_dict=True, zero_division=1)

printScores(report, decimal_places = 2)

Accuracy: 			0.77
Macro Average Precision: 	1.0
Macro Average Recall: 		0.4
Macro Average F1-score: 	0.4
Weighted Average Precision: 	0.83
Weighted Average Recall: 	0.77
Weighted Average F1-score: 	0.67


###3.1. Input Embedding Ablation Study(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title) 

In [None]:
embedding =    ["Word Embedding",
                "Word Embedding + POS Tags",
                "Word Embedding + POS Tags + NER Tags + TF-IDF"]

for embedding_option in range(1,4):

    #Training
    _, document_vector_torch, MAX_LEN_D = documentPreprocessing(document_list, embedding_option)

    input_tensor = document_vector_torch
    question_tensor = question_vector_torch
    label_tensor = label_vector_torch

    hidden_size = input_tensor.shape[2]
    input_size = question_tensor.shape[2]
    output_size = question_tensor.shape[2]
    n_iterations = 10

    quest_summ_model = question_Summary(input_size, hidden_size).to(device)
    doc_attn_model = doc_Attention(hidden_size, output_size, dropout_p=0.1).to(device)

    trainIters(quest_summ_model, doc_attn_model, input_tensor, question_tensor, label_tensor, n_iterations, 1)
    

    #Testing
    _, test_document_vector_torch, T_MAX_LEN_D = documentPreprocessing(test_document_list, embedding_option)

    predicted_indexes = evaluate(quest_summ_model, doc_attn_model, test_question_vector_torch, test_document_vector_torch, T_MAX_LEN_Q, T_MAX_LEN_D, 1)
    target_sentences, predicted_sentences = prediction(predicted_indexes, test_document_list, test_answer_list)
    report = classification_report(target_sentences, predicted_sentences, output_dict=True, zero_division=1)

    print(embedding[embedding_option - 1])
    print("-------------------------------")
    printScores(report, decimal_places = 2)
    print()


Word Embedding
-------------------------------
Accuracy: 			0.8
Macro Average Precision: 	0.97
Macro Average Recall: 		0.5
Macro Average F1-score: 	0.48
Weighted Average Precision: 	0.85
Weighted Average Recall: 	0.8
Weighted Average F1-score: 	0.71

Word Embedding + POS Tags
-------------------------------
Accuracy: 			0.85
Macro Average Precision: 	0.98
Macro Average Recall: 		0.62
Macro Average F1-score: 	0.61
Weighted Average Precision: 	0.88
Weighted Average Recall: 	0.85
Weighted Average F1-score: 	0.78

Word Embedding + POS Tags + NER Tags + TF-IDF
-------------------------------
Accuracy: 			0.75
Macro Average Precision: 	0.97
Macro Average Recall: 		0.38
Macro Average F1-score: 	0.35
Weighted Average Precision: 	0.82
Weighted Average Recall: 	0.75
Weighted Average F1-score: 	0.65



###3.2. Attention Ablation Study
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title) 

In [None]:
attention =["1 - Dot Product", 
            "2 - Scalar Dot Product",
            "3 - Cosine Similarity"]
for attention_option in range(1,4):

    if(attention_option == 1):
        attention_method = doc_Attention.ATTN_TYPE_DOT_PRODUCT
    elif(attention_option == 2):
        attention_method = doc_Attention.ATTN_TYPE_SCALE_DOT_PRODUCT
    elif(attention_option == 3):
        attention_method = doc_Attention.ATTN_TYPE_COSINE_SIMILARITY

    input_tensor = document_vector_torch
    question_tensor = question_vector_torch
    label_tensor = label_vector_torch

    hidden_size = input_tensor.shape[2]
    input_size = question_tensor.shape[2]
    output_size = question_tensor.shape[2]
    n_iterations = 10

    quest_summ_model = question_Summary(input_size, hidden_size).to(device)
    doc_attn_model = doc_Attention(hidden_size, output_size, dropout_p=0.1).to(device)

    trainIters(quest_summ_model, doc_attn_model, input_tensor, question_tensor, label_tensor, n_iterations, attention_method)

    
    predicted_indexes = evaluate(quest_summ_model, doc_attn_model, test_question_vector_torch, test_document_vector_torch, T_MAX_LEN_Q, T_MAX_LEN_D, attention_option)
    target_sentences, predicted_sentences = prediction(predicted_indexes, test_document_list, test_answer_list)
    report = classification_report(target_sentences, predicted_sentences, output_dict=True, zero_division=1)

    print(attention[attention_option - 1])
    print("-------------------------------")
    printScores(report, decimal_places = 2)
    print()

1 - Dot Product
-------------------------------
Accuracy: 			0.8
Macro Average Precision: 	0.97
Macro Average Recall: 		0.5
Macro Average F1-score: 	0.48
Weighted Average Precision: 	0.85
Weighted Average Recall: 	0.8
Weighted Average F1-score: 	0.71

2 - Scalar Dot Product
-------------------------------
Accuracy: 			0.75
Macro Average Precision: 	0.97
Macro Average Recall: 		0.38
Macro Average F1-score: 	0.35
Weighted Average Precision: 	0.82
Weighted Average Recall: 	0.75
Weighted Average F1-score: 	0.65

3 - Cosine Similarity
-------------------------------
Accuracy: 			0.8
Macro Average Precision: 	0.97
Macro Average Recall: 		0.5
Macro Average F1-score: 	0.48
Weighted Average Precision: 	0.85
Weighted Average Recall: 	0.8
Weighted Average F1-score: 	0.71



###3.3. Hyper Parameter Testing
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title) 

In [None]:
learning_rates =[1e-2, 1e-3, 2e-3, 1e-4, 1e-5]
for learning_rate in learning_rates:

    input_tensor = document_vector_torch
    question_tensor = question_vector_torch
    label_tensor = label_vector_torch

    hidden_size = input_tensor.shape[2]
    input_size = question_tensor.shape[2]
    output_size = question_tensor.shape[2]
    n_iterations = 10

    quest_summ_model = question_Summary(input_size, hidden_size).to(device)
    doc_attn_model = doc_Attention(hidden_size, output_size, dropout_p=0.1).to(device)

    trainIters(quest_summ_model, doc_attn_model, input_tensor, question_tensor, label_tensor, n_iterations,learning_rate = learning_rate, method = doc_Attention.ATTN_TYPE_DOT_PRODUCT)

    
    predicted_indexes = evaluate(quest_summ_model, doc_attn_model, test_question_vector_torch, test_document_vector_torch, T_MAX_LEN_Q, T_MAX_LEN_D, attention_option)
    target_sentences, predicted_sentences = prediction(predicted_indexes, test_document_list, test_answer_list)
    report = classification_report(target_sentences, predicted_sentences, output_dict=True, zero_division=1)

    print("Learning Rate = ",learning_rate)
    print("-------------------------------")
    printScores(report, decimal_places = 2)
    print()

Learning Rate =  0.01
-------------------------------
Accuracy: 			0.75
Macro Average Precision: 	0.97
Macro Average Recall: 		0.38
Macro Average F1-score: 	0.35
Weighted Average Precision: 	0.82
Weighted Average Recall: 	0.75
Weighted Average F1-score: 	0.65

Learning Rate =  0.001
-------------------------------
Accuracy: 			0.75
Macro Average Precision: 	0.97
Macro Average Recall: 		0.38
Macro Average F1-score: 	0.35
Weighted Average Precision: 	0.82
Weighted Average Recall: 	0.75
Weighted Average F1-score: 	0.65

Learning Rate =  0.002
-------------------------------
Accuracy: 			0.75
Macro Average Precision: 	0.97
Macro Average Recall: 		0.38
Macro Average F1-score: 	0.35
Weighted Average Precision: 	0.82
Weighted Average Recall: 	0.75
Weighted Average F1-score: 	0.65

Learning Rate =  0.0001
-------------------------------
Accuracy: 			0.8
Macro Average Precision: 	0.97
Macro Average Recall: 		0.5
Macro Average F1-score: 	0.48
Weighted Average Precision: 	0.85
Weighted Average Re