# 2023 CITS4012 Assignment
*Make sure you change the file name with your student id.*

# Readme
*If there is something to be noted for the marker, please mention here.* 

*If you are planning to implement a program with Object Oriented Programming style, please check the bottom of the this ipynb file*

# 1.DataSet Processing
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title) 

In [1]:
# Necessary Library Imports
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt')
from nltk import word_tokenize


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\OzLaptops\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Read in data files
test_data = pd.read_csv("WikiQA-test.tsv", sep="\t")
train_data = pd.read_csv("WikiQA-train.tsv", sep="\t")

def wrangle_data(df):
    """Handles data and splits dataframe into question, document and answer set

    df: dataframe to split

    return: question, document, and answer dataframe
    """

    # Initialise variables containing all Questions, Documents and Answers
    questions = []
    documents = []
    answers = {}

    # Get unique questions from the dataframe
    unique_questions = df['Question'].drop_duplicates()

    # Iterate through questions dataframe to form the Questions data
    for row in unique_questions:
        
        # Tokenise the question
        question_tokens = word_tokenize(row.lower())

        # Append the question to the list of questions
        questions.append(question_tokens)

    # Iterate through dataframe and form Documents and Answer data
    for index, row in df.iterrows():

        # Initalise current document pairs list and previous doc number (only for first iteration)
        if index == 0:
            previous_doc_num = row['DocumentID']
            current_doc_pairs = []

        # If the current doc number is different to the previous doc number, then a new  
        # document has been found so append the current document pairs to the list of documents     
        # and reinitilise the current document pairs list
        current_doc_num = row['DocumentID']
        if current_doc_num != previous_doc_num:
            documents.append(current_doc_pairs)
            current_doc_pairs = []
        
        # Tokenise the current row's sentence
        sent_tokens = word_tokenize(row['Sentence'].lower())

        # Assign token types according to 'Label' field (1 means the corresponding sentence
        # is the answer/one of the answers)
        # Token types:
        #   0: token a part of the answer
        #   1: start token of the answer
        #   2: inner token of the answer
        #   3: end token of the answer
        if row['Label']:
            # Answer to question is found
            token_types = np.full(shape=len(sent_tokens), fill_value=2)
            token_types[0] = 1
            token_types[-1] = 3

            # Add the tokenised answer to the answers dictionary. If an answer has already been found.
            # add the tokens to the current answer.
            if row['QuestionID'] in answers:
                answers[row['QuestionID']] += sent_tokens 
            else:
                answers[row['QuestionID']] = sent_tokens
        else:
            # Answer not found 
            token_types = np.zeros(len(sent_tokens), dtype=int)

        # Combine token and corresponding types and append to current document list
        token_type_pairs = list(zip(sent_tokens, token_types))
        current_doc_pairs.append(token_type_pairs)

        # Update previous document number 
        previous_doc_num = current_doc_num

        # If the iterated row is the final row in the dataframe, append the current document
        # to the documents array
        if index == len(df)-1:
            documents.append(current_doc_pairs)

    return questions, documents, answers

train_questions, train_documents, train_answers = wrangle_data(train_data)
test_questions, test_documents, test_answers = wrangle_data(test_data)


In [None]:
train_questions

In [None]:
train_answers

In [None]:
train_documents


In [None]:
# for debugging - find all labels equal to 1
train_data.loc[train_data['Label'] == 1]

# 2.QA Model Implementation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title) 

**GloVe/Word2VeC word embedding**

In [None]:
import gensim.downloader as api

# model = api.load("word2vec-google-news-300")
model = api.load("glove-wiki-gigaword-50")


In [6]:
model['?']

array([-1.4578e-01,  5.0459e-01,  4.7525e-02, -4.6463e-01,  4.4249e-01,
       -1.6772e-01, -4.0334e-01, -3.9223e-01, -4.1543e-01,  2.7637e-01,
       -6.3027e-01,  6.9033e-01, -4.5441e-01,  1.5845e-03,  1.3120e+00,
        5.2413e-01,  3.7380e-01,  2.8156e-01, -4.0563e-03, -5.2664e-01,
       -5.7061e-01,  3.6561e-01,  5.9174e-01,  3.4713e-01,  4.5009e-01,
       -2.1454e+00, -1.3795e+00,  3.0700e-01,  1.4876e+00, -9.6313e-01,
        2.8403e+00,  5.0247e-01, -8.6752e-01,  6.4130e-02, -3.6376e-01,
       -1.4019e-01,  1.1975e-01, -4.5442e-02,  7.2682e-01, -4.4447e-01,
       -2.7226e-01,  1.5030e-01,  1.1489e-01,  7.1237e-01,  1.1341e-01,
        2.2835e-01, -4.0801e-02, -4.1468e-01,  1.1054e-01,  1.1681e+00],
      dtype=float32)

**TFIDF (Term Frequency Inverse Document Frequency)**

In [9]:
def calculate_df(tokenized_data):
    """ Calculate term frequencies for a given list of list of tokens

    df: tokenized dataset

    return: dictionary of frequencies for each term in the dataset
    """
    DF = {}

    for token_vector in tokenized_data:
        # get each unique word in the doc and count the number of occurences in the document
        for term in np.unique(token_vector):
            try:
                DF[term] += 1
            except:
                DF[term] = 1

    # print scores in descending order
    sorted_dict = sorted(DF.items(), key = lambda x : x[1], reverse = True)
    #print(sorted_dict)
    
    return DF

In [10]:
# calculate TFIDF scores

from collections import Counter
import math

def calculate_tf_idf(tokenized_data):

    tf_idf = {}

    # total number of documents
    n = len(tokenized_data)

    # calculate Document Frequencies
    DF = calculate_df(tokenized_data)

    doc_id = 0
    # get each token vector
    for token_vector in tokenized_data:
        # initialise counter for the vector
        counter = Counter(token_vector)
        #calculate total number of words in the doc
        total_num_words = len(token_vector)

        # get each unique word in the doc
        for term in np.unique(token_vector):

            # calculate Term Frequency
            tf = counter[term]/total_num_words

            # calculate Document Frequency
            df = DF[term]

            # calculate Inverse Document Frequency
            idf = math.log(n/(df+1))+1

            # calcaulte TF-IDF
            tf_idf[doc_id, term] = tf*idf

        doc_id += 1

    return tf_idf

calculate_tf_idf(train_questions)


{(0, '?'): 0.48848458725794464,
 (0, 'are'): 0.5366126603686117,
 (0, 'caves'): 1.3274346817624867,
 (0, 'formed'): 1.096385621575838,
 (0, 'glacier'): 1.3274346817624867,
 (0, 'how'): 0.44272339886228584,
 (1, 'a'): 0.21464506414744466,
 (1, 'and'): 0.29300502627290326,
 (1, 'are'): 0.21464506414744466,
 (1, 'circular'): 0.5309738727049946,
 (1, 'directions'): 0.5309738727049946,
 (1, 'force'): 0.4474563414719701,
 (1, 'how'): 0.17708935954491434,
 (1, 'in'): 0.17529845006027778,
 (1, 'motion'): 0.484764060667665,
 (1, 'of'): 0.18878367758362766,
 (1, 'related'): 0.484764060667665,
 (1, 'the'): 0.2693996319800544,
 (1, 'vectors'): 0.5309738727049946,
 (1, 'velocity'): 0.5309738727049946,
 (2, 'apollo'): 1.5929216181149841,
 (2, 'creed'): 1.5929216181149841,
 (2, 'did'): 0.6985938594422703,
 (2, 'die'): 1.11334256355531,
 (2, 'how'): 0.531268078634743,
 (3, 'federal'): 0.838980640259944,
 (3, 'for'): 0.484828972252917,
 (3, 'how'): 0.3320425491467144,
 (3, 'is'): 0.24493590212743152,
 

**POS Tagging**

In [None]:
from nltk.tag import pos_tag_sents
nltk.download('averaged_perceptron_tagger')
pos_tag_sents(train_questions)

# 3.Model Testing
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title) 

In [None]:
#
#

###3.1. Input Embedding Ablation Study

(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title) 



###3.2. Attention Ablation Study
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title) 

###3.3. Hyper Parameter Testing
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title) 