In [3]:
import json

with open('val_questions.json', 'r') as fp:
    val_questions = json.load(fp)
    
with open('val_answers.json', 'r') as fp:
    val_answers = json.load(fp)

In [4]:
import string
import pprint
import random
import numpy as np
import scipy as sp
import urllib.request
import nltk
import re
import heapq
from bs4 import BeautifulSoup
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity

def get_article_text(url):
    # Scrape article using bs4 to extract all paragraphs from the online article.
    raw_html = urllib.request.urlopen(url)
    raw_html = raw_html.read()

    article_html = BeautifulSoup(raw_html, 'lxml')
    article_paragraphs = article_html.find_all('p')

    # Creating a document 'article_text' containing all the sentences in the article.
    article_text = ''
    for para in article_paragraphs:
        article_text += para.text
    return article_text

def remove_stopwords(sentence):
    filtered_sentence = []
    stop_words = nltk.corpus.stopwords.words('english')
    word_tokens = nltk.word_tokenize(sentence)
    for token in word_tokens:
        if token not in stop_words:
            filtered_sentence.append(token)
    filtered_sentence = ' '.join(filtered_sentence)
    return filtered_sentence

def clean_sentence(sentence):
    sentence = sentence.lower()
    sentence = remove_stopwords(sentence)
    sentence = re.sub(r'\W', ' ', sentence)
    sentence = re.sub(r'\s+', ' ', sentence)
    return sentence

def clean_article_text(article_text):
    # Creating a corpus containing all the sentence tokens in the document.
    corpus = nltk.sent_tokenize(article_text)
    # Convert to lowercase, remove non-word characters (punctuations, etc.) and strip whitespaces
    for i in range(len(corpus)):
        corpus[i] = clean_sentence(corpus[i])
    return corpus

def create_word_freq_dictionary(corpus):
    # Create dictionary with word frequency
    word_freq = defaultdict(int)
    for sentence in corpus:
        word_tokens = nltk.word_tokenize(sentence)
        for token in word_tokens:
            word_freq[token] += 1
    return word_freq

def generate_sent_vec(sentence, most_freq_tokens):
    word_tokens = nltk.word_tokenize(sentence)
    sent_vec = []
    for token in most_freq_tokens:
        if token in word_tokens:
            sent_vec.append(1)
        else:
            sent_vec.append(0)
    return sent_vec

def get_sentence_vectors(corpus, most_freq_tokens):
    # Generate sentence vectors of 1's and 0's. Feature set is the most_freq_tokens list.
    sentence_vectors = []
    for sentence in corpus:
        sent_vec = generate_sent_vec(sentence, most_freq_tokens)
        sentence_vectors.append(sent_vec)
        
    sentence_vectors = np.asarray(sentence_vectors)
    return sentence_vectors

def get_answer(url, question):

    article_text = get_article_text(url)
    #print("Article Text: \n", article_text)
    initial_corpus = nltk.sent_tokenize(article_text)
    corpus = clean_article_text(article_text)

    word_freq = create_word_freq_dictionary(corpus)

    # Get the most frequent tokens from the dictionary
    most_freq_tokens = heapq.nlargest(200, word_freq, key=word_freq.get)

    sentence_vectors = get_sentence_vectors(corpus, most_freq_tokens)

    cleaned_question = clean_sentence(question)
    question_vector = generate_sent_vec(cleaned_question, most_freq_tokens)

    similarity_scores = []
    sent_vec_index = 0
    for sent_vec in sentence_vectors:
        similarity = 1 - sp.spatial.distance.cosine(question_vector, sent_vec)
        similarity_scores.append((sent_vec_index, similarity))
        sent_vec_index += 1
    similarity_scores.sort(key = lambda x: x[1], reverse=True)
    answer_index = similarity_scores[0][0]

    return initial_corpus[answer_index]


### Model Evaluation

#### Generate the word frequency dictionary, most freq tokens and sentence vectors for the given article:

In [5]:
url = 'https://en.wikipedia.org/wiki/India'

article_text = get_article_text(url)
#print("Article Text: \n", article_text)

# Maintaining initial corpus for displaying answers:
initial_corpus = nltk.sent_tokenize(article_text)
# Clean the article sentence to remove extra whitespaces and reference numbers (such as "[23]")
for i in range(len(initial_corpus)):
    initial_corpus[i] = re.sub(r'\[\d+\]', '', initial_corpus[i])
    initial_corpus[i] = re.sub(r'\[\d+,\s\d+]', '', initial_corpus[i])
    initial_corpus[i] = re.sub(r'\[\w\]', '', initial_corpus[i])
    initial_corpus[i] = re.sub(r'\s+', ' ', initial_corpus[i]).strip()

# Generating a clean corpus to be fed to the model.
corpus = clean_article_text(article_text)

word_freq = create_word_freq_dictionary(corpus)

# Get the most frequent tokens from the dictionary
most_freq_tokens = heapq.nlargest(200, word_freq, key=word_freq.get)

sentence_vectors = get_sentence_vectors(corpus, most_freq_tokens)

#### Sentence vectors are 0 and 1 vectors which have the most_freq_tokens as their feature set

In [6]:
print('\nSentence vector: {}'.format(sentence_vectors[4]))
print('\nVal questions:\n{}'.format(list(val_questions.items())[:3]))
print('\nVal answers:\n{}'.format(list(val_answers.items())[:3]))


Sentence vector: [0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]

Val questions:
[('0', 'What is India?'), ('1', 'When did modern humans arrive on the Indian subcontinent?'), ('10', 'When did the name Bharat gain increased currency as a native name for India?')]

Val answers:
[('0', 'India (Hindi: Bhārat), officially the Republic of India (Hindi: Bhārat Gaṇarājya), is a country in South Asia.'), ('1', 'Modern humans arrived on the Indian subcontinent from Africa no later than 55,000 years ago.'), ('2', 'The Dravidian languages of India were supplanted in the northern and western regions.')]


#### Predict answers on for the val_questions and save the predictions in a JSON file

In [7]:
pred_answers = dict()
for qid, question in val_questions.items():
    cleaned_question = clean_sentence(question)
    question_vector = generate_sent_vec(cleaned_question, most_freq_tokens)

    similarity_scores = []
    sent_vec_index = 0
    for sent_vec in sentence_vectors:
        similarity = 1 - sp.spatial.distance.cosine(question_vector, sent_vec)
        similarity_scores.append((sent_vec_index, similarity))
        sent_vec_index += 1
    similarity_scores.sort(key = lambda x: x[1], reverse=True)
    answer_index = similarity_scores[0][0]

    pred_answers[qid] = initial_corpus[answer_index]

  dist = 1.0 - uv / np.sqrt(uu * vv)


In [42]:
# Saving predictions
with open('bow_pred_answers.json', 'w') as fp:
    json.dump(pred_answers, fp)

#### Functions to compute metrics and evaluate the predictions:

In [9]:
# these functions are heavily influenced by the HF squad_metrics.py script
def normalize_text(s):
    """Removing articles and punctuation, and standardizing whitespace are all typical text processing steps."""
    import string, re

    def remove_articles(text):
        regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
        return re.sub(regex, " ", text)

    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))

def compute_exact_match(prediction, truth):
    return int(normalize_text(prediction) == normalize_text(truth))

def compute_f1(prediction, truth):
    pred_tokens = normalize_text(prediction).split()
    truth_tokens = normalize_text(truth).split()
    #print(pred_tokens)
    #print(truth_tokens)
    
    # if either the prediction or the truth is no-answer then f1 = 1 if they agree, 0 otherwise
    if len(pred_tokens) == 0 or len(truth_tokens) == 0:
        return int(pred_tokens == truth_tokens)
    
    common_tokens = set(pred_tokens) & set(truth_tokens)
    
    # if there are no common tokens then f1 = 0
    if len(common_tokens) == 0:
        return 0
    #print(common_tokens)
    prec = len(common_tokens) / len(pred_tokens)
    rec = len(common_tokens) / len(truth_tokens)
    
    return 2 * (prec * rec) / (prec + rec)

#### Compute metrics:

In [43]:
f1_scores = []
em_scores = []

for qid, pred_ans in pred_answers.items():
    true_ans = val_answers[qid]
    f1_score = compute_f1(pred_ans, true_ans)
    em_score = compute_exact_match(pred_ans, true_ans)
    
    f1_scores.append(f1_score)
    em_scores.append(em_score)

avg_f1 = sum(f1_scores) / len(f1_scores)
avg_em = sum(em_scores) / len(em_scores)

print('\nAvg F1 Score: {}'.format(avg_f1))
print('\nAvg EM Score: {}'.format(avg_em))


Avg F1 Score: 0.3711199728699776

Avg EM Score: 0.29


#### Sanity testing

In [41]:
qid = '1'

print("\nQuestion: {}".format(val_questions[qid]))
print("\nPred answer: {}".format(pred_answers[qid]))
print("\nTrue answer: {}".format(val_answers[qid]))

em = compute_exact_match(pred_answers[qid], val_answers[qid])
f1 = compute_f1(pred_answers[qid], val_answers[qid])

print("\nEM: {}".format(em))
print("F1: {}".format(f1))


Question: When did modern humans arrive on the Indian subcontinent?

Pred answer: Modern humans arrived on the Indian subcontinent from Africa no later than 55,000 years ago.

True answer: Modern humans arrived on the Indian subcontinent from Africa no later than 55,000 years ago.

EM: 1
F1: 1.0
