In [1]:
import string
import random
import numpy as np
import scipy as sp
import urllib.request
import nltk
import re
import heapq
from bs4 import BeautifulSoup
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
def get_article_text(url):
    # Scrape article using bs4 to extract all paragraphs from the online article.
    raw_html = urllib.request.urlopen(url)
    raw_html = raw_html.read()

    article_html = BeautifulSoup(raw_html, 'lxml')
    article_paragraphs = article_html.find_all('p')

    # Creating a document 'article_text' containing all the sentences in the article.
    article_text = ''
    for para in article_paragraphs:
        article_text += para.text
    return article_text

In [3]:
article_text = get_article_text('https://en.wikipedia.org/wiki/India')

In [4]:
def clean_sentence(sentence):
    sentence = sentence.lower()
    sentence = re.sub(r'\W', ' ', sentence)
    sentence = re.sub(r'\s+', ' ', sentence)
    return sentence

In [5]:
def clean_article_text(article_text):
    # Creating a corpus containing all the sentence tokens in the document.
    corpus = nltk.sent_tokenize(article_text)
    # Convert to lowercase, remove non-word characters (punctuations, etc.) and strip whitespaces
    for i in range(len(corpus)):
        corpus[i] = clean_sentence(corpus[i])
    return corpus

In [6]:
article_sentences = nltk.sent_tokenize(article_text)

# Tokenize article text into sentences.
article_sentences = nltk.sent_tokenize(article_text)

# Clean the article sentence to remove extra whitespaces and reference numbers (such as "[23]")
for i in range(len(article_sentences)):
    article_sentences[i] = re.sub(r'\[\d+\]', '', article_sentences[i])
    article_sentences[i] = re.sub(r'\[\d+,\s\d+]', '', article_sentences[i])
    article_sentences[i] = re.sub(r'\[\w\]', '', article_sentences[i])
    article_sentences[i] = re.sub(r'\s+', ' ', article_sentences[i]).strip()

In [42]:
val_article_sentences = article_sentences[::4][:-11]
len(val_article_sentences)

100

In [43]:
val_answers = dict()
val_questions = dict()

In [44]:
for i, sent in enumerate(val_article_sentences):
    val_answers[i] = sent
    print('\nSentence {}: {}'.format(i, sent))
    ques = input('Enter question: ')
    val_questions[i] = ques


Sentence 0: India (Hindi: Bhārat), officially the Republic of India (Hindi: Bhārat Gaṇarājya), is a country in South Asia.
Enter question: What is India?

Sentence 1: Modern humans arrived on the Indian subcontinent from Africa no later than 55,000 years ago.
Enter question: When did modern humans arrive on the Indian subcontinent?

Sentence 2: The Dravidian languages of India were supplanted in the northern and western regions.
Enter question: Where were the Dravidian languages of India supplanted?

Sentence 3: In South India, the Middle kingdoms exported Dravidian-languages scripts and religious cultures to the kingdoms of Southeast Asia.
Enter question: Where were the Dravidian-languages scripts and religious cultures exported?

Sentence 4: In the Punjab, Sikhism emerged, rejecting institutionalised religion.
Enter question: What emerged in Punjab which rejected instituionalized religion?

Sentence 5: The rights promised to Indians were granted slowly, but technological changes wer

Enter question: When did India's geological processes begin that began a north-eastward drift caused by seafloor spread?

Sentence 33: Cut off from the plain by the ancient Aravalli Range lies the Thar Desert.
Enter question: Where does the Thar Desert lie?

Sentence 34: To the south, the remaining peninsular landmass, the Deccan Plateau, is flanked on the west and east by coastal ranges known as the Western and Eastern Ghats; the plateau contains the country's oldest rock formations, some over one billion years old.
Enter question: To the south, which plateau contains the country's oldest rock formations?

Sentence 35: Major Himalayan-origin rivers that substantially flow through India include the Ganges and the Brahmaputra, both of which drain into the Bay of Bengal.
Enter question: The Himalayan-origin rivers including the Ganges and Brahmaputra drain into which water body?

Sentence 36: India has two archipelagos: the Lakshadweep, coral atolls off India's south-western coast; and t

Enter question: Between 2001 and 2011, the contribution of petrochemical and engineering goods to total exports grew by what percent?

Sentence 65: Though ranking 51st in global competitiveness, as of 2010[update], India ranks 17th in financial market sophistication, 24th in the banking sector, 44th in business sophistication, and 39th in innovation, ahead of several advanced economies.
Enter question: What is the ranking of India in global competitiveness, financial market sophistication, banking sector, business sophistication and innovation?

Sentence 66: It is expected to grow to US$2,358 by 2020.
Enter question: By how much is it expected to grow by 2020?

Sentence 67: During the next four decades, Indian GDP is expected to grow at an annualised average of 8%, making it potentially the world's fastest-growing major economy until 2050.
Enter question: During the next four decades, Indian GDP is expected to grow at what annualised average?

Sentence 68: India's telecommunication ind


Sentence 96: Shalwars are atypically wide at the waist but narrow to a cuffed bottom.
Enter question: What are Shalwars?

Sentence 97: The side seams are left open below the waist-line,), which gives the wearer greater freedom of movement.
Enter question: What gives the wearer of this cloth a greater freedom of movement?

Sentence 98: A kurta, which traces its roots to Central Asian nomadic tunics, has evolved stylistically in India as a garment for everyday wear as well as for formal occasions.
Enter question: How has a kurta, which traces its roots to Central Asia, evolved in India and on what occasions is it worn?

Sentence 99: Increasingly, in urban settings in northern India, the sari is no longer the apparel of everyday wear, transformed instead into one for formal occasions.
Enter question: In urban settings in northern India, what is the apparel for formal occasions?


In [46]:
len(val_questions)

100

In [48]:
len(val_answers)

100

In [49]:
import json

with open('val_questions.json', 'w') as fp:
    json.dump(val_questions, fp)
    
with open('val_answers.json', 'w') as fp:
    json.dump(val_questions, fp)

In [37]:
# these functions are heavily influenced by the HF squad_metrics.py script
def normalize_text(s):
    """Removing articles and punctuation, and standardizing whitespace are all typical text processing steps."""
    import string, re

    def remove_articles(text):
        regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
        return re.sub(regex, " ", text)

    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))

def compute_exact_match(prediction, truth):
    return int(normalize_text(prediction) == normalize_text(truth))

def compute_f1(prediction, truth):
    pred_tokens = normalize_text(prediction).split()
    truth_tokens = normalize_text(truth).split()
    print(pred_tokens)
    print(truth_tokens)
    
    # if either the prediction or the truth is no-answer then f1 = 1 if they agree, 0 otherwise
    if len(pred_tokens) == 0 or len(truth_tokens) == 0:
        return int(pred_tokens == truth_tokens)
    
    common_tokens = set(pred_tokens) & set(truth_tokens)
    
    # if there are no common tokens then f1 = 0
    if len(common_tokens) == 0:
        return 0
    print(common_tokens)
    prec = len(common_tokens) / len(pred_tokens)
    rec = len(common_tokens) / len(truth_tokens)
    
    return 2 * (prec * rec) / (prec + rec)

In [38]:
pred = 'The ancient Greeks referred to the Indians as Indoi (Ἰνδοί), which translates as "The people of the Indus".'

In [39]:
truth = 'The ancient Greeks referred to the Indians as Indoi (Ἰνδοί), which translates as "The people of the Indus".'

In [40]:
compute_f1(pred, truth)

['ancient', 'greeks', 'referred', 'to', 'indians', 'as', 'indoi', 'ἰνδοί', 'which', 'translates', 'as', 'people', 'of', 'indus']
['ancient', 'greeks', 'referred', 'to', 'indians', 'as', 'indoi', 'ἰνδοί', 'which', 'translates', 'as', 'people', 'of', 'indus']
{'ancient', 'translates', 'which', 'to', 'people', 'ἰνδοί', 'referred', 'as', 'indus', 'indoi', 'of', 'greeks', 'indians'}


0.9285714285714286

In [41]:
compute_exact_match(pred, truth)

1