In [1]:
import json

with open('val_questions.json', 'r') as fp:
    val_questions = json.load(fp)
    
with open('val_answers.json', 'r') as fp:
    val_answers = json.load(fp)

In [23]:
import string
import pprint
import random
import numpy as np
import scipy as sp
import urllib.request
import nltk
import re
import heapq
from bs4 import BeautifulSoup
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity

def get_article_text(url):
    # Scrape article using bs4 to extract all paragraphs from the online article.
    raw_html = urllib.request.urlopen(url)
    raw_html = raw_html.read()

    article_html = BeautifulSoup(raw_html, 'lxml')
    article_paragraphs = article_html.find_all('p')

    # Creating a document 'article_text' containing all the sentences in the article.
    article_text = ''
    for para in article_paragraphs:
        article_text += para.text
    return article_text

def remove_stopwords(sentence):
    filtered_sentence = []
    stop_words = nltk.corpus.stopwords.words('english')
    word_tokens = nltk.word_tokenize(sentence)
    for token in word_tokens:
        if token not in stop_words:
            filtered_sentence.append(token)
    filtered_sentence = ' '.join(filtered_sentence)
    return filtered_sentence

def clean_sentence(sentence):
    sentence = sentence.lower()
    sentence = remove_stopwords(sentence)
    sentence = re.sub(r'\W', ' ', sentence)
    sentence = re.sub(r'\s+', ' ', sentence)
    return sentence

def clean_article_text(article_text):
    # Creating a corpus containing all the sentence tokens in the document.
    corpus = nltk.sent_tokenize(article_text)
    # Convert to lowercase, remove non-word characters (punctuations, etc.) and strip whitespaces
    for i in range(len(corpus)):
        corpus[i] = clean_sentence(corpus[i])
    return corpus

def create_word_freq_dictionary(corpus):
    # Create dictionary with word frequency
    word_freq = defaultdict(int)
    for sentence in corpus:
        word_tokens = nltk.word_tokenize(sentence)
        for token in word_tokens:
            word_freq[token] += 1
    return word_freq

def generate_sent_vec(sentence, most_freq_tokens):
    word_tokens = nltk.word_tokenize(sentence)
    sent_vec = []
    for token in most_freq_tokens:
        if token in word_tokens:
            sent_vec.append(1)
        else:
            sent_vec.append(0)
    return sent_vec

def get_sentence_vectors(corpus, most_freq_tokens):
    # Generate sentence vectors of 1's and 0's. Feature set is the most_freq_tokens list.
    sentence_vectors = []
    for sentence in corpus:
        sent_vec = generate_sent_vec(sentence, most_freq_tokens)
        sentence_vectors.append(sent_vec)
        
    sentence_vectors = np.asarray(sentence_vectors)
    return sentence_vectors

def get_answer(url, question):

    article_text = get_article_text(url)
    #print("Article Text: \n", article_text)
    initial_corpus = nltk.sent_tokenize(article_text)
    corpus = clean_article_text(article_text)

    word_freq = create_word_freq_dictionary(corpus)

    # Get the most frequent tokens from the dictionary
    most_freq_tokens = heapq.nlargest(200, word_freq, key=word_freq.get)

    sentence_vectors = get_sentence_vectors(corpus, most_freq_tokens)

    cleaned_question = clean_sentence(question)
    question_vector = generate_sent_vec(cleaned_question, most_freq_tokens)

    similarity_scores = []
    sent_vec_index = 0
    for sent_vec in sentence_vectors:
        similarity = 1 - sp.spatial.distance.cosine(question_vector, sent_vec)
        similarity_scores.append((sent_vec_index, similarity))
        sent_vec_index += 1
    similarity_scores.sort(key = lambda x: x[1], reverse=True)
    answer_index = similarity_scores[0][0]

    return initial_corpus[answer_index]


#### Model Evaluation

In [8]:
url = 'https://en.wikipedia.org/wiki/India'

article_text = get_article_text(url)
#print("Article Text: \n", article_text)

# Maintaining initial corpus for displaying answers:
initial_corpus = nltk.sent_tokenize(article_text)
# Clean the article sentence to remove extra whitespaces and reference numbers (such as "[23]")
for i in range(len(initial_corpus)):
    initial_corpus[i] = re.sub(r'\[\d+\]', '', initial_corpus[i])
    initial_corpus[i] = re.sub(r'\[\d+,\s\d+]', '', initial_corpus[i])
    initial_corpus[i] = re.sub(r'\[\w\]', '', initial_corpus[i])
    initial_corpus[i] = re.sub(r'\s+', ' ', initial_corpus[i]).strip()

# Generating a clean corpus to be fed to the model.
corpus = clean_article_text(article_text)

word_freq = create_word_freq_dictionary(corpus)

# Get the most frequent tokens from the dictionary
most_freq_tokens = heapq.nlargest(200, word_freq, key=word_freq.get)

sentence_vectors = get_sentence_vectors(corpus, most_freq_tokens)

In [25]:
print('\nSentence vector: {}'.format(sentence_vectors[4]))
print('\nVal questions:\n{}'.format(list(val_questions.items())[:3]))
print('\nVal answers:\n{}'.format(list(val_answers.items())[:3]))


Sentence vector: [0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]

Val questions:
[('0', 'What is India?'), ('1', 'When did modern humans arrive on the Indian subcontinent?'), ('10', 'When did the name Bharat gain increased currency as a native name for India?')]

Val answers:
[('0', 'What is India?'), ('1', 'When did modern humans arrive on the Indian subcontinent?'), ('2', 'Where were the Dravidian languages of India supplanted?')]


In [33]:
question = val_questions['12']

cleaned_question = clean_sentence(question)
question_vector = generate_sent_vec(cleaned_question, most_freq_tokens)

similarity_scores = []
sent_vec_index = 0
for sent_vec in sentence_vectors:
    similarity = 1 - sp.spatial.distance.cosine(question_vector, sent_vec)
    similarity_scores.append((sent_vec_index, similarity))
    sent_vec_index += 1
similarity_scores.sort(key = lambda x: x[1], reverse=True)
answer_index = similarity_scores[0][0]

initial_corpus[answer_index]

  dist = 1.0 - uv / np.sqrt(uu * vv)


'[75]\nDuring the period 2000–500 BCE, many regions of the subcontinent transitioned from the Chalcolithic cultures to the Iron Age ones.'

In [30]:
for qid, question in val_questions.items():
    

0   What is India?
1   When did modern humans arrive on the Indian subcontinent?
10   When did the name Bharat gain increased currency as a native name for India?
11   How many years ago does the earliest known modern human remain in India?
12   When did the regions of the Indian subcontinent transitioned from the Chalcolithic cultures to the Iron Age ones?
13   What does archaeological evidence suggest about the Deccan Plateau?
14   When did Jainism come into prominence?
15   How much area did the empire control in the subcontinent and what were its core regions? 
16   Which empire had created a complex system of administration and taxation in the greater Ganges Plain which became a model for later Indian kingdoms?
17   What defines the Indian early medieval age, 600 CE to 1200 CE?
18   Could any ruler of this period control lands beyond their core region?
19   What led to both the resurgence of Hinduism and the development of modern languages in the subcontinent?
2   Where were the D

In [None]:
cleaned_question = clean_sentence(question)
question_vector = generate_sent_vec(cleaned_question, most_freq_tokens)

similarity_scores = []
sent_vec_index = 0
for sent_vec in sentence_vectors:
    similarity = 1 - sp.spatial.distance.cosine(question_vector, sent_vec)
    similarity_scores.append((sent_vec_index, similarity))
    sent_vec_index += 1
similarity_scores.sort(key = lambda x: x[1], reverse=True)
answer_index = similarity_scores[0][0]

pred_ans =  initial_corpus[answer_index]