### Modularized code for context generation based on the web page content and the input question

In [1]:
import time
import string
import math
from random import randint
import matplotlib.pyplot as plt
import numpy as np
import scipy as sp
import urllib.request
import nltk
import re
from bs4 import BeautifulSoup
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity

# For the model
#%load_ext autoreload
#%autoreload 2
#%matplotlib inline

from random import randint

import numpy as np
import torch

from models import InferSent

In [2]:
def get_sentence_tokens(url):
    """Download a web page and return the text of its paragraphs as cleaned sentences.

    Parameters
    ----------
    url : str
        Address of the article, passed to ``urllib.request.urlopen``.

    Returns
    -------
    list of str
        Sentences produced by ``nltk.sent_tokenize`` over the text of every
        ``<p>`` element. Bracketed reference markers such as "[23]" or "[a]"
        are removed, runs of whitespace are collapsed to a single space, and
        sentences that end up empty after cleaning are dropped.

    Any error raised by ``urlopen`` (bad URL, network failure) propagates to
    the caller unchanged.
    """
    # Use the response as a context manager so the connection is always closed.
    with urllib.request.urlopen(url) as response:
        raw_html = response.read()

    article_html = BeautifulSoup(raw_html, 'lxml')
    article_paragraphs = article_html.find_all('p')

    # Join with a space rather than concatenating directly: without a separator
    # the end of one paragraph can be glued to the start of the next
    # ("...ends here.Next begins..."), hiding the sentence boundary from the
    # tokenizer. Extra whitespace is collapsed again during cleaning below.
    article_text = ' '.join(para.text for para in article_paragraphs)

    article_sentences = []
    for sentence in nltk.sent_tokenize(article_text):
        # Strip numeric references ("[23]") and single-character notes ("[a]").
        sentence = re.sub(r'\[\d+\]', '', sentence)
        sentence = re.sub(r'\[\w\]', '', sentence)
        sentence = re.sub(r'\s+', ' ', sentence).strip()
        # A sentence made only of markers/whitespace carries no content.
        if sentence:
            article_sentences.append(sentence)

    return article_sentences

In [3]:
def cosine(u, v):
    """Return the cosine similarity between vectors ``u`` and ``v``.

    Computed as ``dot(u, v) / (||u|| * ||v||)``. When either vector has zero
    norm the ratio is undefined (0/0 would yield NaN and a RuntimeWarning,
    which in turn makes similarity-based sorting unreliable), so 0.0 is
    returned instead.
    """
    norm_product = np.linalg.norm(u) * np.linalg.norm(v)
    if norm_product == 0:
        return 0.0
    return np.dot(u, v) / norm_product

def euclidean_dist(u, v):
    """Return the Euclidean distance between ``u`` and ``v``.

    Components are paired with ``zip``, so if the inputs differ in length the
    extra trailing components of the longer one are ignored.
    """
    squared_gaps = ((p - q) ** 2 for p, q in zip(u, v))
    return math.sqrt(sum(squared_gaps))

In [4]:
def get_most_similar_sentences(question_vector, embeddings, sent_count):
    """Rank sentence embeddings by cosine similarity to the question vector.

    Parameters
    ----------
    question_vector : embedding of the question.
    embeddings : iterable of sentence embeddings; must support ``len``.
    sent_count : number of top-ranked sentences to return.

    Returns
    -------
    list of (int, score) tuples
        ``(sentence index, cosine similarity)`` pairs, highest similarity
        first, truncated to ``sent_count`` entries.

    Raises
    ------
    AssertionError
        If ``sent_count`` exceeds the number of embeddings.
    """
    scored = [
        (position, cosine(question_vector, vector))  # (index, similarity)
        for position, vector in enumerate(embeddings)
    ]
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)

    assert sent_count <= len(embeddings), 'Enter sent_count value less than or equal to {0}'.format(len(embeddings))
    return ranked[:sent_count]

In [5]:
def prepare_model(model_version=1, model_path=None, w2v_path=None, vocab_size=100000):
    """Build an InferSent encoder, load its weights and word-vector vocabulary.

    All arguments are optional; the defaults reproduce the original
    hard-coded configuration.

    Parameters
    ----------
    model_version : int
        Value placed in the model parameters under ``'version'`` and used to
        build the default checkpoint file name.
    model_path : str or None
        Checkpoint to load. Defaults to
        ``"../InferSent/encoder/infersent<model_version>.pkl"``.
    w2v_path : str or None
        Word-vector file handed to ``model.set_w2v_path``. Defaults to the
        GloVe 300d file.
        NOTE(review): the default is GloVe regardless of ``model_version``;
        confirm which vectors a non-default version expects and pass
        ``w2v_path`` explicitly.
    vocab_size : int
        ``K`` passed to ``model.build_vocab_k_words``.

    Returns
    -------
    The configured ``InferSent`` model.
    """
    if model_path is None:
        model_path = "../InferSent/encoder/infersent%s.pkl" % model_version
    if w2v_path is None:
        w2v_path = '../word_vectors/glove/glove.6B.300d.txt'

    params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                    'pool_type': 'max', 'dpout_model': 0.0, 'version': model_version}
    model = InferSent(params_model)

    # torch.load unpickles the file, so only point it at checkpoints you trust.
    model.load_state_dict(torch.load(model_path))

    # Register the word-vector file, then load embeddings of the K most frequent words.
    model.set_w2v_path(w2v_path)
    model.build_vocab_k_words(K=vocab_size)

    return model

In [6]:
def generate_context(url, question, model=None, sent_count=30, max_token_count=400):
    """Build a context paragraph for ``question`` from the article at ``url``.

    The article is split into sentences, every sentence and the question are
    encoded with the sentence-embedding model, and the sentences most similar
    to the question are concatenated (most similar first) while the running
    token count stays below ``max_token_count``.

    Parameters
    ----------
    url : str
        Address of the article to read.
    question : str
        Question the context should help answer.
    model : optional
        Already prepared encoder exposing ``encode(...)``. When omitted,
        ``prepare_model()`` is called, which reloads the model on every call;
        pass a prepared model to avoid that cost when asking several questions.
    sent_count : int
        Maximum number of top-ranked sentences to consider. It is clamped to
        the number of sentences available, so short articles no longer trip
        the assertion in ``get_most_similar_sentences``.
    max_token_count : int
        Exclusive upper bound on the number of ``nltk.word_tokenize`` tokens
        in the returned context. Choose it so that question plus context stay
        below 512 tokens.

    Returns
    -------
    str
        The selected sentences joined by single spaces, or ``''`` when the
        page yields no sentences.
    """
    # Get the sentence tokens for the entire article text.
    article_sentences = get_sentence_tokens(url)
    if not article_sentences:
        # Nothing to rank; skip the model load and the encoder entirely.
        return ''

    # Reuse the caller's model when given; otherwise prepare a fresh one.
    if model is None:
        model = prepare_model()

    # Encode the article sentences and the question (encode expects a list).
    embeddings = model.encode(article_sentences, bsize=128, tokenize=False, verbose=True)
    question_vector = model.encode([question], bsize=128, tokenize=False, verbose=True)[0]

    # Never ask for more sentences than the article actually has.
    sent_count = min(sent_count, len(embeddings))
    most_sim_sentences = get_most_similar_sentences(question_vector, embeddings, sent_count=sent_count)

    # Build the context greedily: a sentence that would overflow the budget is
    # skipped, but later (shorter) sentences may still be added.
    context_list = []
    context_token_count = 0
    for sent_index, _similarity_score in most_sim_sentences:
        sentence = article_sentences[sent_index]
        sent_token_count = len(nltk.word_tokenize(sentence))
        if context_token_count + sent_token_count < max_token_count:
            context_list.append(sentence)
            context_token_count += sent_token_count

    return ' '.join(context_list)

In [7]:
# Demo run: generate a context paragraph for one question about one article.
url = 'https://en.wikipedia.org/wiki/India'
question = 'Which sports does India play?'

# Wall-clock timing around the whole generate_context call.
start_time = time.time()
context_para = generate_context(url, question)
end_time = time.time()
elapsed_seconds = end_time - start_time

print('\nExecution time: ', elapsed_seconds, ' seconds')
print('\n\nContext generated:\n', context_para)

Vocab size : 100000
Nb words kept : 7295/11303 (64.5%)
Speed : 50.9 sentences/s (cpu mode, bsize=128)
Nb words kept : 2/7 (28.6%)
Speed : 50.1 sentences/s (cpu mode, bsize=128)

Execution time:  29.600441932678223  seconds


Context generated:
 Cricket is the most popular sport in India. Other sports in which Indians have succeeded internationally include badminton (Saina Nehwal and P V Sindhu are two of the top-ranked female badminton players in the world), boxing, and wrestling. In India, several traditional indigenous sports remain fairly popular, such as kabaddi, kho kho, pehlwani and gilli-danda. In 1998, the BJP was able to form a successful coalition, the National Democratic Alliance (NDA). Corruption in India is perceived to have decreased. India has no national language. Major domestic competitions include the Indian Premier League, which is the most-watched cricket league in the world and ranks sixth among all sports leagues. Under the Guptas, a renewed Hinduism based on devo