In [1]:
import itertools
import time
import string
import math
from random import randint
import matplotlib.pyplot as plt
import numpy as np
import scipy as sp
import urllib.request
import nltk
import re
from bs4 import BeautifulSoup
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity

### Fetch the article text

In [5]:
# Address of the Wikipedia article that is downloaded and split into sentences below.
url = 'https://en.wikipedia.org/wiki/India'

In [6]:
# Scrape the article using bs4 to extract all paragraphs from the online article.
# The HTTP response is opened in a `with` block so the underlying connection is
# always closed, even if reading the body fails part-way through.
with urllib.request.urlopen(url) as response:
    raw_html = response.read()

# Parse the downloaded bytes with the 'lxml' parser and collect every <p> element.
article_html = BeautifulSoup(raw_html, 'lxml')
article_paragraphs = article_html.find_all('p')

In [7]:
# Build a single document string, 'article_text', by concatenating the text of
# every paragraph in the order it appears in the article (no separator added).
article_text = ''.join(para.text for para in article_paragraphs)

In [8]:
# Split the full article text into a list of individual sentences using NLTK's
# sentence tokenizer.
article_sentences = nltk.sent_tokenize(article_text)

### Preprocess sentences for InferSent encodings

In [9]:
# Clean every sentence in place: drop bracketed reference markers (numeric ones
# such as "[23]" and single-character ones such as "[a]"), then collapse runs of
# whitespace into one space and trim both ends.
for position, sentence in enumerate(article_sentences):
    without_numeric_refs = re.sub(r'\[\d+\]', '', sentence)
    without_refs = re.sub(r'\[\w\]', '', without_numeric_refs)
    article_sentences[position] = re.sub(r'\s+', ' ', without_refs).strip()
article_sentences[:5]

['India (Hindi: Bhārat), officially the Republic of India (Hindi: Bhārat Gaṇarājya), is a country in South Asia.',
 'It is the second-most populous country, the seventh-largest country by land area, and the most populous democracy in the world.',
 'Bounded by the Indian Ocean on the south, the Arabian Sea on the southwest, and the Bay of Bengal on the southeast, it shares land borders with Pakistan to the west; China, Nepal, and Bhutan to the north; and Bangladesh and Myanmar to the east.',
 'In the Indian Ocean, India is in the vicinity of Sri Lanka and the Maldives; its Andaman and Nicobar Islands share a maritime border with Thailand and Indonesia.',
 'Modern humans arrived on the Indian subcontinent from Africa no later than 55,000 years ago.']

### Implementation of InferSent Sentence Encoder

In [10]:
# import stuff
%load_ext autoreload
%autoreload 2
%matplotlib inline

from random import randint

import numpy as np
import torch

### Load the model:
Load the InferSent model (version 1), which was trained on GloVe embeddings.

In [11]:
# Load the pre-trained InferSent sentence encoder.
from models import InferSent
# Encoder version; it selects the weights file name below and is also passed to
# the model through params_model['version'].
model_version = 1
MODEL_PATH = "../InferSent/encoder/infersent%s.pkl" % model_version
# Constructor parameters for the encoder.
# NOTE(review): these presumably have to match the configuration the weights
# were trained with — confirm before changing any of them.
params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': model_version}
model = InferSent(params_model)
# NOTE(review): torch.load unpickles the file, which can execute arbitrary code;
# only load weight files obtained from a trusted source.
model.load_state_dict(torch.load(MODEL_PATH))

<All keys matched successfully>

### Set the GloVe directory path

In [12]:
# Tell the encoder which word-vector file to read its word embeddings from.
# NOTE(review): the InferSent README pairs the version-1 model with
# glove.840B.300d, while this path points at glove.6B.300d — confirm this is
# intended; the mismatch may contribute to the low "Nb words kept" ratios
# reported when encoding.
W2V_PATH = '../word_vectors/glove/glove.6B.300d.txt'
model.set_w2v_path(W2V_PATH)

In [13]:
# Load the embeddings of the K most frequent words into the model's vocabulary
# (K = 100,000 here; the resulting vocabulary size is printed by the call).
model.build_vocab_k_words(K=100000)

Vocab size : 100000


In [14]:
# Number of cleaned article sentences that will be passed to the encoder.
len(article_sentences)

443

### Encode sentences
* CPU Speed: ~100 sentences/sec
* GPU Speed: ~1000 sentences/sec

In [15]:
# Encode every cleaned article sentence; `embeddings` holds one vector per sentence.
# tokenize=False is passed through to the encoder unchanged.
# NOTE(review): presumably this makes the encoder split on whitespace only, so
# punctuation stays attached to words — confirm against the InferSent source.
embeddings = model.encode(
    article_sentences,
    bsize=128,
    tokenize=False,
    verbose=True,
)
print('nb sentences encoded : {0}'.format(len(embeddings)))

Nb words kept : 7293/11301 (64.5%)
Speed : 57.8 sentences/s (cpu mode, bsize=128)
nb sentences encoded : 443


In [16]:
# Preview the vectors of the first two sentences.
embeddings[:2]

array([[0.09396838, 0.07308353, 0.04056723, ..., 0.01159299, 0.        ,
        0.05563534],
       [0.04758248, 0.03204281, 0.04894754, ..., 0.00389264, 0.        ,
        0.02934591]], dtype=float32)

In [17]:
# Shape of the embedding matrix: (number of sentences, embedding dimension).
embeddings.shape

(443, 4096)

### Exploring cosine similarity between any 2 sentences in the article
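Note: `model.visualize(article_sentences[randint(0, len(article_sentences))])` throws an error. (Keep in mind that `randint` includes its upper bound, so this index can be one past the end of the list.)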

In [18]:
def cosine(u, v):
    """Return the cosine similarity between two vectors.

    Args:
        u: 1-D array-like of numbers.
        v: 1-D array-like of numbers with the same length as ``u``.

    Returns:
        ``dot(u, v) / (||u|| * ||v||)``. If either vector has zero norm the
        similarity is undefined; 0.0 is returned in that case instead of a NaN
        produced by a 0/0 division, so the result can always be compared and
        sorted.
    """
    norm_product = np.linalg.norm(u) * np.linalg.norm(v)
    # A zero-length vector has no direction: avoid dividing by zero.
    if norm_product == 0:
        return 0.0
    return np.dot(u, v) / norm_product

def euclidean_dist(u, v):
    """Return the Euclidean (L2) distance between two vectors.

    Args:
        u: 1-D array-like of numbers.
        v: 1-D array-like of numbers with the same shape as ``u``.

    Returns:
        float: ``sqrt(sum((u_i - v_i) ** 2))``.

    Raises:
        ValueError: If ``u`` and ``v`` do not have the same shape. Pairing the
            elements up with ``zip`` would silently ignore the extra elements
            of the longer vector.
    """
    u_arr = np.asarray(u, dtype=float)
    v_arr = np.asarray(v, dtype=float)
    if u_arr.shape != v_arr.shape:
        raise ValueError(
            'Vectors must have the same shape, got {0} and {1}'.format(
                u_arr.shape, v_arr.shape))
    # Vectorized norm instead of a per-element Python loop; float() keeps the
    # return type a plain Python float.
    return float(np.linalg.norm(u_arr - v_arr))

In [19]:
# Pick two sentences at random. randint() includes BOTH endpoints, so the upper
# bound has to be the last valid index (len - 1); passing len itself can select
# an index one past the end of the list and raise IndexError.
last_index = len(article_sentences) - 1
random_sent1 = article_sentences[randint(0, last_index)]
random_sent2 = article_sentences[randint(0, last_index)]

# Encode each sentence once and reuse the vectors for both metrics.
random_vec1 = model.encode([random_sent1])[0]
random_vec2 = model.encode([random_sent2])[0]

cosine_sim = cosine(random_vec1, random_vec2)
euclidean_d_value = euclidean_dist(random_vec1, random_vec2)

print("Sentence 1:\n{0}\n\nSentence 2:\n{1}\n".format(random_sent1, random_sent2))
print("Cosine similarity = {0}\nEuclidean Distance = {1}".format(cosine_sim, euclidean_d_value))

Sentence 1:
Indian movies, music, and spiritual teachings play an increasing role in global culture.

Sentence 2:
Historians consider India's modern age to have begun sometime between 1848 and 1885.

Cosine similarity = 0.8358312845230103
Euclidean Distance = 2.948698259397073


#### Function to find the 'N' most similar sentences to the question sentence

In [20]:
def get_most_similar_sentences(question_vector, sent_count, sentence_embeddings=None):
    """Return the sentences most similar to the question vector, best match first.

    Similarity coefficient used: cosine similarity.

    Args:
        question_vector: 1-D embedding of the question.
        sent_count: Number of most similar sentences to return.
        sentence_embeddings: Optional 2-D array-like holding one sentence
            embedding per row. Defaults to the notebook-level ``embeddings``.

    Returns:
        list: ``(sentence_index, cosine_similarity)`` tuples ordered by
        decreasing similarity; sentences with equal scores keep ascending
        index order.

    Raises:
        ValueError: If ``sent_count`` is negative or larger than the number of
            available sentence embeddings.
    """
    if sentence_embeddings is None:
        sentence_embeddings = embeddings
    sentence_matrix = np.asarray(sentence_embeddings)
    total = len(sentence_matrix)

    # Validate before doing any work, and fail loudly: printing a message and
    # implicitly returning None only defers the failure to the caller.
    if not 0 <= sent_count <= total:
        raise ValueError(
            'sent_count must be between 0 and {0}, got {1}'.format(total, sent_count))
    if sent_count == 0:
        return []

    # Cosine similarity of the question against every sentence in one pass.
    query = np.asarray(question_vector)
    norm_products = np.linalg.norm(sentence_matrix, axis=1) * np.linalg.norm(query)
    scores = sentence_matrix.dot(query) / norm_products

    # A stable sort on the negated scores gives descending order while keeping
    # tied sentences in their original (index) order.
    ranked = np.argsort(-scores, kind='stable')[:sent_count]
    return [(int(idx), scores[idx]) for idx in ranked]

### Testing for a question

In [115]:
# Encode the question.
# The question is wrapped in a one-item list before being handed to the encoder,
# and the single resulting vector is taken from position 0 of the output.
question = ['Which sports does India play?']
question_vector = model.encode(
    question,
    bsize=128,
    tokenize=False,
    verbose=True,
)[0]
print('Question vector: {0}'.format(question_vector))
print('nb sentences encoded : {0}'.format(len(question)))

Nb words kept : 2/7 (28.6%)
Speed : 21.2 sentences/s (cpu mode, bsize=128)
Question vector: [ 0.12260894 -0.02809364 -0.06930935 ... -0.04209305 -0.0116544
 -0.00440081]
nb sentences encoded : 1


Get the 30 most similar sentences

In [116]:
# Rank the article sentences against the question and keep the 30 best matches
# as (sentence index, cosine similarity) tuples, most similar first.
most_sim_sentences = get_most_similar_sentences(question_vector, 30)
print(most_sim_sentences)

[(435, 0.64317966), (427, 0.6050891), (291, 0.5923977), (433, 0.5900625), (313, 0.5883014), (436, 0.5875959), (390, 0.5873149), (65, 0.58635235), (267, 0.58587784), (396, 0.58271354), (88, 0.57939845), (321, 0.5772279), (357, 0.5771875), (372, 0.5771079), (441, 0.5763645), (363, 0.5749421), (386, 0.5719792), (368, 0.5717528), (185, 0.5707856), (440, 0.56976664), (350, 0.5690096), (389, 0.56888735), (262, 0.5687984), (378, 0.5686894), (191, 0.5679551), (276, 0.5666871), (203, 0.56623876), (353, 0.5659143), (67, 0.5653413), (175, 0.56526786)]


Print the most similar sentences

In [117]:
# Show each selected sentence together with its index and similarity score.
for sent_index, similarity_score in most_sim_sentences:
    matched_sentence = article_sentences[sent_index]
    report = 'Sentence Index {}, Similarity Score = {}:\n{}\n'.format(
        sent_index, similarity_score, matched_sentence)
    print(report)

Sentence Index 435, Similarity Score = 0.6431796550750732:
Cricket is the most popular sport in India.

Sentence Index 427, Similarity Score = 0.6050891280174255:
In India, several traditional indigenous sports remain fairly popular, such as kabaddi, kho kho, pehlwani and gilli-danda.

Sentence Index 291, Similarity Score = 0.5923976898193359:
Corruption in India is perceived to have decreased.

Sentence Index 433, Similarity Score = 0.5900624990463257:
Other sports in which Indians have succeeded internationally include badminton (Saina Nehwal and P V Sindhu are two of the top-ranked female badminton players in the world), boxing, and wrestling.

Sentence Index 313, Similarity Score = 0.588301420211792:
India has no national language.

Sentence Index 436, Similarity Score = 0.5875958800315857:
Major domestic competitions include the Indian Premier League, which is the most-watched cricket league in the world and ranks sixth among all sports leagues.

Sentence Index 390, Similarity Sco

### Building the "context" paragraph for the question

In [123]:
# Sentences chosen for the context paragraph, in selection order.
context_list = []
# Running total of word tokens across the chosen sentences.
context_token_count = 0
# Token budget: a sentence is added only while the total stays below this value.
max_token_count = 400

In [124]:
# Start from empty accumulators so that re-running this cell rebuilds the
# context from scratch instead of appending the same sentences a second time.
context_list = []
context_token_count = 0

# Walk the sentences from most to least similar and add every sentence that
# still fits. A sentence that does not fit is skipped, and later (shorter)
# sentences may still be added.
for sent_index, similarity_score in most_sim_sentences:
    sent_token_count = len(nltk.word_tokenize(article_sentences[sent_index]))
    # Strict '<': the running total always stays below max_token_count.
    if context_token_count + sent_token_count < max_token_count:
        context_list.append(article_sentences[sent_index])
        context_token_count += sent_token_count

print(len(context_list))

18


In [125]:
# Join the selected sentences, separated by single spaces, into one context paragraph.
context_para = ' '.join(context_list)

In [126]:
# Display the assembled context paragraph.
context_para

"Cricket is the most popular sport in India. In India, several traditional indigenous sports remain fairly popular, such as kabaddi, kho kho, pehlwani and gilli-danda. Corruption in India is perceived to have decreased. Other sports in which Indians have succeeded internationally include badminton (Saina Nehwal and P V Sindhu are two of the top-ranked female badminton players in the world), boxing, and wrestling. India has no national language. Major domestic competitions include the Indian Premier League, which is the most-watched cricket league in the world and ranks sixth among all sports leagues. The kameez may have a European-style collar, a Mandarin-collar, or it may be collarless; in the latter case, its design as a women's garment is similar to a kurta. Under the Guptas, a renewed Hinduism based on devotion, rather than the management of ritual, began to assert itself. According to a 2011 PricewaterhouseCoopers (PwC) report, India's GDP at purchasing power parity could overtake

In [127]:
# Token count of the final paragraph, as a check against max_token_count.
len(nltk.word_tokenize(context_para))

396