In [3]:
import itertools
import time
import string
import math
from random import randint
import matplotlib.pyplot as plt
import numpy as np
import scipy as sp
import urllib.request
import nltk
import re
from bs4 import BeautifulSoup
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity

import sys
sys.path.append('C:/Users/niraje/Documents/MLG/Web-QnA/modules')
import models_infersent

In [4]:
def clean_sentence(sentence):
    """Convert each sentence into lower case. 
    Extract English alphabets.
    Remove extra spaces.
    Strip leading/trailing whitespaces.
    """
    sentence = sentence.lower()
    sentence = re.sub(r'[^A-Za-z]', ' ', sentence)
    sentence = re.sub(r'\s+', ' ', sentence)
    sentence = sentence.strip()
    return sentence

### Fetch the article text

In [5]:
url = 'https://en.wikipedia.org/wiki/India'

In [6]:
# Scrape article using bs4 to extract all paragraphs from the online article.
raw_html = urllib.request.urlopen(url)
raw_html = raw_html.read()

article_html = BeautifulSoup(raw_html, 'lxml')
article_paragraphs = article_html.find_all('p')

In [7]:
# Creating a document 'article_text' containing all the sentences in the article.
article_text = ''
for para in article_paragraphs:
    article_text += para.text

In [8]:
# Tokenize article text into sentences.
article_sentences = nltk.sent_tokenize(article_text)

### Preprocess sentences for InferSent encodings

In [9]:
# Clean the article sentence to remove extra whitespaces and reference numbers (such as "[23]")

for i in range(len(article_sentences)):
    article_sentences[i] = re.sub(r'\[\d+\]', '', article_sentences[i])
    article_sentences[i] = re.sub(r'\[\w\]', '', article_sentences[i])
    article_sentences[i] = re.sub(r'\s+', ' ', article_sentences[i]).strip()
article_sentences[:5]

['India (Hindi: Bhārat), officially the Republic of India (Hindi: Bhārat Gaṇarājya), is a country in South Asia.',
 'It is the second-most populous country, the seventh-largest country by land area, and the most populous democracy in the world.',
 'Bounded by the Indian Ocean on the south, the Arabian Sea on the southwest, and the Bay of Bengal on the southeast, it shares land borders with Pakistan to the west; China, Nepal, and Bhutan to the north; and Bangladesh and Myanmar to the east.',
 'In the Indian Ocean, India is in the vicinity of Sri Lanka and the Maldives; its Andaman and Nicobar Islands share a maritime border with Thailand and Indonesia.',
 'Modern humans arrived on the Indian subcontinent from Africa no later than 55,000 years ago.']

### Implementation of InferSent Sentence Encoder

In [10]:
# import stuff
%load_ext autoreload
%autoreload 2
%matplotlib inline

from random import randint

import numpy as np
import torch

### Load the model:
Load infersent model (version 1) which has been trained on GloVe embeddings

In [11]:
# Load model
from models_infersent import InferSent
model_version = 1
MODEL_PATH = "../../InferSent/encoder/infersent%s.pkl" % model_version
params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': model_version}
model = InferSent(params_model)
model.load_state_dict(torch.load(MODEL_PATH))

<All keys matched successfully>

### Set the GloVe directory path

In [12]:
W2V_PATH = '../../word_vectors/glove/glove.6B.300d.txt'
model.set_w2v_path(W2V_PATH)

In [13]:
# Load embeddings of K most frequent words
model.build_vocab_k_words(K=100000)

Vocab size : 100000


In [14]:
# Number of article sentences to be encoded:
len(article_sentences)

443

### Encode sentences
* CPU Speed: ~100 sentences/sec
* GPU Speed: ~1000 sentences/sec

In [15]:
embeddings = model.encode(article_sentences, bsize=128, tokenize=False, verbose=True)
print('nb sentences encoded : {0}'.format(len(embeddings)))

Nb words kept : 7300/11310 (64.5%)
Speed : 49.9 sentences/s (cpu mode, bsize=128)
nb sentences encoded : 443


In [16]:
embeddings[:2]

array([[0.09396838, 0.07308353, 0.04056723, ..., 0.01159299, 0.        ,
        0.05563534],
       [0.04758248, 0.03204281, 0.04894754, ..., 0.00389264, 0.        ,
        0.02934591]], dtype=float32)

In [17]:
embeddings.shape

(443, 4096)

### Exploring cosine similarity between any 2 sentences in the article
Note: model.visualize(article_sentences[randint(0, len(article_sentences))]) throws an error

In [18]:
def cosine(u, v):
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))

def euclidean_dist(u, v):
    return math.sqrt(sum([(a - b) ** 2 for a, b in zip(u, v)]))

In [19]:
random_sent1 = article_sentences[randint(0, len(article_sentences))]
random_sent2 = article_sentences[randint(0, len(article_sentences))]

cosine_sim = cosine(model.encode([random_sent1])[0], model.encode([random_sent2])[0])
euclidean_d_value = euclidean_dist(model.encode([random_sent1])[0], model.encode([random_sent2])[0])

print("Sentence 1:\n{0}\n\nSentence 2:\n{1}\n".format(random_sent1, random_sent2))
print("Cosine similarity = {0}\nEuclidean Distance = {1}".format(cosine_sim, euclidean_d_value))

Sentence 1:
Coastal features include the marshy Rann of Kutch of western India and the alluvial Sundarbans delta of eastern India; the latter is shared with Bangladesh.

Sentence 2:
Constituted in such fashion, India lies to the north of the equator between 6° 44′ and 35° 30′ north latitude and 68° 7′ and 97° 25′ east longitude.

Cosine similarity = 0.857245683670044
Euclidean Distance = 2.879567688290825


# Single Vector Approach - Cosine Similarity

Encoding the question

In [20]:
def get_most_similar_sentences(question_vector, sent_count):
    """Returns the most similar sentences to the question vector.
    Similarity Coefficient used: Cosine Index
    Sentence count refers to number of most similar sentences to be returned.
    """
    most_sim_sentences = []
    for sent_index, sent_vector in enumerate(embeddings):
        most_sim_sentences.append((sent_index, cosine(question_vector, sent_vector))) # appending a tuple
    most_sim_sentences.sort(key = lambda x: x[1], reverse = True)
    if sent_count <= len(embeddings):
        return most_sim_sentences[:sent_count]
    else:
        return most_sim_sentences[:len(embeddings)]

### Q1

In [21]:
question = 'Which are the neighbouring countries to India?'
question = [question]
question_vector = model.encode(question, bsize=128, tokenize=False, verbose=True)[0]
print('Question vector: {0}'.format(question_vector))
print('nb sentences encoded : {0}'.format(len(question)))

Nb words kept : 5/9 (55.6%)
Speed : 25.7 sentences/s (cpu mode, bsize=128)
Question vector: [ 0.04600282  0.15134372 -0.04607344 ...  0.0321859  -0.02943331
  0.05108219]
nb sentences encoded : 1


Get 5 most similar sentences

In [22]:
most_sim_sentences = get_most_similar_sentences(question_vector, 5)
print(most_sim_sentences)

[(76, 0.8174522), (265, 0.81478786), (204, 0.8145509), (291, 0.8134547), (440, 0.8075874)]


Print the most similar sentences

In [23]:
for sent_index, similarity_score in most_sim_sentences:
    print('Sentence Index {}, Similarity Score = {}:\n{}\n'.format(sent_index, similarity_score,
                                                                        article_sentences[sent_index]))

Sentence Index 76, Similarity Score = 0.8174521923065186:
They were imitated all over India and led to both the resurgence of Hinduism and the development of all modern languages of the subcontinent.

Sentence Index 265, Similarity Score = 0.8147878646850586:
However, it has remained lower than those of other Asian developing countries like Indonesia, Malaysia, Philippines, Sri Lanka, and Thailand, and is expected to remain so in the near future.

Sentence Index 204, Similarity Score = 0.8145508766174316:
All states, as well as the union territories of Jammu and Kashmir, Puducherry and the National Capital Territory of Delhi, have elected legislatures and governments following the Westminster system of governance.

Sentence Index 291, Similarity Score = 0.8134546875953674:
Corruption in India is perceived to have decreased.

Sentence Index 440, Similarity Score = 0.8075873851776123:
India has traditionally been the dominant country at the South Asian Games.



### Q2

In [24]:
question = 'Which sports does India play?'
question = [question]
question_vector = model.encode(question, bsize=128, tokenize=False, verbose=True)[0]
print('Question vector: {0}'.format(question_vector))
print('nb sentences encoded : {0}'.format(len(question)))

Nb words kept : 2/7 (28.6%)
Speed : 55.7 sentences/s (cpu mode, bsize=128)
Question vector: [ 0.12260894 -0.02809364 -0.06930935 ... -0.04209305 -0.0116544
 -0.00440081]
nb sentences encoded : 1


In [25]:
most_sim_sentences = get_most_similar_sentences(question_vector, 5)
for sent_index, similarity_score in most_sim_sentences:
    print('Sentence Index {}, Similarity Score = {}:\n{}\n'.format(sent_index, similarity_score,
                                                                        article_sentences[sent_index]))

Sentence Index 435, Similarity Score = 0.6431796550750732:
Cricket is the most popular sport in India.

Sentence Index 427, Similarity Score = 0.6050891280174255:
In India, several traditional indigenous sports remain fairly popular, such as kabaddi, kho kho, pehlwani and gilli-danda.

Sentence Index 440, Similarity Score = 0.6028776168823242:
India has traditionally been the dominant country at the South Asian Games.

Sentence Index 291, Similarity Score = 0.5923976898193359:
Corruption in India is perceived to have decreased.

Sentence Index 433, Similarity Score = 0.5900624990463257:
Other sports in which Indians have succeeded internationally include badminton (Saina Nehwal and P V Sindhu are two of the top-ranked female badminton players in the world), boxing, and wrestling.



### Q3

In [26]:
question = 'Approximately how many Indians served in the First World War?'
question = [question]
question_vector = model.encode(question, bsize=128, tokenize=False, verbose=True)[0]
print('Question vector: {0}'.format(question_vector))
print('nb sentences encoded : {0}'.format(len(question)))

Nb words kept : 5/12 (41.7%)
Speed : 24.5 sentences/s (cpu mode, bsize=128)
Question vector: [ 0.09767815  0.07293532 -0.00092746 ...  0.00389264 -0.03927635
  0.02507453]
nb sentences encoded : 1


In [27]:
most_sim_sentences = get_most_similar_sentences(question_vector, 5)
for sent_index, similarity_score in most_sim_sentences:
    print('Sentence Index {}, Similarity Score = {}:\n{}\n'.format(sent_index, similarity_score,
                                                                        article_sentences[sent_index]))

Sentence Index 440, Similarity Score = 0.846401572227478:
India has traditionally been the dominant country at the South Asian Games.

Sentence Index 204, Similarity Score = 0.8330858945846558:
All states, as well as the union territories of Jammu and Kashmir, Puducherry and the National Capital Territory of Delhi, have elected legislatures and governments following the Westminster system of governance.

Sentence Index 335, Similarity Score = 0.8261755704879761:
In the 20th century, Indian literature was influenced by the works of the Bengali poet and novelist Rabindranath Tagore, who was a recipient of the Nobel Prize in Literature.

Sentence Index 215, Similarity Score = 0.8248085975646973:
In recent years, it has played key roles in the South Asian Association for Regional Cooperation and the World Trade Organization.

Sentence Index 76, Similarity Score = 0.82387375831604:
They were imitated all over India and led to both the resurgence of Hinduism and the development of all modern

### Q4

In [28]:
question = "What did the greek refer to Indians as?"
question = [question]
question_vector = model.encode(question, bsize=128, tokenize=False, verbose=True)[0]
print('Question vector: {0}'.format(question_vector))
print('nb sentences encoded : {0}'.format(len(question)))

Nb words kept : 5/10 (50.0%)
Speed : 24.5 sentences/s (cpu mode, bsize=128)
Question vector: [ 0.08511851  0.03973034  0.07982003 ...  0.00812555 -0.02943331
  0.00681409]
nb sentences encoded : 1


In [29]:
most_sim_sentences = get_most_similar_sentences(question_vector, 5)
for sent_index, similarity_score in most_sim_sentences:
    print('Sentence Index {}, Similarity Score = {}:\n{}\n'.format(sent_index, similarity_score,
                                                                        article_sentences[sent_index]))

Sentence Index 65, Similarity Score = 0.7895320057868958:
Under the Guptas, a renewed Hinduism based on devotion, rather than the management of ritual, began to assert itself.

Sentence Index 189, Similarity Score = 0.781525194644928:
In 1998, the BJP was able to form a successful coalition, the National Democratic Alliance (NDA).

Sentence Index 88, Similarity Score = 0.7803148031234741:
The resulting Mughal Empire did not stamp out the local societies it came to rule.

Sentence Index 38, Similarity Score = 0.7792684435844421:
The ancient Greeks referred to the Indians as Indoi (Ἰνδοί), which translates as "The people of the Indus".

Sentence Index 335, Similarity Score = 0.7666915059089661:
In the 20th century, Indian literature was influenced by the works of the Bengali poet and novelist Rabindranath Tagore, who was a recipient of the Nobel Prize in Literature.



## Model Evaluation - Single Vector - Cosine Distance

In [30]:
# these functions are heavily influenced by the HF squad_metrics.py script
def normalize_text(s):
    """Removing articles and punctuation, and standardizing whitespace are all typical text processing steps."""
    import string, re

    def remove_articles(text):
        regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
        return re.sub(regex, " ", text)

    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))

def compute_exact_match(prediction, truth):
    return int(normalize_text(prediction) == normalize_text(truth))

def compute_f1(prediction, truth):
    pred_tokens = normalize_text(prediction).split()
    truth_tokens = normalize_text(truth).split()
    #print(pred_tokens)
    #print(truth_tokens)
    
    # if either the prediction or the truth is no-answer then f1 = 1 if they agree, 0 otherwise
    if len(pred_tokens) == 0 or len(truth_tokens) == 0:
        return int(pred_tokens == truth_tokens)
    
    common_tokens = set(pred_tokens) & set(truth_tokens)
    
    # if there are no common tokens then f1 = 0
    if len(common_tokens) == 0:
        return 0
    #print(common_tokens)
    prec = len(common_tokens) / len(pred_tokens)
    rec = len(common_tokens) / len(truth_tokens)
    
    return 2 * (prec * rec) / (prec + rec)

In [31]:
import json

with open('val_questions.json', 'r') as fp:
    val_questions = json.load(fp)
    
with open('val_answers.json', 'r') as fp:
    val_answers = json.load(fp)

In [32]:
%%capture
pred_answers = dict()
for qid, question in val_questions.items():
    question = [question]
    question_vector = model.encode(question, bsize=128, tokenize=False, verbose=True)[0]
    most_sim_sentences = get_most_similar_sentences(question_vector, 5)
    # Consider the first most similar as the predicted answer for the qid.
    pred_answer_index = most_sim_sentences[0][0]
    pred_answers[qid] = article_sentences[pred_answer_index]

In [33]:
# Saving predictions
with open('infersent_glove_single_vector_cosine_pred_answers.json', 'w') as fp:
    json.dump(pred_answers, fp)

In [34]:
f1_scores = []
em_scores = []

for qid, pred_ans in pred_answers.items():
    true_ans = val_answers[qid]
    f1_score = compute_f1(pred_ans, true_ans)
    em_score = compute_exact_match(pred_ans, true_ans)
    
    f1_scores.append(f1_score)
    em_scores.append(em_score)

avg_f1 = sum(f1_scores) / len(f1_scores)
avg_em = sum(em_scores) / len(em_scores)

print('\nAvg F1 Score: {}'.format(avg_f1))
print('\nAvg EM Score: {}'.format(avg_em))


Avg F1 Score: 0.6797236977138025

Avg EM Score: 0.68


# -------------------------- Single Vector - Euclidean Distance ---------------------------

### Example:

In [35]:
def get_most_similar_sentences_euclidean(question_vector, sent_count):
    """Returns the most similar sentences to the question vector.
    Similarity Coefficient used: Euclidean Distance
    Sentence count refers to number of most similar sentences to be returned.
    """
    most_sim_sentences = []
    for sent_index, sent_vector in enumerate(embeddings):
        most_sim_sentences.append((sent_index, euclidean_dist(question_vector, sent_vector))) # appending a tuple
    most_sim_sentences.sort(key = lambda x: x[1], reverse = False) # sort direction = ascending
    if sent_count <= len(embeddings):
        return most_sim_sentences[:sent_count]
    else:
        print('Enter value less than or equal to {0}'.format(len(embeddings)))

In [36]:
question = 'Which are the neighbouring countries to India?'
question = [question]
question_vector = model.encode(question, bsize=128, tokenize=False, verbose=True)[0]
print('Question vector: {0}'.format(question_vector))
print('nb sentences encoded : {0}'.format(len(question)))

Nb words kept : 5/9 (55.6%)
Speed : 25.7 sentences/s (cpu mode, bsize=128)
Question vector: [ 0.04600282  0.15134372 -0.04607344 ...  0.0321859  -0.02943331
  0.05108219]
nb sentences encoded : 1


In [37]:
most_sim_sentences = get_most_similar_sentences_euclidean(question_vector, 5) # note the 2
print(most_sim_sentences)

[(291, 2.616865479370186), (440, 2.807772790459424), (231, 2.81859848188907), (105, 2.8372004735565755), (0, 2.867852769122867)]


In [38]:
i = 0
for sent_index, similarity_score in most_sim_sentences:
    i += 1
    print('{}. Sentence Index {}, Similarity Score = {}:\n{}\n'.format(i, sent_index, similarity_score,
                                                                        article_sentences[sent_index]))

1. Sentence Index 291, Similarity Score = 2.616865479370186:
Corruption in India is perceived to have decreased.

2. Sentence Index 440, Similarity Score = 2.807772790459424:
India has traditionally been the dominant country at the South Asian Games.

3. Sentence Index 231, Similarity Score = 2.81859848188907:
It comprises the Indian Army, the Indian Navy, the Indian Air Force, and the Indian Coast Guard.

4. Sentence Index 105, Similarity Score = 2.8372004735565755:
Technological changes—among them, railways, canals, and the telegraph—were introduced not long after their introduction in Europe.

5. Sentence Index 0, Similarity Score = 2.867852769122867:
India (Hindi: Bhārat), officially the Republic of India (Hindi: Bhārat Gaṇarājya), is a country in South Asia.



### Evaluation

In [39]:
import json

with open('val_questions.json', 'r') as fp:
    val_questions = json.load(fp)
    
with open('val_answers.json', 'r') as fp:
    val_answers = json.load(fp)

In [40]:
#%%capture
pred_answers_euclidean = dict()
for qid, question in val_questions.items():
    question = [question]
    question_vector = model.encode(question, bsize=128, tokenize=False, verbose=True)[0]
    most_sim_sentences = get_most_similar_sentences_euclidean(question_vector, 5) #notice euclidean
    # Consider the first most similar as the predicted answer for the qid.
    pred_answer_index = most_sim_sentences[0][0]
    pred_answers_euclidean[qid] = article_sentences[pred_answer_index]

Nb words kept : 1/5 (20.0%)
Speed : 100.7 sentences/s (cpu mode, bsize=128)
Nb words kept : 6/11 (54.5%)
Speed : 23.3 sentences/s (cpu mode, bsize=128)
Nb words kept : 11/16 (68.8%)
Speed : 14.1 sentences/s (cpu mode, bsize=128)
Nb words kept : 11/15 (73.3%)
Speed : 13.2 sentences/s (cpu mode, bsize=128)
Nb words kept : 12/20 (60.0%)
Speed : 12.2 sentences/s (cpu mode, bsize=128)
Nb words kept : 6/11 (54.5%)
Speed : 24.5 sentences/s (cpu mode, bsize=128)
Nb words kept : 3/8 (37.5%)
Speed : 43.7 sentences/s (cpu mode, bsize=128)
Nb words kept : 14/18 (77.8%)
Speed : 10.8 sentences/s (cpu mode, bsize=128)
Nb words kept : 19/26 (73.1%)
Speed : 8.2 sentences/s (cpu mode, bsize=128)
Nb words kept : 7/14 (50.0%)
Speed : 13.4 sentences/s (cpu mode, bsize=128)
Nb words kept : 10/14 (71.4%)
Speed : 15.7 sentences/s (cpu mode, bsize=128)
Nb words kept : 14/19 (73.7%)
Speed : 11.1 sentences/s (cpu mode, bsize=128)
Nb words kept : 4/10 (40.0%)
Speed : 35.8 sentences/s (cpu mode, bsize=128)
Nb word

In [43]:
# Saving predictions
with open('infersent_glove_single_vector_euclidean_pred_answers.json', 'w') as fp:
    json.dump(pred_answers_euclidean, fp)

In [44]:
f1_scores = []
em_scores = []

for qid, pred_ans in pred_answers_euclidean.items():
    true_ans = val_answers[qid]
    f1_score = compute_f1(pred_ans, true_ans)
    em_score = compute_exact_match(pred_ans, true_ans)
    
    f1_scores.append(f1_score)
    em_scores.append(em_score)

avg_f1 = sum(f1_scores) / len(f1_scores)
avg_em = sum(em_scores) / len(em_scores)

print('\nAvg F1 Score: {}'.format(avg_f1))
print('\nAvg EM Score: {}'.format(avg_em))


Avg F1 Score: 0.6005474560255606

Avg EM Score: 0.59
