### Baseline gensim word2vec and doc2vec models:
**Without using any pre-trained embeddings and trained only on the article text.**

In [162]:
import itertools
import time
import string
import random
import numpy as np
import scipy as sp
import urllib.request
import nltk
import re
import heapq
from bs4 import BeautifulSoup
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec

In [163]:
def clean_sentence(sentence):
    """Normalise a raw sentence to lower-case English letters separated by single spaces.

    Steps, in order:
      1. Lower-case the text.
      2. Replace every character that is not an English letter with a space
         (digits, punctuation and non-ASCII letters are all dropped).
      3. Squeeze each run of whitespace down to one space.
      4. Trim leading/trailing whitespace.

    Returns the cleaned sentence as a single string; it is NOT tokenized here.
    """
    letters_only = re.sub(r'[^A-Za-z]', ' ', sentence.lower())
    return re.sub(r'\s+', ' ', letters_only).strip()

### Pre-processing

In [164]:
# Page to scrape: the English Wikipedia article on India. Its paragraph text is the only training data used below.
url = 'https://en.wikipedia.org/wiki/India'

In [165]:
# Scrape article using bs4 to extract all paragraphs from the online article.
# The response is opened in a `with` block so the underlying HTTP connection is
# closed as soon as the body has been read (it was previously left open).
with urllib.request.urlopen(url) as response:
    raw_html = response.read()

# Parse the downloaded bytes with the lxml backend and collect every <p> element.
article_html = BeautifulSoup(raw_html, 'lxml')
article_paragraphs = article_html.find_all('p')

In [166]:
# Build a single document, 'article_text', by concatenating the text of every
# scraped paragraph in page order (no separator is inserted between paragraphs).
article_text = ''.join(para.text for para in article_paragraphs)

In [167]:
# Tokenize article text into sentences (once; the result was previously computed twice).
# Three index-aligned lists are produced:
#   initial_article_sentences - the untouched sentences, used later for display
#   article_sentences         - the same sentences after clean_sentence()
#   corpus                    - the word tokens of each cleaned sentence (a list of lists)
initial_article_sentences = nltk.sent_tokenize(article_text)
article_sentences = [clean_sentence(sentence) for sentence in initial_article_sentences]
corpus = [nltk.word_tokenize(sentence) for sentence in article_sentences]

In [168]:
# Remove stopwords
# `stop_words` is kept exactly as NLTK returns it because later cells reference it;
# the frozenset copy is only used here so each membership test is a hash lookup
# rather than a scan of the whole stop-word collection.
stop_words = nltk.corpus.stopwords.words('english')
_stop_word_lookup = frozenset(stop_words)
corpus = [
    [word for word in tokens if word not in _stop_word_lookup]
    for tokens in corpus
]

### Word2vec: Word Embeddings
Exploring performance at the word level

#### min_count = 2
Setting `min_count=2` keeps only those words that appear at least twice in the corpus in the Word2Vec vocabulary; rarer words are ignored during training.

In [169]:
# Train on the tokenized, stop-word-free sentences, ignoring words seen fewer than 2 times.
word2vec_model1 = Word2Vec(corpus, min_count=2)

# gensim >= 4.0 removed `wv.vocab`; the word -> index mapping now lives in
# `wv.key_to_index`. Fall back to `wv.vocab` on older releases so this cell runs on both.
_word_vectors = word2vec_model1.wv
vocabulary = (
    _word_vectors.key_to_index
    if hasattr(_word_vectors, 'key_to_index')
    else _word_vectors.vocab
)

In [170]:
# Report the vocabulary size, then display its first five entries as a dict.
print(len(vocabulary))
dict(itertools.islice(vocabulary.items(), 5))

1002


{'india': <gensim.models.keyedvectors.Vocab at 0x20fbb9c2cc8>,
 'republic': <gensim.models.keyedvectors.Vocab at 0x20fb7f06ac8>,
 'hindi': <gensim.models.keyedvectors.Vocab at 0x20fb7f060c8>,
 'bh': <gensim.models.keyedvectors.Vocab at 0x20fb7f06108>,
 'rat': <gensim.models.keyedvectors.Vocab at 0x20fb7f061c8>}

In [171]:
# Words the min_count=2 model ranks closest to 'india', with their similarity scores.
word2vec_model1.wv.most_similar('india')

[('worn', 0.46601375937461853),
 ('recorded', 0.38692715764045715),
 ('indian', 0.3731222450733185),
 ('front', 0.3705996870994568),
 ('regional', 0.36664581298828125),
 ('became', 0.3634414076805115),
 ('regions', 0.3467880189418793),
 ('ones', 0.3400145173072815),
 ('new', 0.33813297748565674),
 ('number', 0.33606159687042236)]

In [172]:
# Same query for 'river' against the min_count=2 model.
word2vec_model1.wv.most_similar('river')

[('wide', 0.30606400966644287),
 ('slow', 0.2742408215999603),
 ('traditionally', 0.2682422995567322),
 ('china', 0.2664911150932312),
 ('developed', 0.26531141996383667),
 ('peninsular', 0.2538667917251587),
 ('kerala', 0.25263863801956177),
 ('vijayanagara', 0.25194621086120605),
 ('ganges', 0.24134430289268494),
 ('coasts', 0.2312953919172287)]

#### min_count = 5
Train model with words which appear at least 5 times in the corpus.

In [173]:
# Second model on the same corpus; only min_count differs (5 instead of 2).
word2vec_model2 = Word2Vec(corpus, min_count=5)

In [174]:
# Words the min_count=5 model ranks closest to 'india'; compare with the min_count=2 result above.
word2vec_model2.wv.most_similar('india')

[('indian', 0.5393590927124023),
 ('worn', 0.5122436881065369),
 ('increased', 0.49107420444488525),
 ('regions', 0.48476243019104004),
 ('bce', 0.4690588712692261),
 ('body', 0.45259493589401245),
 ('punjab', 0.4465717673301697),
 ('caused', 0.43843165040016174),
 ('indus', 0.42407089471817017),
 ('regional', 0.42382320761680603)]

In [175]:
# Same query for 'river' against the min_count=5 model.
word2vec_model2.wv.most_similar('river')

[('traditionally', 0.32871970534324646),
 ('wide', 0.30972668528556824),
 ('indus', 0.3035282492637634),
 ('peninsular', 0.28665006160736084),
 ('ganges', 0.2729552984237671),
 ('china', 0.26684269309043884),
 ('movement', 0.26488956809043884),
 ('india', 0.25716936588287354),
 ('billion', 0.25153660774230957),
 ('languages', 0.24989700317382812)]

### Doc2vec: Sentence Embeddings

In [176]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

Generate a tagged corpus as training data to the doc2vec model i.e. each corpus list (containing the tokens of the sentence) is tagged with an integer ID.

In [178]:
# Wrap each sentence's token list in a TaggedDocument whose single tag is the
# sentence's position in `corpus`, so a tag can be mapped back to its sentence.
tagged_corpus = [
    TaggedDocument(words=tokens, tags=[sentence_id])
    for sentence_id, tokens in enumerate(corpus)
]

Training doc2vec on the tagged corpus:

In [179]:
# Train the sentence-embedding model on the tagged sentences.
doc2vec_model = Doc2Vec(
    tagged_corpus,
    vector_size=20,  # dimensionality of each sentence/word vector
    window=2,        # context window size
    min_count=1,     # keep every word, even those seen only once
    workers=4,       # number of training threads
    epochs=100,      # passes over the corpus
)

Peeking at the vocabulary:

In [180]:
# gensim >= 4.0 removed `wv.vocab`; the word -> index mapping now lives in
# `wv.key_to_index`. Fall back to `wv.vocab` on older releases so this cell runs on both.
_doc2vec_word_vectors = doc2vec_model.wv
doc2vec_vocabulary = (
    _doc2vec_word_vectors.key_to_index
    if hasattr(_doc2vec_word_vectors, 'key_to_index')
    else _doc2vec_word_vectors.vocab
)
# Display the first six vocabulary entries.
dict(itertools.islice(doc2vec_vocabulary.items(), 6))

{'india': <gensim.models.keyedvectors.Vocab at 0x20fbb9021c8>,
 'officially': <gensim.models.keyedvectors.Vocab at 0x20fbba0f888>,
 'republic': <gensim.models.keyedvectors.Vocab at 0x20fbb92e608>,
 'hindi': <gensim.models.keyedvectors.Vocab at 0x20fbb92e208>,
 'bh': <gensim.models.keyedvectors.Vocab at 0x20fbb9062c8>,
 'rat': <gensim.models.keyedvectors.Vocab at 0x20fbb8b59c8>}

Function to display the top-N most similar sentences w.r.t. the question:

In [189]:
def show_similar_sentences(question, topn=5):
    """Print the article sentences the doc2vec model ranks closest to `question`.

    The question is pre-processed the same way as the training corpus
    (`clean_sentence`, `nltk.word_tokenize`, stop-word removal). A vector is
    inferred for the remaining tokens and compared against the trained
    sentence vectors.

    Args:
        question: Free-text question string.
        topn: How many sentences to display. Defaults to 5, the value that was
            previously hard-coded.

    Returns:
        None. The question, the tokens actually used, and the ranked sentences
        with their similarity scores are printed.
    """
    clean_question = clean_sentence(question)
    question_tokens = [
        word for word in nltk.word_tokenize(clean_question) if word not in stop_words
    ]

    # most_similar() returns (tag, similarity score) pairs for the "topn" most similar sentences.
    # "positive" takes a list of one or more vectors; when several are given their mean is used.
    # Here a single inferred vector for the question is passed, and cosine similarity is
    # computed between it and the sentence vectors learned during training.
    question_vector = doc2vec_model.infer_vector(question_tokens)
    similar_sentences = doc2vec_model.docvecs.most_similar(positive=[question_vector], topn=topn)

    print("Question:\n{0}".format(question))
    print("Question tokens considered: {0}\n".format(question_tokens))
    print("Similar Sentences:\n")

    for rank, (tag, score) in enumerate(similar_sentences, start=1):
        # Tags are the sentence indices assigned when the tagged corpus was built,
        # so they index straight into the original (uncleaned) sentences.
        sentence = initial_article_sentences[tag]
        sentence = re.sub(r'\s+', ' ', sentence)
        # Strip inline reference markers: numeric citations such as [12] and
        # lettered footnotes such as [j] (the latter used to leak into the output).
        sentence = re.sub(r'\[(?:\d+|[a-z]{1,2})\]', '', sentence).strip()
        print("Sentence {0}: (Similarity Score = {1})".format(rank, score))
        print(sentence, "\n")

### Testing

In [183]:
# Short location question. Per the recorded output, only 'india' and 'located' survive stop-word removal.
question = 'Where is India located?'
show_similar_sentences(question)

Question:
Where is India located?
Question tokens considered: ['india', 'located']

Similar Sentences:

Sentence 1: (Similarity Score = 0.922821044921875)
It predominates in the tropical moist forest of the Andaman Islands, the Western Ghats, and Northeast India. 

Sentence 2: (Similarity Score = 0.912075400352478)
India has two archipelagos: the Lakshadweep, coral atolls off India's south-western coast; and the Andaman and Nicobar Islands, a volcanic chain in the Andaman Sea. 

Sentence 3: (Similarity Score = 0.912002444267273)
[j] India's forest cover is 701,673 km2 (270,917 sq mi), which is 21.35% of the country's total land area. 

Sentence 4: (Similarity Score = 0.9063073396682739)
It predominates in the temperate coniferous forest of the Himalayas, the moist deciduous sal forest of eastern India, and the dry deciduous teak forest of central and southern India. 

Sentence 5: (Similarity Score = 0.9052748680114746)
India has traditionally been the dominant country at the South Asia

In [184]:
# Question about neighbouring countries.
question = "Which are the neighbouring countries to India?"
show_similar_sentences(question)

Question:
Which are the neighbouring countries to India?
Question tokens considered: ['neighbouring', 'countries', 'india']

Similar Sentences:

Sentence 1: (Similarity Score = 0.882215142250061)
After initially cordial relations with neighbouring China, India went to war with China in 1962, and was widely thought to have been humiliated. 

Sentence 2: (Similarity Score = 0.8802105188369751)
India also contains four of the world's 34 biodiversity hotspots, or regions that display significant habitat loss in the presence of high endemism. 

Sentence 3: (Similarity Score = 0.8722947835922241)
It has disputes over Kashmir with its neighbours, Pakistan and China, unresolved since the mid-20th century. 

Sentence 4: (Similarity Score = 0.8674148321151733)
India is the world's most populous democracy. 

Sentence 5: (Similarity Score = 0.8655755519866943)
It has unresolved territorial disputes with China and with Pakistan. 



In [185]:
# Question about sports.
question = "Which sports does India play?"
show_similar_sentences(question)

Question:
Which sports does India play?
Question tokens considered: ['sports', 'india', 'play']

Similar Sentences:

Sentence 1: (Similarity Score = 0.9705215692520142)
The World Bank cautions that, for India to achieve its economic potential, it must continue to focus on public sector reform, transport infrastructure, agricultural and rural development, removal of labour regulations, education, energy security, and public health and nutrition. 

Sentence 2: (Similarity Score = 0.9698085188865662)
These included the consolidation and demarcation of sovereignty, the surveillance of the population, and the education of citizens. 

Sentence 3: (Similarity Score = 0.9696745276451111)
India has a comparatively strong presence in shooting sports, and has won several medals at the Olympics, the World Shooting Championships, and the Commonwealth Games. 

Sentence 4: (Similarity Score = 0.9696272611618042)
Among the socio-economic challenges India faces are gender inequality, child malnutrition

In [186]:
# Paraphrased question: its tokens ('greek', 'refer') are not stemmed, whereas the
# matching article sentence uses the inflected forms 'Greeks' and 'referred'.
question = "What did the greek refer to Indians as?"
show_similar_sentences(question)

Question:
What did the greek refer to Indians as?
Question tokens considered: ['greek', 'refer', 'indians']

Similar Sentences:

Sentence 1: (Similarity Score = 0.9536331295967102)
However, barely 2% of Indians pay income taxes. 

Sentence 2: (Similarity Score = 0.9433398246765137)
The ancient Greeks referred to the Indians as Indoi (Ἰνδοί), which translates as "The people of the Indus". 

Sentence 3: (Similarity Score = 0.935443639755249)
There are around 50 physicians per 100,000 Indians. 

Sentence 4: (Similarity Score = 0.9347553253173828)
The human sex ratio, according to the 2011 census, is 940 females per 1,000 males. 

Sentence 5: (Similarity Score = 0.931684136390686)
An overwhelming majority of Indians, with their consent, have their marriages arranged by their parents or other family elders. 



In [187]:
# Longer factual question; seven tokens remain after stop-word removal (see the recorded output).
question = "Approximately how many Indians served in the First World War?"
show_similar_sentences(question)

Question:
Approximately how many Indians served in the First World War?
Question tokens considered: ['approximately', 'many', 'indians', 'served', 'first', 'world', 'war']

Similar Sentences:

Sentence 1: (Similarity Score = 0.79473477602005)
Some 431 million Indians have left poverty since 1985; India's middle classes are projected to number around 580 million by 2030. 

Sentence 2: (Similarity Score = 0.7896140217781067)
According to the 2011 census, there were 10.1 million child labourers in the country, a decline of 2.6 million from 12.6 million in 2001. 

Sentence 3: (Similarity Score = 0.7694747447967529)
According to a 2016 Walk Free Foundation report there were an estimated 18.3 million people in India, or 1.4% of the population, living in the forms of modern slavery, such as bonded labour, child labour, human trafficking, and forced begging, among others. 

Sentence 4: (Similarity Score = 0.7649405002593994)
After World War I, in which approximately one million Indians served,