In [1]:
import os
import json
import requests
from gensim import corpora, models, similarities
from time import time
from nltk.tokenize import sent_tokenize,word_tokenize,regexp_tokenize
import pandas as pd
import operator
import numpy as np
import scipy.stats as ss
import itertools
from nltk.corpus import stopwords

In [2]:
# Read the four JSON inputs in order; each result is bound to its own DataFrame name.
df_squad_train, df_squad_dev, df_dev, df_train = (
    pd.read_json(json_path)
    for json_path in (
        'squad_train_doc.json',
        'squad_dev_doc.json',
        'dev-v1.1.json',
        'train-v1.1.json',
    )
)

In [3]:
# Expose the 'passages' column under the name 'documents'.
df_squad_train = df_squad_train.rename(columns={'passages': 'documents'})

In [4]:
# Preview the first rows of df_squad_train.
df_squad_train.head()

Unnamed: 0,documents,title
0,"[{'context': 'Architecturally, the school has ...",University_of_Notre_Dame
1,[{'context': 'Beyoncé Giselle Knowles-Carter (...,Beyoncé
2,[{'context': 'Montana i/mɒnˈtænə/ is a state i...,Montana
3,"[{'context': 'The phrase ""in whole or in part""...",Genocide
4,[{'context': 'The emergence of resistance of b...,Antibiotics


In [5]:
# Flatten the nested frame: for every title, expand the first 'documents' value
# of its group into a DataFrame, then turn the group keys back into columns and
# discard the 'level_1' helper column produced by reset_index().
df_squad_train_new = (
    df_squad_train
    .groupby('title')['documents']
    .apply(lambda docs: pd.DataFrame(docs.values[0]))
    .reset_index()
    .drop('level_1', axis=1)
)

In [6]:
# Refer to the 'questions' column as 'question set' from here on.
df_squad_train_new = df_squad_train_new.rename(columns={'questions': 'question set'})

In [7]:
# Display the flattened frame (columns shown in the output: title, context, question set).
df_squad_train_new

Unnamed: 0,title,context,question set
0,2008_Sichuan_earthquake,The 2008 Sichuan earthquake or the Great Sichu...,"[What was the earthquake named?, What did the ..."
1,2008_Sichuan_earthquake,It is also known as the Wenchuan earthquake (C...,"[What was the focal depth of the quake?, How l..."
2,2008_Sichuan_earthquake,"Official figures (as of July 21, 2008 12:00 CS...","[How many were injured in Sichuan?, How many p..."
3,2008_Sichuan_earthquake,The earthquake had a magnitude of 8.0 Ms and 7...,"[What percentage of buildings were destroyed?,..."
4,2008_Sichuan_earthquake,According to a study by the China Earthquake A...,[What was the most displacement caused by the ...
5,2008_Sichuan_earthquake,Malaysia-based Yazhou Zhoukan conducted an int...,[What did Geng long try to establish as a rela...
6,2008_Sichuan_earthquake,In a United States Geological Survey (USGS) st...,[How long was the fault where the quake occurr...
7,2008_Sichuan_earthquake,Japanese seismologist Yuji Yagi at the Univers...,"[Besides the population density, what else con..."
8,2008_Sichuan_earthquake,"Between 64 and 104 major aftershocks, ranging ...",[How many shocks ranged from 4.0 MS to 4.9 MS?...
9,2008_Sichuan_earthquake,"(The Ms 6.1 earthquake on August 30, 2008 in s...","[Where was the August 30, 2008 quake?, Why was..."


In [8]:
# Distinct titles in order of first appearance; a title's position in this list
# is used as its document number later on.
title_list = list(pd.unique(df_squad_train_new['title']))


In [9]:
# Show all distinct titles.
title_list

['2008_Sichuan_earthquake',
 '2008_Summer_Olympics_torch_relay',
 '51st_state',
 'ASCII',
 'A_cappella',
 'Adolescence',
 'Adult_contemporary_music',
 'Affirmative_action_in_the_United_States',
 'Age_of_Enlightenment',
 'Aircraft_carrier',
 'Airport',
 'Alaska',
 'Alexander_Graham_Bell',
 'Alfred_North_Whitehead',
 'Alloy',
 'Alps',
 'Alsace',
 'American_Idol',
 'Animal',
 'Ann_Arbor,_Michigan',
 'Annelid',
 'Antarctica',
 'Antenna_(radio)',
 'Anthropology',
 'Anti-aircraft_warfare',
 'Antibiotics',
 'Apollo',
 'Appalachian_Mountains',
 'Architecture',
 'Arena_Football_League',
 'Armenia',
 'Armenians',
 'Arnold_Schwarzenegger',
 'Arsenal_F.C.',
 'Ashkenazi_Jews',
 'Asphalt',
 'Aspirated_consonant',
 'Association_football',
 'Asthma',
 'Athanasius_of_Alexandria',
 'Atlantic_City,_New_Jersey',
 'Avicenna',
 'BBC_Television',
 'Bacteria',
 'Baptists',
 'Beer',
 'BeiDou_Navigation_Satellite_System',
 'Bermuda',
 'Bern',
 'Beyoncé',
 'Bill_%26_Melinda_Gates_Foundation',
 'Biodiversity',
 '

In [10]:
## checking questions
# Select the 'question set' cells of all 'Antibiotics' rows, then pick the one
# whose index label is 1302.
# NOTE(review): 1302 is a hard-coded row label that depends on the current row
# order of df_squad_train_new; it looks like the first 'Antibiotics' row, in
# which case `.iloc[0]` would be sturdier - confirm before changing.
df_squad_train_new.loc[df_squad_train_new['title'] == 'Antibiotics', 'question set'][1302]

['What does emergence of resistance reflect?',
 'Who made the demonstration in 1943?',
 'When was antibacterial-resistance demonstrated?',
 'What is the purpose of antibiotic treatment?',
 'What is resistance to antibiotics a cause of?',
 'When was the Luria-Delbruck experiment?',
 'What is a modern common occurence with antibiotics?',
 'Which two antibiotics that have high efficacy are much less useful now?']

### Build the list of context words and question words for each title

In [11]:

# One token list per title, in title_list order: the tokens of every context
# passage come first, followed by the tokens of every question for that title.
context_list2 = []
for title in title_list:
    title_rows = df_squad_train_new.loc[df_squad_train_new['title'] == title]
    title_tokens = []
    for passage in list(title_rows['context']):
        title_tokens.extend(word_tokenize(passage))
    # Each 'question set' cell is itself a list of questions, so flatten first.
    for question in itertools.chain.from_iterable(list(title_rows['question set'])):
        title_tokens.extend(word_tokenize(question))
    context_list2.append(title_tokens)
    

In [12]:
# Number of tokens collected for the first title (contexts plus questions).
len(context_list2[0])

12957

### Remove stop words from the context and question word lists

In [13]:
# English stop words as a set, so membership tests in the filtering loop are cheap.
# NOTE(review): the tokens are not lower-cased before the lookup, so capitalised
# forms such as "The" are presumably kept if this list is lowercase - confirm this is intended.
stop_words = set(stopwords.words("english"))

In [14]:
# Per-title token lists with every token found in stop_words removed
# (exact, case-sensitive match); token order within a title is preserved.
filtered = [
    [token for token in title_tokens if token not in stop_words]
    for title_tokens in context_list2
]

In [15]:
# Number of tokens left for the first title after stop-word removal.
len(filtered[0]) 

8923

In [16]:
# Position of the "Freemasonry" title in title_list.
title_list.index("Freemasonry")

149

In [28]:
def training(doc_list, out_dir='/tmp', prefix='wipro'):
    """Build a TF-IDF model and a dense similarity index over tokenised documents.

    The dictionary, the bag-of-words corpus and the similarity index are also
    written to disk as ``<out_dir>/<prefix>.dict``, ``.mm`` and ``.index``.
    With the default arguments these are the same ``/tmp/wipro.*`` files the
    function has always produced.

    Parameters
    ----------
    doc_list : list of list of str
        One token list per document.
    out_dir : str, optional
        Directory that receives the persisted artefacts.
    prefix : str, optional
        File-name stem shared by the persisted artefacts.

    Returns
    -------
    tuple
        ``(corpus_tfidf, index, tfidf, sims, dictionary, corpus)`` where
        ``corpus`` is the on-disk ``MmCorpus``, ``corpus_tfidf`` is its TF-IDF
        transformation, ``index`` is the ``MatrixSimilarity`` built from it and
        ``sims`` is the result of querying the index with every indexed document.
    """
    dict_path = os.path.join(out_dir, prefix + '.dict')
    corpus_path = os.path.join(out_dir, prefix + '.mm')
    index_path = os.path.join(out_dir, prefix + '.index')

    dictionary = corpora.Dictionary(doc_list)
    dictionary.save(dict_path)  # store the dictionary, for future reference

    raw_corpus = [dictionary.doc2bow(tokens) for tokens in doc_list]
    corpora.MmCorpus.serialize(corpus_path, raw_corpus)  # store to disk
    # Callers receive the disk-backed corpus, so it is re-opened from the file.
    corpus = corpora.MmCorpus(corpus_path)

    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    # Passing num_features avoids an extra pass over the corpus to infer the
    # vocabulary size and keeps the index width equal to the dictionary size.
    index = similarities.MatrixSimilarity(corpus_tfidf, num_features=len(dictionary))
    # Saved for later reuse; the in-memory objects are returned directly
    # instead of being reloaded from the files that were just written.
    index.save(index_path)

    sims = index[corpus_tfidf]
    return corpus_tfidf, index, tfidf, sims, dictionary, corpus

In [29]:
# Train on the stop-word-filtered token lists; the unpacked names are used as
# module-level state by the query functions defined further down.
corpus_tfidf,index,tfidf,sims,dictionary,corpus = training(filtered)

In [55]:
# Bag-of-words vector of the second document, as (term id, count) pairs.
(corpus[1])

[(4, 1.0),
 (6, 5.0),
 (11, 556.0),
 (14, 2.0),
 (15, 1.0),
 (21, 1.0),
 (24, 53.0),
 (35, 1.0),
 (37, 2.0),
 (38, 11.0),
 (39, 2.0),
 (42, 3.0),
 (45, 4.0),
 (46, 1.0),
 (48, 1.0),
 (50, 13.0),
 (54, 1.0),
 (55, 1.0),
 (56, 7.0),
 (60, 1.0),
 (61, 4.0),
 (65, 5.0),
 (71, 5.0),
 (73, 11.0),
 (75, 12.0),
 (77, 3.0),
 (78, 9.0),
 (82, 1.0),
 (91, 6.0),
 (92, 4.0),
 (95, 30.0),
 (103, 3.0),
 (112, 10.0),
 (115, 1.0),
 (120, 3.0),
 (122, 2.0),
 (123, 1.0),
 (124, 1.0),
 (126, 5.0),
 (127, 2.0),
 (130, 28.0),
 (141, 1.0),
 (142, 3.0),
 (143, 141.0),
 (146, 3.0),
 (147, 13.0),
 (150, 2.0),
 (152, 1.0),
 (154, 2.0),
 (162, 2.0),
 (163, 2.0),
 (171, 1.0),
 (178, 9.0),
 (180, 9.0),
 (181, 1.0),
 (183, 14.0),
 (184, 2.0),
 (185, 1.0),
 (187, 6.0),
 (189, 1.0),
 (200, 37.0),
 (201, 10.0),
 (202, 2.0),
 (204, 3.0),
 (210, 3.0),
 (214, 2.0),
 (216, 3.0),
 (217, 2.0),
 (219, 5.0),
 (220, 1.0),
 (221, 1.0),
 (223, 2.0),
 (226, 7.0),
 (227, 3.0),
 (228, 1.0),
 (232, 7.0),
 (235, 1.0),
 (236, 6.0),
 (2

In [41]:
# Number of distinct tokens in the dictionary.
len(dictionary)

115880

In [53]:
# Number of entries in the TF-IDF vector of the first document.
len(corpus_tfidf[0])

2379

In [44]:
# Summary of the corpus wrapped by the TF-IDF transformation.
print(corpus_tfidf.corpus)

MmCorpus(442 documents, 115880 features, 735206 non-zero entries)


In [46]:
# Shape of the matrix held by the similarity index (the output shows documents x features).
index.index.shape

(442, 115880)

In [24]:
# Similarity of every indexed document against every other, as returned by training().
sims

array([[ 1.        ,  0.0728653 ,  0.01364303, ...,  0.01118015,
         0.03389601,  0.00363547],
       [ 0.0728653 ,  0.99999994,  0.00928876, ...,  0.00554808,
         0.01816067,  0.00138667],
       [ 0.01364303,  0.00928876,  1.00000012, ...,  0.00775558,
         0.00850634,  0.00236766],
       ..., 
       [ 0.01118015,  0.00554808,  0.00775558, ...,  1.00000095,
         0.00232097,  0.00125903],
       [ 0.03389601,  0.01816067,  0.00850634, ...,  0.00232097,
         0.99999946,  0.00214126],
       [ 0.00363547,  0.00138667,  0.00236766, ...,  0.00125903,
         0.00214126,  1.00000024]], dtype=float32)

In [46]:
def query_of_similarity(t):
    """Score a piece of text against every document in the similarity index.

    The text is tokenised, mapped to a bag-of-words vector with the
    module-level ``dictionary``, weighted with the module-level ``tfidf``
    model and compared with the module-level ``index``.

    Parameters
    ----------
    t : str
        Query text.

    Returns
    -------
    numpy.ndarray
        2-D array with a single row holding one similarity score per indexed
        document.
    """
    tokens = word_tokenize(t)
    # Wrap the single query in a list so it is treated as a one-document
    # corpus, which keeps the result 2-D as callers expect.
    query_corpus = [dictionary.doc2bow(tokens)]
    # The query is scored in memory. The earlier version wrote it to
    # /tmp/rawcorpus2.mm and read it back on every call, and also loaded
    # /tmp/query.dict, a file this notebook never creates and whose contents
    # were never used.
    return index.get_similarities(tfidf[query_corpus])

    

In [112]:
def query(text, top_n=5):
    """Return the positions of the indexed documents most similar to ``text``.

    Parameters
    ----------
    text : str
        Query text, passed to ``query_of_similarity``.
    top_n : int, optional
        Maximum number of document positions to return (default 5). Fewer are
        returned when the index holds fewer documents.

    Returns
    -------
    list of int
        Document positions ordered from most to least similar.
    """
    # query_of_similarity yields one row of scores per query; only one query
    # is submitted, so row 0 holds the score of every indexed document.
    scores = np.atleast_2d(query_of_similarity(text))[0]
    # Clamp the request so a small index (or a negative top_n) cannot raise.
    limit = max(0, min(top_n, scores.size))
    # Negate so that argsort's ascending order puts the best match first.
    return (-scores).argsort()[:limit].tolist()
    

In [109]:
query("""Since the middle of the 19th century, Masonic historians have sought the origins of the movement in a series of similar documents known as the Old Charges, dating from the Regius Poem in about 1425 to the beginning of the 18th century. Alluding to the membership of a lodge of operative masons, they relate a mythologised history of the craft, the duties of its grades, and the manner in which oaths of fidelity are to be taken on joining. The fifteenth century also sees the first evidence of ceremonial regalia.""")

[[  7.87236029e-04   1.87136873e-03   2.43626256e-03   5.87037415e-04
    2.17790902e-03   3.17360484e-03   3.90908157e-04   2.03642994e-03
    3.21249627e-02   1.18848286e-03   3.34376004e-04   1.28863868e-03
    2.04464258e-03   1.66009343e-03   3.75129748e-04   5.40747494e-03
    1.56155101e-03   2.37056008e-03   1.30042876e-03   6.20437204e-04
    6.70911337e-04   1.47184636e-03   7.48400984e-04   2.39233486e-03
    4.08420572e-04   2.47496791e-04   8.67594848e-04   4.29918757e-04
    1.22946072e-02   3.28586029e-04   1.96043705e-03   1.20280520e-03
    5.95504360e-04   4.30653861e-04   4.15728334e-03   1.41273066e-03
    2.59658234e-04   2.16295780e-03   1.29187747e-03   1.34270068e-03
    7.20749842e-04   2.57426582e-04   4.54162131e-04   2.48360477e-04
    5.46189491e-03   1.70051632e-03   8.56438302e-04   8.83336179e-04
    2.02076999e-03   1.39561438e-04   5.23965457e-04   1.60620373e-03
    1.81881851e-03   2.71561136e-03   5.65855950e-03   1.45583740e-03
    1.85331190e-03  

[149, 292, 8, 377, 361]

### Accuracy function 

In [113]:
def accuracy(no_of_docs):
    """Measure how often ``query`` retrieves the right document.

    For each of the first ``no_of_docs`` titles in ``title_list``, every
    question and every context passage of that title is sent to ``query``.
    A query counts as correct when the title's position appears among the
    document positions that ``query`` returns.

    Parameters
    ----------
    no_of_docs : int
        Number of titles, taken from the start of ``title_list``, to evaluate.

    Returns
    -------
    float
        Fraction of correct queries, or NaN when there was nothing to
        evaluate. The counts and the fraction are also printed.
    """
    correct_predict = 0
    total_queries = 0
    for doc_idx in range(no_of_docs):
        title_rows = df_squad_train_new.loc[df_squad_train_new['title'] == title_list[doc_idx]]
        # Each 'question set' cell is a list of questions, so flatten first.
        questions = list(itertools.chain.from_iterable(list(title_rows['question set'])))
        contexts = list(title_rows['context'])
        queries = questions + contexts
        total_queries += len(queries)
        for query_text in queries:
            if doc_idx in query(query_text):
                correct_predict += 1
    print(correct_predict,"----",total_queries)
    # With no queries the ratio is undefined; report NaN instead of raising
    # ZeroDivisionError.
    score = correct_predict / total_queries if total_queries else float('nan')
    print(score)
    return score

In [115]:
accuracy(2)

1035 ---- 1167
0.8868894601542416


In [88]:
# NOTE(review): accuracy2 is not defined in the visible part of this notebook;
# this cell presumably relied on a definition from a deleted or edited cell and
# will raise NameError on a fresh kernel - restore the definition or remove the cell.
accuracy2(2)

0.78125


In [78]:
# Collect every question followed by every context passage for the first two
# titles, title by title, into one flat list.
train2_set = []
for title_pos in range(2):
    title_rows = df_squad_train_new.loc[df_squad_train_new['title'] == title_list[title_pos]]
    # Each 'question set' cell is a list of questions, so flatten before adding.
    train2_set.extend(itertools.chain.from_iterable(list(title_rows['question set'])))
    train2_set.extend(list(title_rows['context']))
train2_set

['What was the earthquake named?',
 'What did the quake measure?',
 'What year did the Sichuan earthquake take place?',
 'How many people died?',
 'What day did the earthquake occur?',
 'In what year did the earthquake in Sichuan occur?',
 'What time of the day did the quake happen?',
 'How many people were killed as a result?',
 'What was the focal depth of the quake?',
 'How long after the earthquake were aftershocks felt?',
 'Which far away cities in other countries could feel the earthquake?',
 'Where was the epicenter of the earthquake?',
 'What is another name for the earthquake in Sichuan?',
 'How far was it from Chengdu?',
 'What was the focal depth of the earthquake?',
 'How many were injured in Sichuan?',
 'How many people lived in the affected area?',
 'How many people were confirmed dead?',
 'How many people were confirmed dead only in the Sichuan province?',
 'How many people are listed as missing?',
 'How many people are homeless because of the quake?',
 'How high could t