In [1]:
import numpy as np 
import pandas as pd
import json
from gensim.models.tfidfmodel import TfidfModel
import logging
from gensim import corpora, models, similarities
from time import time
from nltk.tokenize import word_tokenize, sent_tokenize
import operator
import scipy.stats as ss
from nltk.corpus import stopwords
import matplotlib.pyplot as plt

In [3]:
def convert_json_data_frame(path):
    df_squad_train = pd.read_json(path)
    df_squad_train.rename(columns={'passages':'documents'}, inplace=True)
    # rearranging the dataframe such that it is easy to access (splitting from columns)
    df_squad_train_new = df_squad_train.groupby('title').documents.apply(lambda x: pd.DataFrame(x.values[0])).reset_index().drop('level_1', axis = 1)
    df_squad_train_new.rename(columns={'questions':'question_set'}, inplace=True)
    return df_squad_train_new

In [4]:
path = 'squad_train_doc.json'

In [8]:
data_train = convert_json_data_frame(path)

In [9]:
data_train_context =  data_train.groupby('title').context.sum()
data_train_questions = data_train.groupby('title').question_set.sum()

In [10]:
stop_words = set(stopwords.words('english'))

In [11]:
def convert_to_words(data_train):
    data_train_context =  data_train.groupby('title').context.sum()
    data_train_questions = data_train.groupby('title').question_set.sum()
    contexts = []
    question = []
    question_temp = []
    for context, question in zip(range(0,len(data_train_context)), data_train_questions):
        words = [word for word in word_tokenize(data_train_context[context]) if word not in stop_words]
#         for each_ques in question:
#             question_temp = question_temp + [word for word in word_tokenize(each_ques) if word not in stop_words]
#         words = words+ question_temp
        contexts.append(words)
        
    return contexts

In [12]:
docs = convert_to_words(data_train)

In [13]:
def create_dict(docs):
    dictionary = corpora.Dictionary(docs)
    dictionary.save('/tmp/squad.dict') # store the dictionary, for future reference

In [14]:
dictionary = create_dict(docs)

In [15]:
def create_corpus(dictionary,docs):
    raw_corpus = [dictionary.doc2bow(each_doc) for each_doc in docs]
    corpora.MmCorpus.serialize('/tmp/squad.mm', raw_corpus) # store to disk


In [16]:
dictionary = corpora.Dictionary.load('/tmp/squad.dict')

In [17]:
create_corpus(dictionary,docs)

In [18]:

corpus = corpora.MmCorpus('/tmp/squad.mm')

In [19]:
tfidf = models.TfidfModel(corpus) # step 1 -- initialize a model

In [37]:
tfidf.save('/tmp/squad.model')

In [20]:
corpus_tfidf = tfidf[corpus]

In [21]:
def create_similarity_matrix(corpus_tfidf):
    index = similarities.MatrixSimilarity(corpus_tfidf)
    index.save('/tmp/squad.index')

In [22]:
create_similarity_matrix(corpus_tfidf)

In [23]:
index = similarities.MatrixSimilarity.load('/tmp/squad.index')

In [24]:
def get_similarity_matrix(corpus_tfidf,index):
    return index[corpus_tfidf]

In [25]:
similarity_matrix =  get_similarity_matrix(corpus_tfidf, index)

In [26]:
def give_query(query):
    #print(query)
    index = similarities.MatrixSimilarity.load('/tmp/squad.index')
    query = (word_tokenize(query))
    query_1 = []
    query_1.append(query)
#     dictionary_query = corpora.Dictionary(query_1)
    raw_corpus_query = [dictionary.doc2bow(word) for word in query_1]
    corpora.MmCorpus.serialize('/tmp/query3.mm',raw_corpus_query)
    corpus_query = corpora.MmCorpus('/tmp/query3.mm')
#     for k in tfidf[corpus_query]:
#         print(k)
    similarity_table = index.get_similarities(tfidf[corpus_query])
#     similarity_table = similarities.MatrixSimilarity()
    ranks = ss.rankdata(similarity_table, method = 'max')
    #doc_pick, value = max(enumerate(similarity_table), key=operator.itemgetter(1))
    similarity_table = np.array(similarity_table)
    #print([i for i in np.argsort(similarity_table)[-2:]])
    #print(sorted(range(len(similarity_table)), key=lambda i: similarity_table[i])[-2:])
    doc_pick = similarity_table.argmax()
    #print(len(similarity_table[0]))
    #print(ranks)
    return similarity_table
    

In [27]:
query = """For which masterpiece did Cesare Beccaria become famous?"""
zz = give_query(query)

In [28]:
zz.argmax()

8

In [229]:
idx = (-zz).argsort()[:5]

In [237]:
idx[0][0]

115

In [None]:
data_train[144:156]

In [None]:
data_train.to_csv('data_train_idf')

In [29]:
def top_similiar(top_num, query):
    similarity_table = give_query(query)
    top_n_similiar = (-similarity_table).argsort()[:top_num]
    similiar_n = []
    for num in range(0,top_num):
        similiar_n.append(top_n_similiar[0][num])
    return similiar_n

In [46]:
def calc_accuracy(rank):
    index_cols = ['doc_num','question_num','doc_predicted']
    accuracy_data = pd.DataFrame(columns= index_cols)
    acc = {'doc_num':0,'doc_num_pred':None , 'question':None}
    
    for doc_num in range(0,len(data_train_questions)):
        print('doc_num', doc_num)
        for question in data_train_questions[doc_num]:
            doc_sim = top_similiar(rank, question)
            acc['doc_num_pred'] = doc_sim
            acc['question'] = question
            acc['doc_num'] = doc_num
            acc_data = pd.Series(data = [acc['doc_num'],acc['question'],acc['doc_num_pred']], index = index_cols)
            accuracy_data = accuracy_data.append(acc_data, ignore_index= True)
    return accuracy_data
    

In [63]:
accuracy_data = calc_accuracy(50)

doc_num 0
doc_num 1
doc_num 2
doc_num 3
doc_num 4
doc_num 5
doc_num 6
doc_num 7
doc_num 8
doc_num 9
doc_num 10
doc_num 11
doc_num 12
doc_num 13
doc_num 14
doc_num 15
doc_num 16
doc_num 17


KeyboardInterrupt: 

In [41]:
counter = 0
for index in range(len(accuracy_data.doc_predicted)):
    for count in range(len(accuracy_data.doc_predicted[index])):
        if(accuracy_data.doc_predicted[index][count] == accuracy_data.doc_num[index]):
            counter += 1
accuracy = counter/len(accuracy_data.doc_predicted)

In [42]:
accuracy

0.9008497935562088

In [40]:
accuracy_1_window = 0.53

In [38]:
accuracy_5_window = 0.8493589377008681

In [43]:
accuracy_10_window = 0.9008497935562088

In [None]:
accuracy_50_window = 

In [209]:
# Accuracy

In [49]:
def accuracy_doc(doc_number, question_set, top_num):
    acc = {'doc_num':doc_number,'doc_num_pred':None , 'question_num':None , 'question':None}
    question_num =  np.random.randint(0,len(question_set))
    doc_sim = top_similiar(top_num, question_set[question_num])
    acc['doc_num_pred'] = doc_sim
    acc['question_num'] = question_num
    acc['question'] = question_set[question_num]
    return acc 

In [50]:
def accuracy(data_train, top_num, iter_num):
    data_train_context =  data_train.groupby('title').context.sum()
    data_train_questions = data_train.groupby('title').question_set.sum()
    length_doc = len(data_train_context)
    index_cols = ['doc_num','question_num','question','doc_predicted']
    accuracy_data = pd.DataFrame(columns= index_cols)
    for iters in range(0, iter_num):
        doc_num =  np.random.randint(0,length_doc)
        question_set = data_train_questions[doc_num]
        acc = accuracy_doc(doc_num,question_set,top_num)
        acc_data = pd.Series(data = [acc['doc_num'],acc['question_num'], acc['question'],acc['doc_num_pred']], index = index_cols)
       # print(acc_data)
        accuracy_data = accuracy_data.append(acc_data, ignore_index= True)
        #print(accuracy_data)
    return accuracy_data

In [51]:
 rr = accuracy(data_train,5,100)

In [52]:
rr
    

Unnamed: 0,doc_num,question_num,question,doc_predicted
0,150.0,53.0,When was the Road to Serfdom published?,"[150, 338, 380, 408, 252]"
1,51.0,104.0,What is one of the great agricultural challeng...,"[169, 170, 195, 51, 384]"
2,270.0,201.0,Who was the videotape used as evidence against...,"[270, 297, 375, 42, 181]"
3,177.0,50.0,When did the PAIGC acknowledge the executions?,"[177, 69, 184, 268, 335]"
4,22.0,140.0,What effect do cages have on the spectrum of u...,"[308, 210, 309, 167, 334]"
5,430.0,47.0,In which year did Poland declassify most of it...,"[430, 151, 263, 390, 275]"
6,350.0,82.0,On what date were the Belavezha Accords signed?,"[350, 112, 217, 174, 252]"
7,2.0,34.0,What form of government do the Puerto Ricans h...,"[2, 378, 146, 335, 142]"
8,360.0,231.0,How does the Seattle ferry line compare to the...,"[360, 374, 401, 11, 311]"
9,384.0,84.0,What has been the result of attempts to connec...,"[384, 108, 351, 116, 378]"
