# Notebook to test out document selection
Using the SQuAD database stored in a SQL server, first test document retrieval with gensim's LSI, then with BM25, to measure the retrieval rate of each and to think of ideas for improvement.  
## Experiment design
### 1. Dataset
The dataset used is the SQuAD train set, as stored in a hosted SQL server. All the clauses from each Wikipedia article are re-aggregated into a single document, giving a total of 442 documents.
Queries are also from the same Squad train set. Query labels are the index of the document to which the question applies. There are 87599 queries.
### 2. Test method
Each method, along with any necessary hyperparameters, will be tested on the dataset. Both accuracy and speed should be logged. In terms of metrics, accuracy @1 to @5 is the likely choice; the percentile rank of the correct answer is another candidate.
Will try to utilize multiple cores using multiprocessing module.
### 3. Test subjects
1. BM25
2. LSI with 2 topics
3. LSI with 20 topics
4. LSI with 200 topics
5. LSI with 2000, 4000 and 6000 topics (also included in the comparison run below)

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append('..')
from gensim import corpora, models, similarities
from collections import defaultdict
from src.dataloader import document_retrieval
from gensim.summarization import bm25

In [3]:
# Load the two DataFrames used throughout: df_kb (the documents; its 'raw_txt' and 'id'
# columns are read below) and df_query (the queries; 'query_string' and 'doc_id' are read below).
# NOTE(review): '../db_cnxn_str.txt' presumably holds the SQL-server connection string — confirm.
df_kb, df_query = document_retrieval('../db_cnxn_str.txt')

In [4]:
class LSIModel:
    """LSI-based document ranker built from the ``raw_txt`` column of a DataFrame.

    Attributes set by ``__init__``:
        ind2id     -- maps a document's position in the corpus to its ``df['id']`` value
        dictionary -- gensim ``Dictionary`` over the filtered tokens
        corpus     -- bag-of-words vectors, one per document
        lsi        -- the fitted gensim ``LsiModel``
        index      -- gensim ``MatrixSimilarity`` index over the corpus in LSI space
    """

    def __init__(self, df, num_topics=2):
        """Tokenise ``df['raw_txt']``, fit an LSI model and index the corpus.

        df         -- DataFrame providing the ``raw_txt`` and ``id`` columns
        num_topics -- number of LSI topics passed to ``LsiModel``
        """
        # Imported locally on purpose: this notebook later rebinds the global
        # name ``models`` to a list of model objects, which would otherwise
        # break ``models.LsiModel`` for any instance created after that cell.
        from gensim import corpora, models, similarities

        # ``predict`` reports documents by their position in the corpus
        # (``enumerate``), so the mapping is keyed by position rather than by
        # index label; the two coincide for a default RangeIndex.
        self.ind2id = {pos: doc_id for pos, doc_id in enumerate(df['id'])}

        stoplist = set('for a of the and to in'.split())
        texts = [
            [word for word in document.lower().split() if word not in stoplist]
            for document in df['raw_txt']
        ]

        # Drop tokens that occur only once in the whole corpus.
        frequency = defaultdict(int)
        for text in texts:
            for token in text:
                frequency[token] += 1
        texts = [
            [token for token in text if frequency[token] > 1]
            for text in texts
        ]

        self.dictionary = corpora.Dictionary(texts)
        self.corpus = [self.dictionary.doc2bow(text) for text in texts]
        self.lsi = models.LsiModel(self.corpus, id2word=self.dictionary, num_topics=num_topics)
        # Transform the corpus to LSI space and index it for similarity queries.
        self.index = similarities.MatrixSimilarity(self.lsi[self.corpus])

    def predict(self, query):
        """Rank all indexed documents against ``query``.

        query -- query string; it is lower-cased and split on whitespace.

        Returns a list of ``(position, similarity)`` pairs ordered from the
        highest similarity to the lowest.
        """
        vec_bow = self.dictionary.doc2bow(query.lower().split())
        vec_lsi = self.lsi[vec_bow]  # project the query into LSI space
        sims = self.index[vec_lsi]  # similarity of the query to every document
        return sorted(enumerate(sims), key=lambda item: -item[1])

In [5]:
import pandas as pd

# Toy corpus: nine short titles whose ids 0..8 equal their row position.
test_df = pd.DataFrame({
    'raw_txt': [
        "Human machine interface for lab abc computer applications",
        "A survey of user opinion of computer system response time",
        "The EPS user interface management system",
        "System and human system engineering testing of EPS",
        "Relation of user perceived response time to error measurement",
        "The generation of random binary unordered trees",
        "The intersection graph of paths in trees",
        "Graph minors IV Widths of trees and well quasi ordering",
        "Graph minors A survey",
    ],
    'id': list(range(9)),
})

# Fit a 2-topic model on the toy corpus and rank every title against one query.
test_lsimodel = LSIModel(test_df, num_topics=2)
test_sims = test_lsimodel.predict('Human computer interaction')

# Show each similarity next to its title, in the order returned by predict.
for i, s in test_sims:
    print(s, test_df.loc[i, 'raw_txt'])

0.9984453 The EPS user interface management system
0.998093 Human machine interface for lab abc computer applications
0.9865886 System and human system engineering testing of EPS
0.93748635 A survey of user opinion of computer system response time
0.90755945 Relation of user perceived response time to error measurement
0.050041765 Graph minors A survey
-0.09879464 Graph minors IV Widths of trees and well quasi ordering
-0.10639259 The intersection graph of paths in trees
-0.12416792 The generation of random binary unordered trees


In [6]:
class MyBM25(bm25.BM25):
    """BM25 document ranker built from the ``raw_txt`` column of a DataFrame.

    Documents and queries are lower-cased and split on whitespace before they
    are handed to gensim's ``BM25``, which treats each document as a sequence
    of terms.
    """

    def __init__(self, df):
        """Index ``df['raw_txt']`` and record the position -> ``df['id']`` mapping.

        df -- DataFrame providing the ``raw_txt`` and ``id`` columns
        """
        # BM25 iterates every document term by term; handing it raw strings
        # would make it score single characters instead of words.
        super().__init__([self._tokenize(text) for text in df['raw_txt']])
        # ``predict`` reports documents by their position in the corpus
        # (``enumerate``), so the mapping is keyed by position rather than by
        # index label; the two coincide for a default RangeIndex.
        self.ind2id = {pos: doc_id for pos, doc_id in enumerate(df['id'])}

    @staticmethod
    def _tokenize(text):
        """Lower-case ``text`` and split it on whitespace."""
        return text.lower().split()

    def predict(self, query):
        """Rank all indexed documents against ``query``.

        query -- query string; tokenised the same way as the documents.

        Returns a list of ``(position, score)`` pairs ordered from the highest
        BM25 score to the lowest, so that a pair's place in the list is the
        document's rank.
        """
        scores = self.get_scores(self._tokenize(query))
        # Highest score first; ascending order would put the worst match at rank 0.
        return sorted(enumerate(scores), key=lambda item: -item[1])

In [7]:
def print_score(ans_ranks, counter, num_docs=448):
    '''Print accuracy @1 through @5 and the mean ranking percentile.

    ans_ranks -- rank of the correct document for each scored query; accuracy @k
                 counts ranks strictly below k, so ranks are expected to be 0-based
    counter   -- number of queries scored so far (denominator of the accuracies)
    num_docs  -- corpus size used to normalise the mean rank; the default keeps
                 the value that was previously hard-coded

    Prints a short notice and returns when there is nothing to score, instead of
    dividing by zero.
    '''
    if not counter or not ans_ranks:
        print('no queries scored')
        return
    for acc in range(1, 6):
        correct = sum(rank < acc for rank in ans_ranks)
        print('accuracy @ {}: {:.4f}'.format(acc, correct/counter))
    mean_rank = sum(ans_ranks) / len(ans_ranks)
    print('ranking percentile {:.2f}'.format(1 - mean_rank / num_docs))
    
def scoring_function(df_query, modelobj):
    '''Score a model on every query and print accuracy @1..@5 plus the ranking percentile.

    df_query -- DataFrame with a 'query_string' and a 'doc_id' column; its rows are
                visited in a shuffled order fixed by random_state=42
    modelobj -- object exposing predict(query) -> sequence of (position, score)
                pairs and an ind2id mapping from position to document id

    Intermediate scores are printed every 20000 queries and once more at the
    end. Nothing is returned.
    '''
    ranks = []
    shuffled = df_query.sample(frac=1, random_state=42)
    for n_done, (_, query_row) in enumerate(shuffled.iterrows(), start=1):
        ordered_ids = [
            modelobj.ind2id[pos]
            for pos, _score in modelobj.predict(query_row['query_string'])
        ]
        # Place of the correct document in the model's ordering.
        ranks.append(ordered_ids.index(query_row['doc_id']))
        if n_done % 20000 == 0:
            print_score(ranks, n_done)
    print('final score: ') 
    print_score(ranks, len(ranks))

            

In [8]:
# LSI rankers at decreasing topic counts, followed by the BM25 ranker.
# NOTE: this assignment rebinds the name `models`, which the import cell bound
# to the gensim `models` module.
models = [
    LSIModel(df_kb, num_topics=n_topics)
    for n_topics in (6000, 4000, 2000, 200, 20, 2)
] + [MyBM25(df_kb)]

In [9]:
# Evaluate each entry of `models` in turn; scoring_function prints its own
# intermediate and final scores.
for model in models:
    scoring_function(df_query, model)

accuracy @ 1: 0.2898
accuracy @ 2: 0.3513
accuracy @ 3: 0.3842
accuracy @ 4: 0.4088
accuracy @ 5: 0.4273
ranking percentile 0.88
accuracy @ 1: 0.2907
accuracy @ 2: 0.3529
accuracy @ 3: 0.3869
accuracy @ 4: 0.4101
accuracy @ 5: 0.4273
ranking percentile 0.88
accuracy @ 1: 0.2909
accuracy @ 2: 0.3529
accuracy @ 3: 0.3867
accuracy @ 4: 0.4100
accuracy @ 5: 0.4273
ranking percentile 0.88
accuracy @ 1: 0.2913
accuracy @ 2: 0.3538
accuracy @ 3: 0.3873
accuracy @ 4: 0.4106
accuracy @ 5: 0.4279
ranking percentile 0.88
final score: 
accuracy @ 1: 0.2914
accuracy @ 2: 0.3540
accuracy @ 3: 0.3875
accuracy @ 4: 0.4107
accuracy @ 5: 0.4280
ranking percentile 0.88
accuracy @ 1: 0.2898
accuracy @ 2: 0.3513
accuracy @ 3: 0.3842
accuracy @ 4: 0.4088
accuracy @ 5: 0.4273
ranking percentile 0.88
accuracy @ 1: 0.2907
accuracy @ 2: 0.3529
accuracy @ 3: 0.3869
accuracy @ 4: 0.4101
accuracy @ 5: 0.4273
ranking percentile 0.88
accuracy @ 1: 0.2909
accuracy @ 2: 0.3529
accuracy @ 3: 0.3867
accuracy @ 4: 0.4100