In [74]:
import math
import lucene
import time
import itertools
import numpy as np
from tqdm import tqdm
from java.io import File
import xml.etree.ElementTree as ET
from collections import defaultdict
from org.apache.lucene.store import FSDirectory
from org.apache.lucene.util import BytesRefIterator
from org.apache.lucene.index import DirectoryReader, Term
from org.apache.lucene.analysis.en import EnglishAnalyzer
from org.apache.lucene.analysis.core import WhitespaceAnalyzer
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.search import IndexSearcher, BooleanQuery, BooleanClause, TermQuery, BoostQuery
from org.apache.lucene.search.similarities import BM25Similarity, LMJelinekMercerSimilarity, LMDirichletSimilarity
lucene.initVM()

<jcc.JCCEnv at 0x7f8403539070>

In [75]:
# Name of the topic set; it is interpolated into the topics file path and the results path.
q_name = 'trec6'

In [76]:
# Locations of the Lucene index directory and of the topics XML file for q_name.
index_path = '../../index/'
topicFilePath = f'../../{q_name}.xml'

# Open the on-disk index; indexReader is read as a global by later cells.
directory = FSDirectory.open(File(index_path).toPath())
indexReader = DirectoryReader.open(directory)

In [77]:
def query_topics(xml_file):
    """Read a topics XML file and map each topic number to its title.

    Every ``<top>`` element directly under the document root must contain a
    ``<num>`` and a ``<title>`` child; the text of both is stripped of
    surrounding whitespace.

    Args:
        xml_file: Path or file object accepted by ``ET.parse``.

    Returns:
        dict: ``<num>`` text -> ``<title>`` text, in document order.
    """
    document_root = ET.parse(xml_file).getroot()
    return {
        topic.find('num').text.strip(): topic.find('title').text.strip()
        for topic in document_root.findall('top')
    }

In [78]:
# Topic number -> title for every topic in the file; read as a global by expanded_query_BM25 and run_RM3.
query_all = query_topics(topicFilePath)

In [79]:
def getDocumentVector(luceneDocid, indexReader):
    """Build a sum-normalised tf-idf vector for one document.

    For every term in the document's ``CONTENTS`` term vector the weight is
    ``(tf / D) * log(N / (df + 1))``, where ``tf`` is the term's frequency in
    the document, ``D`` the sum of all ``tf`` values, ``df`` the term's
    document frequency and ``N`` ``indexReader.numDocs()``. The weights are
    then divided by their sum.

    Args:
        luceneDocid: Internal Lucene document id.
        indexReader: Open index reader.

    Returns:
        tuple: ``(docVec, D)`` where ``docVec`` maps term -> normalised weight.
        ``({}, 0)`` is returned when the document has no stored term vector
        for ``CONTENTS`` or the vector is empty.
    """
    N = indexReader.numDocs()

    terms = indexReader.getTermVector(luceneDocid, 'CONTENTS')
    if terms is None:
        # No term vector stored for this document/field: nothing to weigh.
        return {}, 0

    term_stats = {}  # term -> (tf, df)
    D = 0
    iterator = terms.iterator()
    for term in BytesRefIterator.cast_(iterator):
        t = term.utf8ToString()
        tf = iterator.totalTermFreq()  # frequency of the current term in this document
        df = indexReader.docFreq(Term('CONTENTS', t))
        D += tf
        term_stats[t] = (tf, df)

    if D == 0:
        return {}, 0

    docVec = {t: (tf / D) * math.log(N / (df + 1)) for t, (tf, df) in term_stats.items()}

    total_weight = sum(docVec.values())
    if total_weight != 0:
        docVec = {t: w / total_weight for t, w in docVec.items()}
    # When the weights sum to zero they cannot be normalised; they are returned
    # unscaled instead of raising ZeroDivisionError.

    return docVec, D


In [80]:
# terms = indexReader.getTermVector(1, 'CONTENTS')
# terms.getStats()

In [81]:
def search(indexReader, query, similarity, top_rel_doc):
    """Retrieve the top documents for a query and collect candidate expansion terms.

    Args:
        indexReader: Open index reader.
        query: Query object passed straight to ``IndexSearcher.search``.
        similarity: Similarity set on the searcher before retrieval.
        top_rel_doc: Number of top-ranked documents to keep.

    Returns:
        tuple: ``(terms, docids)`` where ``docids`` lists the Lucene ids of the
        retrieved documents in rank order and ``terms`` is the set of purely
        alphabetic terms found in those documents' vectors (as returned by
        ``getDocumentVector``).
    """
    searcher = IndexSearcher(indexReader)
    searcher.setSimilarity(similarity)

    scoreDocs = searcher.search(query, top_rel_doc).scoreDocs
    docids = [scoreDoc.doc for scoreDoc in scoreDocs]

    # Keep only alphabetic tokens; tokens containing digits or punctuation are dropped.
    candidate_terms = {
        term
        for doc in docids
        for term in getDocumentVector(doc, indexReader)[0]
        if term.isalpha()
    }

    return candidate_terms, docids

In [82]:
# def RM3_term_selection(Query, set_ET, docs, indexReader, alpha, mu):
    
#     totalTF = indexReader.getSumTotalTermFreq("CONTENTS")

#     Q = Query.split()
#     weight = {}

#     cf = {}
#     for t in set_ET:
#         T = Term("CONTENTS", t)
#         cf[t] = indexReader.totalTermFreq(T)/totalTF
#     for q in Q:
#         set_ET.add(q)
#         T = Term("CONTENTS", q)
#         cf[q] = indexReader.totalTermFreq(T)/totalTF

#     docVectors = {}
#     mixinglambda = {}
#     doclength = {}
    
#     for d in docs:                    
#         docVectors[d], doclength[d] = getDocumentVector(d, indexReader)
        
#     for d in docs:                  
#         mixinglambda[d] = doclength[d]/(doclength[d] + mu)
        
#     for w in set_ET:
#         p_wr = 0
#         for d in docs:                  
#             ml = mixinglambda[d]
#             p_wd = (ml*(docVectors[d].get(w,0)) + (1 - ml)*cf[w])      
#             # p_wd = (docVectors[d].get(w,0))      

#             p_q = 1
#             for q in Q:
#                 p_q = p_q*(ml*(docVectors[d].get(q,0)) + (1 - ml)*cf[q])          

#             p_wr = p_wr + p_wd*p_q
#         weight[w] = p_wr

#     norm = sum(weight.values())
    
#     if norm == 0:
#         print(Q,'\n\n')
#     else:
#         weight = {w:weight[w]/norm for w in weight}

#     for w in set_ET:
#         weight[w] = (alpha*weight[w]) + (1-alpha)*(Q.count(w)/len(Q))

#     temp_list = sorted(weight.items(), key=lambda x:x[1], reverse=True)
#     sorted_weights = dict(temp_list)

#     return sorted_weights

In [83]:
def RM3_term_selection(Query, set_ET, docs, indexReader, alpha, mu, expanded_query_terms):
    """Score candidate expansion terms and interpolate them with the original query.

    For every feedback document ``d`` a smoothed query likelihood is computed as
    the product over query terms ``q`` of
    ``ml * docVec_d[q] + (1 - ml) * cf[q]`` with ``ml = |d| / (|d| + mu)`` and
    ``cf[q]`` the term's collection frequency divided by the total number of
    term occurrences in ``CONTENTS``. A candidate term ``w`` is scored by
    summing ``docVec_d[w] * likelihood_d`` over the feedback documents. The
    ``expanded_query_terms`` best candidates are kept, normalised to sum to 1
    and mixed with the original query:
    ``alpha * feedback_weight + (1 - alpha) * count(w in Q) / len(Q)``,
    rounded to 4 decimals.

    Args:
        Query: Whitespace-separated, already analysed query terms.
        set_ET: Set of candidate expansion terms.
        docs: Lucene ids of the feedback documents.
        indexReader: Open index reader.
        alpha: Weight of the feedback model in the interpolation.
        mu: Smoothing parameter used in ``ml``.
        expanded_query_terms: Number of candidate terms to keep.

    Returns:
        dict: term -> final weight, ordered by decreasing weight (ties broken
        alphabetically so that the result is reproducible across runs).
    """
    totalTF = indexReader.getSumTotalTermFreq("CONTENTS")
    Q = Query.split()

    # Collection model; only the query terms are needed for smoothing.
    cf = {q: indexReader.totalTermFreq(Term("CONTENTS", q)) / totalTF for q in set(Q)}

    # Per-document vector and query likelihood. The likelihood does not depend
    # on the candidate term, so it is computed once per document.
    docVectors = {}
    query_likelihood = {}
    for d in docs:
        docVectors[d], doc_len = getDocumentVector(d, indexReader)
        denominator = doc_len + mu
        ml = doc_len / denominator if denominator else 0.0

        p_q = 1
        for q in Q:
            p_q = p_q * (ml * (docVectors[d].get(q, 0)) + (1 - ml) * cf[q])
        query_likelihood[d] = p_q

    weight = {}
    for w in set_ET:
        p_wr = 0
        for d in docs:
            p_wr = p_wr + docVectors[d].get(w, 0) * query_likelihood[d]
        weight[w] = p_wr

    # Keep the best candidates; the secondary key makes ties deterministic.
    best = sorted(weight.items(), key=lambda kv: (-kv[1], kv[0]))[:expanded_query_terms]
    weight = dict(best)

    norm = sum(weight.values())
    if norm != 0:
        weight = {w: weight[w] / norm for w in weight}

    # Maximum-likelihood model of the original query (empty for an empty query,
    # which then contributes nothing instead of dividing by zero).
    query_model = {q: Q.count(q) / len(Q) for q in Q}

    interpolated = {}
    for w in list(weight) + [q for q in query_model if q not in weight]:
        interpolated[w] = round(
            (alpha * weight.get(w, 0)) + (1 - alpha) * query_model.get(w, 0), 4
        )

    return dict(sorted(interpolated.items(), key=lambda kv: (-kv[1], kv[0])))

In [84]:
# def expanded_query_BM25(search, RM3_term_selection, k1, b, alpha, top_rel_doc, expanded_query_terms, mu):

#     analyzer = EnglishAnalyzer()
#     similarity = BM25Similarity(k1,b)
#     expanded_q = []

#     i = 0
#     for q in tqdm(query_all.values(), colour='red', desc='Expanding Queries'):
#     # for q in query_all.values():
     
#         i += 1 
#         escaped_q = QueryParser('CONTENTS', analyzer).escape(q)      # a few titles had '/' in them, which the query parser would otherwise treat as syntax
#         query = QueryParser('CONTENTS', analyzer).parse(escaped_q)
        
#         query_terms = [term.strip()[9:] for term in query.toString().split()]
#         parsed_q = ' '.join(query_terms)
# #         print(parsed_q)
        
#         expension_term_set, docids = search(indexReader, parsed_q, similarity, top_rel_doc)
#         # expension_term_set, docids = search(indexReader, query, similarity, top_rel_doc)

#         weights = RM3_term_selection(parsed_q, expension_term_set, docids, indexReader, alpha, mu)
#         query_len = len(query_terms)
#         # query_len = 0
        
#         expanded_query_terms_list = list(weights.keys())[0:expanded_query_terms + query_len]
#         expanded_query_w = list(weights.values())[0:expanded_query_terms + query_len]
        
#         norm = sum(expanded_query_w)
#         expanded_query_weights = list(np.array(expanded_query_w)/norm)
    
#         booleanQuery = BooleanQuery.Builder()
#         for m in range(expanded_query_terms + query_len):
#             t = Term('CONTENTS', expanded_query_terms_list[m])
#             tq = TermQuery(t)
#             boostedTermQuery = BoostQuery(tq, float(expanded_query_weights[m]))
#             BooleanQuery.setMaxClauseCount(4096)
#             booleanQuery.add(boostedTermQuery, BooleanClause.Occur.SHOULD)
#         booleanQuery = booleanQuery.build()
       
#         expanded_q.append(booleanQuery)   

#     return expanded_q

In [85]:
import pprint
def expanded_query_BM25(search, RM3_term_selection, k1, b, alpha, top_rel_doc, expanded_query_terms, mu):
    """Build one RM3-expanded, boosted boolean query per topic in ``query_all``.

    Each topic title is escaped, parsed with an ``EnglishAnalyzer`` and run
    with BM25 to obtain feedback documents; the selected expansion terms are
    turned into ``SHOULD`` clauses boosted by their weight. The topic number,
    title and term weights are printed for every topic.

    Reads the notebook globals ``query_all`` and ``indexReader``.

    Args:
        search: Callable ``(indexReader, query, similarity, top_rel_doc)``
            returning ``(candidate_terms, docids)``.
        RM3_term_selection: Callable returning a term -> weight dict.
        k1, b: BM25 parameters used for the feedback retrieval.
        alpha, mu, expanded_query_terms: Forwarded to ``RM3_term_selection``.
        top_rel_doc: Number of feedback documents.

    Returns:
        list: One built ``BooleanQuery`` per topic, in ``query_all`` order.
    """
    field = 'CONTENTS'
    prefix_len = len(field) + 1  # length of the 'CONTENTS:' prefix in Query.toString()

    analyzer = EnglishAnalyzer()
    parser = QueryParser(field, analyzer)
    similarity = BM25Similarity(k1, b)

    # Global clause limit; setting it once is enough.
    BooleanQuery.setMaxClauseCount(4096)

    expanded_q = []
    for i, q in enumerate(query_all.values(), start=1):
        # A few titles contain '/', which the parser would otherwise treat as syntax.
        escaped_q = parser.escape(q)
        query = parser.parse(escaped_q)

        # Assumes every whitespace-separated token of toString() has the form
        # 'CONTENTS:term' (plain term queries only).
        query_terms = [term.strip()[prefix_len:] for term in query.toString().split()]
        parsed_q = ' '.join(query_terms)

        expension_term_set, docids = search(indexReader, query, similarity, top_rel_doc)
        weights = RM3_term_selection(parsed_q, expension_term_set, docids, indexReader, alpha, mu, expanded_query_terms)
        print(i, q)
        pprint.pprint(weights)

        booleanQuery = BooleanQuery.Builder()
        for term_text, term_weight in weights.items():
            boostedTermQuery = BoostQuery(TermQuery(Term(field, term_text)), float(term_weight))
            booleanQuery.add(boostedTermQuery, BooleanClause.Occur.SHOULD)

        expanded_q.append(booleanQuery.build())

    return expanded_q

In [86]:
def search_retrived(indexReader, Query, Qid, similarity, out_name):
    """Run a query and format its top 1000 hits as tab-separated result lines.

    Each line has the form
    ``<Qid>\\tQ0\\t<stored ID field>\\t<rank>\\t<score>\\t<out_name>\\n`` with ranks
    starting at 1.

    Args:
        indexReader: Open index reader.
        Query: Query object to execute.
        Qid: Topic identifier written in the first column.
        similarity: Similarity set on the searcher.
        out_name: Run tag written in the last column.

    Returns:
        str: All result lines concatenated ('' when nothing is retrieved).
    """
    searcher = IndexSearcher(indexReader)
    searcher.setSimilarity(similarity)

    hits = searcher.search(Query, 1000).scoreDocs  # top 1000 documents

    lines = []
    for rank, hit in enumerate(hits, start=1):
        stored = searcher.doc(hit.doc)
        columns = (Qid, 'Q0', stored.get('ID'), rank, hit.score, out_name)
        lines.append('\t'.join(map(str, columns)) + '\n')

    return ''.join(lines)

In [87]:
def run_RM3(top_PRD, expanded_query_terms, alpha, mu):
    """Expand every topic with RM3, re-retrieve with BM25 and write the run file.

    Reads the notebook globals ``k1``, ``b``, ``q_name``, ``query_all`` and
    ``indexReader``. The run is written to
    ``./res_RM3/<q_name>/<q_name>_mu_<mu>_docs_<top_PRD>_terms_<expanded_query_terms>_alpha_<alpha>.txt``;
    the directory is created when it does not exist.

    Args:
        top_PRD: Number of feedback documents.
        expanded_query_terms: Number of expansion terms to keep.
        alpha: Interpolation weight forwarded to the term selection.
        mu: Smoothing parameter forwarded to the term selection.
    """
    import os  # local import: the notebook's import cell does not provide os

    expand_q = expanded_query_BM25(search, RM3_term_selection, k1, b, alpha, top_PRD, expanded_query_terms, mu)

    sim = BM25Similarity(k1, b)
    name = 'prm_' + 'BM25_' + str(k1) + '_' + str(b)

    file_name = f'./res_RM3/{q_name}/{q_name}_mu_' + str(mu) + '_docs_' + str(top_PRD) + '_terms_' + str(expanded_query_terms) + '_alpha_' + str(alpha) + '.txt'

    # Retrieve first and write afterwards, so that a failure during retrieval
    # does not leave a truncated or empty result file behind.
    progress = tqdm(zip(query_all.keys(), expand_q), total=len(query_all), colour='cyan', desc='Re-retrival')
    results = [search_retrived(indexReader, expanded, qid, sim, name) for qid, expanded in progress]

    os.makedirs(os.path.dirname(file_name), exist_ok=True)
    with open(file_name, "w") as out_file:
        out_file.write(''.join(results))

In [88]:
# BM25 parameters; read as globals by run_RM3.
k1 = 0.8
b = 0.4

# Parameter grid: every combination of the values below is run once.
top_PRD = [25]                 # number of feedback documents
expanded_query_terms = [10]    # number of expansion terms
alpha = [0.7]                  # feedback interpolation weight
mu = [750]                     # smoothing parameter

parameters = list(itertools.product(top_PRD, expanded_query_terms, alpha, mu))

# The loop variables use their own names so that the grid lists above are not
# overwritten by scalars while the loop runs.
for num_doc, num_terms, alpha_value, mu_value in tqdm(parameters, colour='red'):
    run_RM3(num_doc, num_terms, alpha_value, mu_value)

  0%|[31m          [0m| 0/1 [00:00<?, ?it/s]

1 International Organized Crime
{'affair': 0.0369,
 'crime': 0.2914,
 'crimin': 0.0801,
 'edict': 0.0322,
 'feder': 0.065,
 'fight': 0.03,
 'intern': 0.1329,
 'ministri': 0.0355,
 'organ': 0.2001,
 'russian': 0.0958}
2 Poliomyelitis and Post-Polio
{'case': 0.0472,
 'diseas': 0.0615,
 'erad': 0.0322,
 'gujarat': 0.0349,
 'health': 0.0641,
 'hemispher': 0.0444,
 'polio': 0.3296,
 'poliomyel': 0.1602,
 'post': 0.1,
 'vaccin': 0.0849,
 'viru': 0.0409}
3 Hubble Telescope Achievements
{'achiev': 0.1,
 'flaw': 0.0301,
 'hubbl': 0.2434,
 'mirror': 0.0752,
 'nasa': 0.0658,
 'optic': 0.0373,
 'shuttl': 0.0257,
 'space': 0.0589,
 'telescop': 0.2788,
 'test': 0.0507,
 'truli': 0.0341}
4 Endangered Species (Mammals)
{'endang': 0.1504,
 'fish': 0.0372,
 'fisheri': 0.0846,
 'incident': 0.055,
 'mammal': 0.2467,
 'marin': 0.1057,
 'nmf': 0.0562,
 'permit': 0.033,
 'speci': 0.193,
 'vessel': 0.0383}
5 Most Dangerous Vehicles
{'brake': 0.0553,
 'danger': 0.1,
 'employe': 0.0523,
 'fleet': 0.0503,
 'mach

Re-retrival: 100%|[36m██████████[0m| 50/50 [00:05<00:00,  8.45it/s]
100%|[31m██████████[0m| 1/1 [00:41<00:00, 41.34s/it]
