#### imports

In [25]:
import math
import lucene
import time
import itertools
import numpy as np
from tqdm import tqdm
from java.io import File
import xml.etree.ElementTree as ET
from collections import defaultdict
from org.apache.lucene.store import FSDirectory
from org.apache.lucene.util import BytesRefIterator
from org.apache.lucene.index import DirectoryReader, Term
from org.apache.lucene.analysis.en import EnglishAnalyzer
from org.apache.lucene.analysis.core import WhitespaceAnalyzer
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.search import IndexSearcher, BooleanQuery, BooleanClause, TermQuery, BoostQuery
from org.apache.lucene.search.similarities import BM25Similarity, LMJelinekMercerSimilarity, LMDirichletSimilarity
lucene.initVM()

<jcc.JCCEnv at 0x7f0dad7d4d50>

In [26]:
# Name of the topic set; interpolated into the topics file path and the output run path.
q_name = 'trec6'

In [27]:
# Input locations, relative to the notebook's working directory.
index_path = '../index/'
topicFilePath = f'../{q_name}.xml'
qrel_file = '../trec678_robust.qrel'

# Open the on-disk Lucene index once; the resulting reader is shared (as a module-level
# global) by the search / expansion functions defined in the following cells.
directory = FSDirectory.open(File(index_path).toPath())
indexReader = DirectoryReader.open(directory)

In [28]:
def query_topics(xml_file):
    """Parse a topics XML file into a ``{topic number: title}`` dictionary.

    Args:
        xml_file: Path (or file-like object) accepted by ``ET.parse``. Every
            ``<top>`` element directly under the root must contain a ``<num>``
            and a ``<title>`` child.

    Returns:
        dict mapping the whitespace-stripped ``<num>`` text to the
        whitespace-stripped ``<title>`` text, in document order.
    """
    topic_nodes = ET.parse(xml_file).getroot().findall('top')
    return {
        node.find('num').text.strip(): node.find('title').text.strip()
        for node in topic_nodes
    }

In [29]:
def makeRelJudgeDict(qrelFilePath):
    """Load a qrel file into a nested relevance-judgement dictionary.

    Each usable line is whitespace-separated with at least four columns; column 0
    is the query id, column 2 the document id and column 3 the integer judgement
    (column 1 is ignored).

    Args:
        qrelFilePath: Path of the qrel file to read.

    Returns:
        ``{qid: {docid: judgement, ...}, ...}`` with string ids and int judgements.

    Raises:
        ValueError: if the judgement column of a line is not an integer.
    """
    # {qid1:{docid1:0/1,docid2:0/1,...}, qid2:{docid2:0/1,docid4:0/1,...},...}
    relJudgeDict = {}
    with open(qrelFilePath, 'r') as f:
        for line in f:
            fields = line.split()
            if len(fields) < 4:
                # Blank (e.g. trailing newline) or truncated line: nothing to record.
                continue
            qid, docid, judgement = fields[0], fields[2], int(fields[3])
            relJudgeDict.setdefault(qid, {})[docid] = judgement
    return relJudgeDict

def isTrueRelevant(qid, docid, relJudgeDict):
    """Return True iff ``docid`` is judged relevant (judgement == 1) for ``qid``.

    Args:
        qid: Query id, as used for the outer keys of ``relJudgeDict``.
        docid: Document id, as used for the inner keys of ``relJudgeDict``.
        relJudgeDict: ``{qid: {docid: judgement}}`` mapping.

    Returns:
        bool. False for unknown queries, unjudged documents and any judgement
        other than 1 (always a real bool, never None).
    """
    # 1 -> Relevant TRF
    # NOTE(review): graded qrels (judgement 2, ...) are treated as "not relevant" here,
    # matching the original equality test; switch to ``>= 1`` if grades should count.
    return relJudgeDict.get(qid, {}).get(docid) == 1

def isTrueNonRelevant(qid, docid, relJudgeDict):
    """Return True iff ``docid`` is explicitly judged non-relevant (judgement == 0) for ``qid``.

    Args:
        qid: Query id, as used for the outer keys of ``relJudgeDict``.
        docid: Document id, as used for the inner keys of ``relJudgeDict``.
        relJudgeDict: ``{qid: {docid: judgement}}`` mapping.

    Returns:
        bool. False for unknown queries, unjudged documents and any judgement
        other than 0 (always a real bool, never None).
    """
    # 0 -> Non-relevant TRF; an unjudged document is NOT counted as non-relevant.
    return relJudgeDict.get(qid, {}).get(docid) == 0

# Judgements are loaded once at module level; search() reads this global directly.
relJudgeDict = makeRelJudgeDict(qrel_file)

In [30]:
# {topic number: title} for the selected topic set; iterated as a global by the expansion and run functions.
query_all = query_topics(topicFilePath)

In [31]:
def getDocumentVector(luceneDocid, indexReader):
    """Build the length-normalised term-frequency vector of one document.

    Reads the stored term vector of the ``CONTENTS`` field and divides each
    term's frequency by the sum of all term frequencies in the document.

    Args:
        luceneDocid: Internal Lucene document number.
        indexReader: Reader exposing ``getTermVector(docid, field)``.

    Returns:
        dict ``{term: tf / document length}``. Empty when the document has no
        term vector for ``CONTENTS`` or contains no terms.
    """
    terms = indexReader.getTermVector(luceneDocid, 'CONTENTS')
    if terms is None:
        # Lucene returns null when the field is missing or was indexed without term vectors.
        return {}

    counts = {}
    iterator = terms.iterator()
    for term in BytesRefIterator.cast_(iterator):
        # The enum is positioned on `term`, so the frequency is read from the iterator itself.
        counts[term.utf8ToString()] = iterator.totalTermFreq()

    D = sum(counts.values())
    if not D:
        return {}

    return {term: tf / D for term, tf in counts.items()}


### RM3


In [32]:
def search(indexReader, query, similarity, top_rel_doc, qid, tpd, tnd):
    """Run the initial retrieval and collect candidate expansion terms from judged documents.

    The raw title is escaped (a few titles contain '/', which the query parser
    would otherwise treat as syntax), parsed with an EnglishAnalyzer and searched.
    Retrieved documents are split, using the module-level ``relJudgeDict``, into
    judged-relevant and judged-non-relevant lists, keeping rank order.

    Args:
        indexReader: Open Lucene index reader.
        query: Raw query string (topic title).
        similarity: Lucene similarity used for the initial ranking.
        top_rel_doc: Number of top-ranked documents to retrieve.
        qid: Topic id used to look up judgements.
        tpd: Maximum number of judged-relevant documents to keep.
        tnd: Maximum number of judged-non-relevant documents to keep.

    Returns:
        ``(terms, docids)``: ``terms`` is the set of purely alphabetic terms that
        occur in the kept relevant documents but in none of the kept non-relevant
        ones, plus the analysed query terms; ``docids`` is the list of Lucene ids
        of the kept relevant documents.
    """
    analyzer = EnglishAnalyzer()
    searcher = IndexSearcher(indexReader)
    searcher.setSimilarity(similarity)

    escaped = QueryParser('CONTENTS', analyzer).escape(query)
    parsed_query = QueryParser("CONTENTS", analyzer).parse(escaped)

    hits = searcher.search(parsed_query, top_rel_doc).scoreDocs

    relevant_ids, nonrelevant_ids = [], []
    for hit in hits:
        lucene_id = hit.doc
        stored = searcher.doc(lucene_id)
        if isTrueRelevant(qid, stored.get('ID'), relJudgeDict):
            relevant_ids.append(lucene_id)
        if isTrueNonRelevant(qid, stored.get('ID'), relJudgeDict):
            nonrelevant_ids.append(lucene_id)

    docids = relevant_ids[:tpd]
    negative_ids = nonrelevant_ids[:tnd]

    def vocabulary(ids):
        # Union of the term-vector vocabularies of the given documents.
        vocab = set()
        for lucene_id in ids:
            vocab.update(getDocumentVector(lucene_id, indexReader).keys())
        return vocab

    positive_terms = vocabulary(docids)
    negative_terms = vocabulary(negative_ids)

    # Each clause prints as 'CONTENTS:<term>'; dropping the first 9 characters leaves the term.
    query_terms = {clause.strip()[9:] for clause in parsed_query.toString().split()}

    candidates = (positive_terms - negative_terms) | query_terms
    return {term for term in candidates if term.isalpha()}, docids

In [33]:
def RM3_term_selection(Query, set_ET, docs, indexReader, alpha, mu, expanded_query_terms):
    """Weight candidate expansion terms from feedback documents and mix them with the query.

    For every candidate ``w`` the feedback score is
    ``sum_d P(w|d) * prod_q P_smoothed(q|d)`` over the feedback documents, where
    ``P(w|d)`` is the normalised term frequency from ``getDocumentVector`` and the
    query-term probabilities are smoothed with the collection model. The top
    ``expanded_query_terms`` candidates are normalised to sum to 1 and then
    interpolated with the original query's maximum-likelihood model.

    Args:
        Query: Whitespace-separated analysed query terms.
        set_ET: Set of candidate expansion terms.
        docs: Lucene ids of the feedback documents.
        indexReader: Open Lucene index reader.
        alpha: Weight of the feedback model; ``1 - alpha`` goes to the original query.
        mu: Smoothing parameter used in the per-document mixing weight.
        expanded_query_terms: Number of top-scoring candidates to keep.

    Returns:
        dict ``{term: weight}`` ordered by decreasing weight. When the feedback
        model carries no mass (no feedback documents, or every candidate scores 0)
        only the original query terms are returned.
    """
    Q = Query.split()
    totalTF = indexReader.getSumTotalTermFreq("CONTENTS")

    # Collection probabilities are only consumed when smoothing query terms, so only
    # those are looked up (one index call per distinct query term).
    cf = {q: indexReader.totalTermFreq(Term("CONTENTS", q)) / totalTF for q in set(Q)}

    docVectors = {d: getDocumentVector(d, indexReader) for d in docs}

    # The smoothed query likelihood depends on the document only, so compute it once
    # per document instead of once per (candidate, document) pair.
    query_likelihood = {}
    for d in docs:
        vec = docVectors[d]
        # NOTE(review): the mixing weight uses the number of distinct terms in the
        # document, not its token count - confirm this is the intended length.
        denom = len(vec) + mu
        ml = len(vec) / denom if denom else 0.0
        p_q = 1
        for q in Q:
            p_q = p_q * (ml * vec.get(q, 0) + (1 - ml) * cf[q])
        query_likelihood[d] = p_q

    weight = {}
    for w in set_ET:
        p_wr = 0
        for d in docs:
            p_wr = p_wr + docVectors[d].get(w, 0) * query_likelihood[d]
        weight[w] = p_wr

    top_terms = sorted(weight.items(), key=lambda x: x[1], reverse=True)[:expanded_query_terms]
    norm = sum(score for _, score in top_terms)
    if norm > 0:
        weight = {w: score / norm for w, score in top_terms}
    else:
        # No feedback evidence: dividing by zero would crash, so fall back to the query alone.
        weight = {}

    query_len = len(Q)
    for w in weight.keys() | set(Q):
        p_original = Q.count(w) / query_len if query_len else 0.0
        weight[w] = (alpha * weight.get(w, 0)) + (1 - alpha) * p_original

    return dict(sorted(weight.items(), key=lambda x: x[1], reverse=True))

In [34]:
def expanded_query_BM25(search, RM3_term_selection, k1, b, alpha, top_rel_doc, expanded_query_terms, mu, tpd, tnd):
    """Build one weighted, expanded Lucene query per topic in the global ``query_all``.

    For each topic the title is escaped and analysed, ``search`` supplies the
    candidate terms and feedback documents, ``RM3_term_selection`` turns them into
    term weights, and the weights become boosted SHOULD clauses of a BooleanQuery.

    Args:
        search: Callable with the signature of the notebook's ``search`` function.
        RM3_term_selection: Callable with the signature of the notebook's
            ``RM3_term_selection`` function.
        k1, b: BM25 parameters for the initial retrieval.
        alpha: Feedback interpolation weight passed to ``RM3_term_selection``.
        top_rel_doc: Number of documents retrieved in the initial pass.
        expanded_query_terms: Number of expansion terms to keep.
        mu: Smoothing parameter passed to ``RM3_term_selection``.
        tpd, tnd: Number of judged-relevant / judged-non-relevant documents used.

    Returns:
        list of BooleanQuery objects, in the iteration order of ``query_all``.
    """
    analyzer = EnglishAnalyzer()
    similarity = BM25Similarity(k1, b)
    parser = QueryParser('CONTENTS', analyzer)

    # Global Lucene setting; it does not depend on the topic or the term, so set it once.
    BooleanQuery.setMaxClauseCount(4096)

    expanded_q = []
    for qid, q in query_all.items():
        # A few titles contain '/', which the parser would treat as query syntax.
        query = parser.parse(parser.escape(q))

        # Each clause prints as 'CONTENTS:<term>'; dropping 9 characters leaves the term.
        query_terms = [term.strip()[9:] for term in query.toString().split()]
        parsed_q = ' '.join(query_terms)

        expension_term_set, docids = search(indexReader, q, similarity, top_rel_doc, qid, tpd, tnd)
        weights = RM3_term_selection(parsed_q, expension_term_set, docids, indexReader, alpha, mu, expanded_query_terms)

        builder = BooleanQuery.Builder()
        for term_text, term_weight in weights.items():
            boosted = BoostQuery(TermQuery(Term('CONTENTS', term_text)), float(term_weight))
            builder.add(boosted, BooleanClause.Occur.SHOULD)

        expanded_q.append(builder.build())

    return expanded_q

In [35]:
def search_retrived(indexReader, Query, Qid, similarity, out_name):
    """Run ``Query`` and format the top 1000 hits as tab-separated run lines.

    Args:
        indexReader: Open Lucene index reader.
        Query: Lucene query object to execute.
        Qid: Topic id written in the first column.
        similarity: Lucene similarity used for ranking.
        out_name: Run tag written in the last column.

    Returns:
        str with one line per hit:
        ``<Qid>\\tQ0\\t<stored ID>\\t<rank from 1>\\t<score>\\t<out_name>\\n``;
        the empty string when nothing is retrieved.
    """
    searcher = IndexSearcher(indexReader)
    searcher.setSimilarity(similarity)

    hits = searcher.search(Query, 1000).scoreDocs  # retrieving top 1000 relDoc

    lines = []
    for rank, hit in enumerate(hits, start=1):
        stored = searcher.doc(hit.doc)
        columns = (Qid, 'Q0', stored.get('ID'), rank, hit.score, out_name)
        lines.append('\t'.join(str(column) for column in columns) + '\n')

    return ''.join(lines)

In [36]:
def run_RM3(top_PRD, expanded_query_terms, alpha, mu,tpd, tnd):
    """Expand every topic, re-retrieve with BM25 and write the run file.

    Reads the module-level ``k1``, ``b``, ``q_name``, ``query_all`` and
    ``indexReader``. The output path is
    ``./res_TRF/<q_name>/<tpd>_<tnd>_<q_name>_mu_<mu>_docs_<top_PRD>_terms_<n>_alpha_<alpha>_tf_.txt``;
    the directory is created when missing.

    Args:
        top_PRD: Number of documents retrieved in the initial pass.
        expanded_query_terms: Number of expansion terms to keep.
        alpha: Feedback interpolation weight.
        mu: Smoothing parameter.
        tpd, tnd: Number of judged-relevant / judged-non-relevant documents used.
    """
    import os  # local import: only needed to create the output directory

    expand_q = expanded_query_BM25(search, RM3_term_selection, k1, b, alpha, top_PRD, expanded_query_terms, mu, tpd, tnd)

    sim = BM25Similarity(k1, b)
    name = 'prm_' + 'BM25_' + str(k1) + '_' + str(b)

    file_name = f'./res_TRF/{q_name}/{tpd}_{tnd}_{q_name}_mu_' + str(mu) +'_docs_' + str(top_PRD) + '_terms_' + str(expanded_query_terms) + '_alpha_' + str(alpha) +'_tf_' +'.txt'
    os.makedirs(os.path.dirname(file_name), exist_ok=True)

    # `with` guarantees the handle is closed even if a retrieval raises part-way through.
    with open(file_name, "w") as out_file:
        # expand_q is built in query_all's iteration order, so the two line up pairwise.
        for qid, expanded in zip(query_all.keys(), expand_q):
            out_file.write(search_retrived(indexReader, expanded, qid, sim, name))

In [48]:
# BM25 parameters; read as module-level globals by run_RM3.
k1 = 0.8
b = 0.4

# Parameter grids: every combination of the values below is run once.
tpd = [20]                    # judged-relevant feedback documents
tnd = [5]                     # judged-non-relevant feedback documents
top_PRD = [1000]              # documents retrieved in the initial pass
expanded_query_terms = [90]   # expansion terms kept
alpha = [0.8]                 # feedback interpolation weight
mu = [30]                     # smoothing parameter

parameters = list(itertools.product(top_PRD, expanded_query_terms, alpha, mu, tpd, tnd))

# Loop variables get their own names so the grid lists above are not rebound to scalars,
# which keeps the grids inspectable after the cell has run.
for num_doc, num_q, alpha_value, mu_value, num_rel, num_nonrel in tqdm(parameters, colour='red'):
    run_RM3(num_doc, num_q, alpha_value, mu_value, num_rel, num_nonrel)

100%|[31m██████████[0m| 1/1 [00:21<00:00, 21.54s/it]


In [38]:
# Ad-hoc check: plain BM25 retrieval for topic 344, used by the inspection cells below.
searcher = IndexSearcher(indexReader)
searcher.setSimilarity(BM25Similarity(0.8,0.4))
analyzer = EnglishAnalyzer()

title = query_all['344']
parser = QueryParser("CONTENTS", analyzer)
# Escape first, as search() does, so titles containing query-syntax characters still parse.
query = parser.parse(parser.escape(title))
print(query)
doc = searcher.search(query, 1000).scoreDocs


CONTENTS:abus CONTENTS:e CONTENTS:mail


In [39]:
# Lucene document numbers of the retrieved hits, in rank order.
dd = [hit.doc for hit in doc]

In [40]:
dd

[238581,
 420113,
 513561,
 455901,
 300468,
 520799,
 126046,
 260886,
 485051,
 289131,
 256376,
 290663,
 222820,
 239870,
 323875,
 221533,
 521967,
 239865,
 239868,
 390773,
 103832,
 317750,
 239869,
 159448,
 239871,
 256366,
 256392,
 232612,
 239867,
 455690,
 198699,
 387242,
 453048,
 454212,
 239866,
 292123,
 455308,
 498291,
 198547,
 476081,
 401652,
 240921,
 527538,
 124189,
 244398,
 121242,
 76923,
 300135,
 269747,
 217862,
 300140,
 266703,
 456029,
 264089,
 293428,
 243148,
 220183,
 266118,
 303975,
 462077,
 447716,
 508743,
 481760,
 274143,
 204710,
 266073,
 292161,
 232611,
 268388,
 216972,
 137565,
 297450,
 221858,
 479663,
 265883,
 302341,
 242686,
 389301,
 436736,
 528062,
 258651,
 293450,
 249865,
 378000,
 476146,
 388431,
 236890,
 131191,
 222232,
 462855,
 293401,
 260825,
 104681,
 271955,
 219910,
 297044,
 429217,
 290662,
 293402,
 293776,
 447488,
 449196,
 238248,
 317037,
 164552,
 195039,
 270032,
 427903,
 348613,
 324444,
 273140,
 2

In [41]:
# Look up the Lucene document number of a known document via its stored ID field.
rd = 'LA120190-0068'
# NOTE(review): the ID is run through the analyzer, so the printed query has two clauses
# (ID:la120190 ID:0068) rather than one exact-match term - the top hit is presumably the
# intended document, but verify the returned ID before relying on it.
q = QueryParser('ID', analyzer).parse(rd)
print(q)
scoreDocs = searcher.search(q, 1).scoreDocs

ID:la120190 ID:0068


In [42]:
scoreDocs

JArray<object>[<ScoreDoc: doc=517246 score=8.223074 shardIndex=0>]

In [43]:
searcher.doc(517246)

<Document: Document<stored,indexed,tokenized,termVector<ID:LA120190-0068> stored,indexed,tokenized,termVector<CONTENTS: 315984 


December 1, 1990, Saturday, Orange County Edition 




Metro; Part B; Page 10; Column 3; Metro Desk 




327 words 




ORANGE COUNTY PERSPECTIVE; 


PARDON MY COMPUTER MESSAGE 




Common sense dictates that anyone using electronic mail be aware that computer 
messages could be read by third parties. To keep communication truly private, 
use the telephone, or better yet, say it in person. 


But Costa Mesa City Manager Allan L. Roeder and Police Chief David L. Snowden 
are justified in being concerned that computer messages exchanged on a variety 
of personal and professional subjects were made public. To their embarrassment, 
they found that a former city police lieutenant tapped their electronic mail 
and shared it with a city councilman, and that apparently included some 
off-color and sexist jokes -- inappropriate in any medium. 


Snowden subsequently 

In [44]:
# relJudgeDict is keyed by the stored 'ID' string (e.g. 'LA120190-0068'), not by the Lucene
# document number, so translate the number first; passing 517246 directly can never match.
isTrueRelevant('344', searcher.doc(517246).get('ID'), relJudgeDict)

False

In [45]:
relJudgeDict['344']

{'FBIS3-11392': 0,
 'FBIS3-14832': 0,
 'FBIS3-21006': 0,
 'FBIS3-21177': 0,
 'FBIS3-2240': 0,
 'FBIS3-23': 0,
 'FBIS3-23757': 0,
 'FBIS3-23823': 0,
 'FBIS3-23945': 0,
 'FBIS3-23947': 0,
 'FBIS3-24653': 0,
 'FBIS3-24656': 0,
 'FBIS3-24678': 0,
 'FBIS3-24680': 0,
 'FBIS3-24980': 0,
 'FBIS3-2516': 0,
 'FBIS3-2798': 0,
 'FBIS3-29180': 0,
 'FBIS3-30429': 0,
 'FBIS3-34527': 0,
 'FBIS3-36078': 0,
 'FBIS3-37944': 0,
 'FBIS3-37947': 0,
 'FBIS3-38997': 0,
 'FBIS3-39579': 0,
 'FBIS3-40468': 0,
 'FBIS3-40494': 0,
 'FBIS3-40497': 0,
 'FBIS3-4209': 0,
 'FBIS3-42726': 0,
 'FBIS3-42982': 0,
 'FBIS3-43020': 0,
 'FBIS3-43132': 0,
 'FBIS3-43186': 0,
 'FBIS3-45657': 0,
 'FBIS3-46596': 0,
 'FBIS3-46614': 0,
 'FBIS3-48422': 0,
 'FBIS3-5103': 0,
 'FBIS3-51561': 0,
 'FBIS3-51695': 0,
 'FBIS3-52211': 0,
 'FBIS3-52643': 0,
 'FBIS3-55906': 0,
 'FBIS3-58': 0,
 'FBIS3-58323': 0,
 'FBIS3-59494': 0,
 'FBIS3-59560': 0,
 'FBIS3-59587': 0,
 'FBIS3-59758': 0,
 'FBIS3-59972': 0,
 'FBIS3-7604': 0,
 'FBIS3-8249': 0,
 'FBIS

In [46]:
87 63 10 105
4
134
149
4
1
6
144
11
86
6
32
26
12
10
57
6
118
3
33
117
19
44
11
8
22
55
97
18
57
4
47
3
93
4
10
30
63
10
47
4
28
48
80
5
35
24


SyntaxError: invalid syntax (571824219.py, line 1)

In [None]:
true_doc = []

SyntaxError: invalid syntax (440626854.py, line 2)

In [None]:
# Opens the file for the summation cell below; the handle is left open (never closed here).
f = open('index_true.txt', 'r')

In [None]:
# Sum the integers stored one per line in index_true.txt.
# The file is opened here (and closed by `with`) so the cell gives the same result every
# time it is run, instead of consuming a handle that another cell opened.
with open('index_true.txt', 'r') as fh:
    i = sum(int(line) for line in fh if line.strip())

In [None]:
i

2090