## Rocchio & Retrieval

### Imports and data path

In [1]:
import lucene
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.store import FSDirectory
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.search.similarities import BM25Similarity
from org.apache.lucene.search.similarities import LMJelinekMercerSimilarity
from org.apache.lucene.search.similarities import LMDirichletSimilarity
from org.apache.lucene.analysis.en import EnglishAnalyzer
from java.io import File

from org.apache.lucene.search import BooleanQuery
from org.apache.lucene.search import BooleanClause
from org.apache.lucene.search import TermQuery
from org.apache.lucene.search import BoostQuery
from org.apache.lucene.index import Term
from org.apache.lucene.util import BytesRefIterator
# run this again if VM is not initialized already
lucene.initVM()


<jcc.JCCEnv at 0x7f58b2991bf0>

In [2]:
# Name of the topic set; it is also interpolated into the result-file paths
# built by the retrieval function further down.
q_name = 'trec6'
# Topic (query) file for the chosen set, given as a relative path.
topicFilePath = f"../../{q_name}.xml"  # 50 queries

In [3]:
import xml.etree.ElementTree as ET

# Parse the topic file once. `topics` is the XML root element; the cells below
# iterate over it and read the 'num' and 'title' children of every topic.
tree = ET.parse(topicFilePath)
topics = tree.getroot()

In [4]:
# Location of the Lucene index on disk (relative path).
index_path = '../../TREC678/documents_index/'
# Shared, module-level reader: the weighting helpers below use it for
# collection statistics (numDocs, docFreq) and per-document term vectors.
directory = FSDirectory.open(File(index_path).toPath())
indexReader = DirectoryReader.open(directory)

### Rocchio

In [5]:
import math
FIELDNAME = 'CONTENTS'       # name of the Lucene index field that is searched


# Average query length (in analyzed terms) over all topics.
# Used as the length-normalisation reference in BM25_query().
analyzer = EnglishAnalyzer()
query_lens = []
for topic in topics:
    queryKeywordsField = 'title'     # other fields are 'desc' and 'narr'
    q = topic.find(queryKeywordsField).text.strip()

    # A few titles contain '/', which the query parser treats as a special
    # character; the raw title is therefore escaped before parsing.
    escaped_q = QueryParser(FIELDNAME, analyzer).escape(q)
    query = QueryParser(FIELDNAME, analyzer).parse(escaped_q)
    # The parsed query is split on whitespace and the leading "<FIELDNAME>:"
    # (len(FIELDNAME)+1 characters) is cut off every token.
    query_terms = [term.strip()[len(FIELDNAME)+1:]
                   for term in query.toString().split()]
    query_lens.append(len(query_terms))
avgdl_query = sum(query_lens)/len(query_lens)

# Average document length of the collection: total number of term occurrences
# in FIELDNAME divided by the number of documents. Used in BM25_docVec().
N = indexReader.numDocs()
avgdl_collection = indexReader.getSumTotalTermFreq(FIELDNAME)/N


In [6]:
# Sanity check: the numerator used for avgdl_collection above.
print(indexReader.getSumTotalTermFreq(FIELDNAME))


188624297


In [7]:
def tf_idf_query(term, query_terms):
    """Return the TF-IDF weight of ``term`` inside the query.

    The term frequency is the number of occurrences of ``term`` in
    ``query_terms`` divided by the query length; the inverse document
    frequency is ``ln(N / (df + 1))`` where ``N`` is the number of documents
    in the index and ``df`` the document frequency of the term in FIELDNAME.

    Args:
        term (str): the query term to weight
        query_terms (list): all terms of the query (duplicates included)

    Returns:
        float: TF-IDF weight of the term
    """
    query_len = len(query_terms)
    num_docs = indexReader.numDocs()
    occurrences = query_terms.count(term)
    doc_freq = indexReader.docFreq(Term(FIELDNAME, term))

    normalized_tf = occurrences / query_len
    idf = math.log(num_docs / (doc_freq + 1))
    return normalized_tf * idf

def tf_idf_docVec(docVec, D):
    """Replace the raw statistics in a document vector by TF-IDF weights.

    Args:
        docVec (dict): term -> sequence whose first item is the term
            frequency in the document and whose second item is the document
            frequency of the term; modified in place
        D (int): document length used to normalise the term frequency

    Returns:
        dict: the same ``docVec`` object, now mapping term -> TF-IDF weight
    """
    num_docs = indexReader.numDocs()    # collection size
    for token, stats in docVec.items():
        term_freq, doc_freq = stats[0], stats[1]
        inverse_doc_freq = math.log(num_docs / (doc_freq + 1))
        # only the value of an existing key is replaced, so updating the
        # dict while iterating over it is safe
        docVec[token] = (term_freq / D) * inverse_doc_freq
    return docVec

def BM25_query(term, query_terms, k1=0.8, b=0.4):
    """Return the Okapi BM25 weight of ``term`` inside the query.

    The query is treated like a short document whose length is normalised
    against ``avgdl_query``, the average query length over all topics.

    Args:
        term (str): the query term to weight
        query_terms (list): all terms of the query (duplicates included)
        k1 (float): BM25 k1 parameter
        b (float): BM25 b parameter

    Returns:
        float: BM25 weight of the term
    """
    query_len = len(query_terms)
    num_docs = indexReader.numDocs()
    occurrences = query_terms.count(term)
    doc_freq = indexReader.docFreq(Term(FIELDNAME, term))

    idf = math.log(1 + ((num_docs - doc_freq + 0.5) / (doc_freq + 0.5)))

    numerator = occurrences * (1 + k1)
    length_norm = (1 - b) + (b * query_len / avgdl_query)
    denominator = occurrences + k1 * length_norm
    return (numerator / denominator) * idf

def BM25_docVec(docVec, D, k1=0.8, b=0.4):
    """Replace the raw statistics in a document vector by Okapi BM25 weights.

    Args:
        docVec (dict): term -> sequence whose first item is the term
            frequency in the document and whose second item is the document
            frequency of the term; modified in place
        D (int): document length, normalised against ``avgdl_collection``
        k1 (float): BM25 k1 parameter
        b (float): BM25 b parameter

    Returns:
        dict: the same ``docVec`` object, now mapping term -> BM25 weight
    """
    num_docs = indexReader.numDocs()    # collection size
    for token, stats in docVec.items():
        term_freq, doc_freq = stats[0], stats[1]
        idf = math.log(1 + ((num_docs - doc_freq + 0.5) / (doc_freq + 0.5)))

        numerator = term_freq * (1 + k1)
        length_norm = (1 - b) + (b * D / avgdl_collection)
        denominator = term_freq + k1 * length_norm
        # only the value of an existing key is replaced, so updating the
        # dict while iterating over it is safe
        docVec[token] = (numerator / denominator) * idf
    return docVec

def getDocumentVector(luceneDocid, weightScheme):
    """Build the weighted term vector of one indexed document.

    The stored term vector of FIELDNAME is read through the module-level
    ``indexReader``; every term is then weighted with the requested scheme.

    Args:
        luceneDocid (int): internal Lucene document id
        weightScheme (string): 'TFIDF' or 'BM25'

    Returns:
        dict: term -> weight. For any other ``weightScheme`` the values are
        left as the raw ``[tf, df]`` pairs. An empty dict is returned when
        the document has no term vector for FIELDNAME.
    """
    terms = indexReader.getTermVector(luceneDocid, FIELDNAME)
    if terms is None:
        # Lucene returns null when no term vector exists for this field of
        # the document (e.g. a document without content); treat it as an
        # empty vector instead of failing on terms.iterator().
        return {}

    docVec = {}     # term -> [tf, df] first, replaced by weights below
    D = 0           # doc length, i.e., total no. of tokens in the doc
    iterator = terms.iterator()
    for term in BytesRefIterator.cast_(iterator):
        t = term.utf8ToString()
        tf = iterator.totalTermFreq()    # termFreq of term,t
        df = indexReader.docFreq(Term(FIELDNAME, t))    # docFreq of term,t
        D += tf
        docVec[t] = [tf, df]

    if weightScheme == 'TFIDF':
        docVec = tf_idf_docVec(docVec, D)
    elif weightScheme == 'BM25':
        docVec = BM25_docVec(docVec, D)

    return docVec

In [8]:
def rocchio_PRF(query, top_k_docs, N, alpha, beta, weightScheme):
    """Implements Rocchio's (pseudo) relevance feedback and returns a modified query

    The modified query vector is ``alpha * Q0 + beta * centroid`` where ``Q0``
    is the weighted original query and ``centroid`` is the average of the
    weighted document vectors of ``top_k_docs``. No negative-feedback term is
    used. The ``N`` highest-weighted terms are kept.

    Args:
        query (org.apache.lucene.search.Query): lucene parsed version of the initial/original query
        top_k_docs (lucene._lucene.JArray_object): scoreDocs returned after performing search with top k results
        N (int): number of terms to be in the returned modified query
        alpha (float): weight for original query
        beta (float): weight for positive feedback
        weightScheme (string): TFIDF or BM25 for term weighting

    Returns:
        org.apache.lucene.search.BooleanQuery: one SHOULD clause per selected
        term, each a TermQuery on FIELDNAME boosted by its Rocchio weight
    """

    # processing JQuery object to extract query terms in form of a list:
    # the printed query is split on whitespace and the "<FIELDNAME>:" prefix
    # is cut off every token
    prefix_len = len(FIELDNAME) + 1
    query_terms = [token.strip()[prefix_len:]
                   for token in query.toString().split()]

    # creating query vector Q0
    Q0_vector = {}
    for term in query_terms:
        if term in Q0_vector:
            # repeated query term: the weight is identical, skip the extra
            # index lookups
            continue
        if weightScheme == 'TFIDF':
            Q0_vector[term] = tf_idf_query(term, query_terms)
        elif weightScheme == 'BM25':
            Q0_vector[term] = BM25_query(term, query_terms)

    # vector sum of all (pseudo) relevant document vectors
    sumRelDocsVector = {}
    numRel = 0
    for scoreDoc in top_k_docs:
        docVec = getDocumentVector(scoreDoc.doc, weightScheme)
        numRel += 1
        for term, weight in docVec.items():
            if term in sumRelDocsVector:
                sumRelDocsVector[term] += weight
            else:
                sumRelDocsVector[term] = weight

    # normalized Relevant Docs Vector (centroid); empty when there were no docs
    r = {term: total / numRel for term, total in sumRelDocsVector.items()}

    # final Rocchio formula for Qm
    expanded_query = [
        [term, alpha * Q0_vector.get(term, 0) + beta * r.get(term, 0)]
        for term in set(Q0_vector) | set(r)]

    # Sort by descending score. Ties are broken on the term itself: the
    # candidates come from a set, whose iteration order can differ between
    # interpreter runs, so without a tie-break the top-N cut-off would not be
    # reproducible.
    expanded_query.sort(key=lambda item: (-item[1], item[0]))
    # selecting top N expanded query terms
    Qm_with_scores = expanded_query[:int(N)]

    # weighting expanded query terms; the clause limit is a global setting,
    # so it is raised once instead of once per term
    BooleanQuery.setMaxClauseCount(4096)
    booleanQuery = BooleanQuery.Builder()
    for term, weight in Qm_with_scores:
        boostedTermQuery = BoostQuery(TermQuery(Term(FIELDNAME, term)), weight)
        booleanQuery.add(boostedTermQuery, BooleanClause.Occur.SHOULD)

    return booleanQuery.build()   # modified query


### BM25 + Rocchio Retrieval

In [9]:
def bm25_rocchio(numPRD, N, alpha, beta, weightScheme='TFIDF'):
    """ Performs bm25 search with Rocchio pseudo relevance feedback
        on a set of queries and output the result in a file

    For every topic the title is searched with BM25, the top ``numPRD`` hits
    are fed to ``rocchio_PRF`` and the expanded query is searched again; the
    top 1000 hits of that second search are written in TREC run format.
    The output directory is created when it does not exist yet.

    Args:
        numPRD: no. of pseudo relevant docs
        N: no. of expansion terms
        alpha, beta: Rocchio model parameters
        weightScheme (string): TFIDF or BM25 for term weighting; any other
            value falls back to TFIDF with a printed warning

    Returns:
        None
    """
    import os   # local import: only needed to create the output directory

    k1 = 0.8
    b = 0.4
    similarityModel = BM25Similarity(k1, b)

    if weightScheme not in ('BM25', 'TFIDF'):
        print('Warning: weightScheme entered not a valid parameter value. Taking default weightScheme: TFIDF')
        weightScheme = 'TFIDF'

    # change result file path below
    rocchioOutputPath = f"./Rocchio_output/{weightScheme}/{q_name}/{q_name}_BM25_Rocchio_numPRD={numPRD}_N={N}_alpha={alpha}_beta={beta}_{weightScheme}.res"
    os.makedirs(os.path.dirname(rocchioOutputPath), exist_ok=True)

    qidField = 'num'
    queryKeywordsField = 'title'     # other fields are 'desc' and 'narr'
    k = 1000                         # no. of results written per query

    analyzer = EnglishAnalyzer()    # used same analyzer as indexer
    parser = QueryParser(FIELDNAME, analyzer)

    # setting up the searcher; reader and directory are released in the
    # finally block so repeated calls do not accumulate open index handles
    directory = FSDirectory.open(File(index_path).toPath())
    reader = None
    try:
        reader = DirectoryReader.open(directory)
        searcher = IndexSearcher(reader)
        # setting the similarity model
        searcher.setSimilarity(similarityModel)

        # the context manager closes the result file even if a search fails
        with open(rocchioOutputPath, 'w') as f:
            # search on all queries from the topic file
            for topic in topics:
                qid = topic.find(qidField).text.strip()
                q = topic.find(queryKeywordsField).text.strip()

                # a few titles had '/' in them, which the query parser could
                # not handle without escaping those special characters
                escaped_q = parser.escape(q)
                query = parser.parse(escaped_q)

                # getting the top pseudo relevant docs using the searcher
                scoreDocs = searcher.search(query, numPRD).scoreDocs

                # Rocchio expanded query retrieval
                modified_query = rocchio_PRF(
                    query, scoreDocs, N=N, alpha=alpha, beta=beta, weightScheme=weightScheme)

                # getting the top k search results using the searcher
                scoreDocs = searcher.search(modified_query, k).scoreDocs

                # writing all k doc results in a .res file in TREC format
                results = [
                    f"{qid}\tQ0\t{searcher.doc(scoreDoc.doc).get('ID')}\t{rank}\t{scoreDoc.score}\tBM25_{k1}-{b}-rocchio_{alpha}_{beta}\n"
                    for rank, scoreDoc in enumerate(scoreDocs, start=1)]
                f.write(''.join(results))
    finally:
        if reader is not None:
            reader.close()
        directory.close()


In [10]:
import itertools
from tqdm import tqdm

# Parameter grid: one list of candidate values per parameter. Every
# combination is run once.
numPRD = [11]
N = [45]
alpha = [1]
beta = [22]

parameters = list(itertools.product(numPRD, N, alpha, beta))
# NOTE(review): the loop variables reuse the names of the grid lists, so after
# this cell numPRD, N, alpha and beta hold the values of the last combination
# (and N no longer holds the document count assigned in an earlier cell).
for numPRD, N, alpha, beta in tqdm(parameters, colour='red'):

    # lmjm_rocchio(numPRD=numPRD,N=N,alpha=alpha,beta=beta, weightScheme='BM25')
    bm25_rocchio(numPRD=numPRD, N=N, alpha=alpha, beta=beta, weightScheme='BM25')


100%|[31m██████████[0m| 1/1 [01:00<00:00, 60.61s/it]


### LMJM + Rocchio Retrieval

In [160]:
# def lmjm_rocchio(numPRD, N, alpha, beta, weightScheme='TFIDF'):
#     """ Performs LMJM search with Rocchio pseudo relevance feedback 
#         on a set of queries and output the result in a file

#     Args:
#         numPRD: no. of pseudo relevant docs
#         N: no. of expansion terms
#         alpha, beta: Rocchio model parameters
#         weightScheme (string): TFIDF or BM25 for term weighting
        
#     Returns:
#         None
#     """
     
    
#     model = 'lmjm'
#     LAMBDA = 0.4   # LM-JM baseline lambda parameter
#     similarityModel = LMJelinekMercerSimilarity(LAMBDA)

#     # k1 = 0.8
#     # b = 0.4
#     # similarityModel = BM25Similarity(k1,b)

#     # change result file path below
#     if weightScheme == 'BM25' or weightScheme == 'TFIDF':
#         rocchioOutputPath = f"./Rocchio_output/{weightScheme}/{q_name}_LMJM_Rocchio_numPRD={numPRD}_N={N}_alpha={alpha}_beta={beta}_{weightScheme}.res"
#     else:
#         print('Warning: weightScheme entered not a valid parameter value. Taking default weightScheme: TFIDF')
#         weightScheme = 'TFIDF'
#         rocchioOutputPath = f"./Rocchio_output/{weightScheme}/{q_name}_LMJM_Rocchio_numPRD={numPRD}_N={N}_alpha={alpha}_beta={beta}_{weightScheme}.res"
    
#     f = open(rocchioOutputPath, 'w')

#     # setting up the searcher
#     analyzer = EnglishAnalyzer()    # used same analyzer as indexer
#     index_path = '../../TREC678/documents_index/'
#     directory = FSDirectory.open(File(index_path).toPath())
#     searcher = IndexSearcher(DirectoryReader.open(directory))
#     # setting the similarity model
#     searcher.setSimilarity(similarityModel)

#     print('\nRetrieving ...')

#     # search on 50 queries from the topic file 'trec6.xml'
#     for topic in topics:
#         qidField = 'num'
#         queryKeywordsField = 'title'     # other fields are 'desc'and 'narr'

#         qid = topic.find(qidField).text.strip()
#         q = topic.find(queryKeywordsField).text.strip()

#         escaped_q = QueryParser(FIELDNAME, analyzer).escape(q)      # a few titles had '/' in them which 
#                                                                     # EnglishAnalyzer was not able to parse
#                                                                     # without escaping those special characters
#         query = QueryParser(FIELDNAME, analyzer).parse(escaped_q)

#         print(f'Rocchio {weightScheme}, numPRD = {numPRD}, N = {N}, alpha = {alpha}, beta = {beta}; qid = {qid}, retrieving & writing ...', end=' ')

#         # getting the top pseudo relevant docs using the searcher
#         scoreDocs = searcher.search(query, numPRD).scoreDocs

#         # Rocchio expanded query retrieval
#         modified_query = rocchio_PRF(query, scoreDocs, N=N, alpha=alpha, beta=beta, weightScheme=weightScheme)

#         # getting the top k search results using the searcher
#         k = 1000
#         scoreDocs = searcher.search(modified_query, k).scoreDocs

#         # writing all k doc results in a .res file in TREC format
#         rank = 0
#         results = ''
#         for scoreDoc in scoreDocs:
#             rank += 1
#             doc = searcher.doc(scoreDoc.doc)
#             # f.write(f"{qid}\tQ0\t{doc.get('DOCID')}\t{rank}\t{scoreDoc.score}\taman_lmjm_{LAMBDA}-rocchio_{alpha}_{beta}\n")
#             results += f"{qid}\tQ0\t{doc.get('ID')}\t{rank}\t{scoreDoc.score}\taman_lmjm_{LAMBDA}-rocchio_{alpha}_{beta}\n"
        
#         f.write(results)

#         print('complete!')

#     f.close()
#     print('Search completed! Search results exported to a .res file in the current directory.\n')

#### Finding max MAP for LMJM+Rocchio-TFIDF

In [161]:
# numPRD = 11
# N = 45

# alpha = 1
# beta = 30

# lmjm_rocchio(numPRD=numPRD,N=N,alpha=alpha,beta=beta, weightScheme='TFIDF')
# # lmjm_rocchio(numPRD=numPRD, N=N, alpha=alpha, beta=beta, weightScheme='TFIDF')

In [162]:
# highest MAP value and corresponding params
# LMJM with Rocchio TFIDF

