## Rocchio True Relevance Feedback


Rocchio Query Expansion using True Relevance Feedback

### imports and file path

In [3]:
import math
import lucene
from java.io import File
from org.apache.lucene.store import FSDirectory
from org.apache.lucene.util import BytesRefIterator
from org.apache.lucene.index import DirectoryReader, Term
from org.apache.lucene.analysis.en import EnglishAnalyzer
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.search import IndexSearcher, BooleanQuery, BooleanClause, TermQuery, BoostQuery
from org.apache.lucene.search.similarities import BM25Similarity, LMJelinekMercerSimilarity, LMDirichletSimilarity
lucene.initVM()

<jcc.JCCEnv at 0x7f414d5d3b30>

In [4]:
import xml.etree.ElementTree as ET

# Paths to the Lucene index directory and to the XML topic (query) file.
indexPath = '../../TREC678/documents_index/'
topicFilePath = '../../trec6.xml'
# topicFilePath = '../../trec678-robust.xml'  # 50 queries

# Parse the topic file once; `topics` is the XML root element and is iterated
# topic-by-topic by the cells below.
tree = ET.parse(topicFilePath)
topics = tree.getroot()


# Open a module-level reader over the index. It is used by the weighting
# helpers below for corpus statistics (numDocs, docFreq, term vectors).
index_path = indexPath
directory = FSDirectory.open(File(index_path).toPath())
indexReader = DirectoryReader.open(directory)

#### Rocchio

Average query length and average document length in the collection

In [3]:
FIELDNAME = 'CONTENTS'       # name of the Lucene index field that is searched

# --- Average query length (in analysed terms); consumed by BM25_query(). ---
analyzer = EnglishAnalyzer()
queryKeywordsField = 'title'     # other fields are 'desc' and 'narr'
query_lens = []
for topic in topics:
    raw_title = topic.find(queryKeywordsField).text.strip()
    # Some titles contain characters such as '/' that the query parser treats
    # as syntax, so the raw text is escaped before it is parsed.
    safe_title = QueryParser(FIELDNAME, analyzer).escape(raw_title)
    parsed_title = QueryParser(FIELDNAME, analyzer).parse(safe_title)
    # The parsed query prints as whitespace-separated "FIELD:term" tokens;
    # the number of tokens is the query length.
    query_lens.append(len(parsed_title.toString().split()))
avgdl_query = sum(query_lens) / len(query_lens)

# --- Average document length of the corpus; consumed by BM25_docVec(). ---
N = indexReader.numDocs()
avgdl_collection = indexReader.getSumTotalTermFreq(FIELDNAME) / N

In [4]:
def makeRelJudgeDict(qrelFilePath):
    """Load a qrel (relevance judgment) file into a nested dictionary.

    The nested form allows constant-time relevance checks during Rocchio
    query expansion. Each usable line must have at least four
    whitespace-separated columns, read as ``qid <unused> docid judgement``.
    Blank lines and lines with fewer than four columns are skipped instead of
    raising ``IndexError``.

    Args:
        qrelFilePath (str): path of the relevance judgment file.

    Returns:
        dict: ``{qid1: {docid1: judgement, ...}, qid2: {...}, ...}`` where
        every judgement is the integer found in the fourth column.

    Raises:
        ValueError: if the fourth column of a line is not an integer.
    """
    relJudgeDict = {}
    with open(qrelFilePath, 'r') as f:
        for line in f:
            fields = line.split()
            if len(fields) < 4:
                # blank or truncated line: nothing to record
                continue
            qid, docid, judgement = fields[0], fields[2], int(fields[3])
            relJudgeDict.setdefault(qid, {})[docid] = judgement
    return relJudgeDict


def isTrueRelevant(qid, docid, relJudgeDict):
    """Tell whether `docid` is judged relevant for query `qid`.

    Args:
        qid: query id, used as the outer key of `relJudgeDict`.
        docid: document id, used as the inner key.
        relJudgeDict (dict): nested ``{qid: {docid: judgement}}`` mapping.

    Returns:
        bool: True only when a judgement exists and equals 1. False when the
        query or document is unjudged, or when the judgement has any other
        value (previously such values fell through and returned None).
    """
    return relJudgeDict.get(qid, {}).get(docid) == 1


def isTrueNonRelevant(qid, docid, relJudgeDict):
    """Tell whether `docid` is explicitly judged non-relevant for query `qid`.

    Args:
        qid: query id, used as the outer key of `relJudgeDict`.
        docid: document id, used as the inner key.
        relJudgeDict (dict): nested ``{qid: {docid: judgement}}`` mapping.

    Returns:
        bool: True only when a judgement exists and equals 0. False when the
        query or document is unjudged (unjudged is not the same as judged
        non-relevant), or when the judgement has any other value (previously
        such values fell through and returned None).
    """
    return relJudgeDict.get(qid, {}).get(docid) == 0


# SET this to the relevance judgment (qrel) file path
qrelPath = '../../trec678_robust.qrel'
# Build the nested {qid: {docid: judgement}} dictionary once at module level;
# rocchio_TRF() reads this global for its relevance checks.
relJudgeDict = makeRelJudgeDict(qrelPath)


In [None]:
# Sanity check: for every query id in the judgment dictionary, print how many
# of its judged documents carry the judgement value 1 (relevant).
for keys in relJudgeDict:
    count = sum(1 for judgement in relJudgeDict[keys].values() if judgement == 1)
    print(f'keys:{keys}_reldocs:{count}')

In [22]:
def tf_idf_query(term, query_terms):
    """Return the TF-IDF weight of `term` inside the query.

    The weight is (count of term in query / query length) multiplied by
    log(numDocs / (docFreq + 1)), with both corpus statistics read from the
    module-level index reader for field FIELDNAME.

    Args:
        term (str): the query term to weight.
        query_terms (list): all terms of the query, duplicates included.

    Returns:
        float: the TF-IDF weight.
    """
    corpus_size = indexReader.numDocs()
    doc_freq = indexReader.docFreq(Term(FIELDNAME, term))
    relative_tf = query_terms.count(term) / len(query_terms)
    return relative_tf * (math.log(corpus_size / (doc_freq + 1)))


def tf_idf_docVec(docVec, D):
    """Turn the [tf, df] entries of a document vector into TF-IDF weights.

    The dictionary is modified in place: every value, initially a pair whose
    first item is the term frequency and second item the document frequency,
    is replaced by (tf / D) * log(numDocs / (df + 1)).

    Args:
        docVec (dict): ``{term: [tf, df]}``.
        D: document length used to scale the term frequency.

    Returns:
        dict: the same `docVec` object, now ``{term: weight}``.
    """
    corpus_size = indexReader.numDocs()       # total number of docs in the corpus
    for token, stats in docVec.items():
        # only existing keys are reassigned, so iterating .items() stays valid
        term_freq, doc_freq = stats[0], stats[1]
        inverse_df = math.log(corpus_size / (doc_freq + 1))
        docVec[token] = (term_freq / D) * inverse_df
    return docVec


def BM25_query(term, query_terms, k1=0.8, b=0.4):
    """Return the Okapi BM25 weight of `term` inside the query.

    The query is treated like a short document: its length is normalised
    against the module-level `avgdl_query`, and the document frequency comes
    from the module-level index reader for field FIELDNAME.

    Args:
        term (str): the query term to weight.
        query_terms (list): all terms of the query, duplicates included.
        k1 (float): term-frequency saturation parameter.
        b (float): length-normalisation parameter.

    Returns:
        float: the BM25 weight.
    """
    query_len = len(query_terms)
    corpus_size = indexReader.numDocs()
    term_freq = query_terms.count(term)
    doc_freq = indexReader.docFreq(Term(FIELDNAME, term))

    inverse_df = math.log(1 + ((corpus_size - doc_freq + 0.5) / (doc_freq + 0.5)))
    length_norm = (1 - b) + (b * query_len / avgdl_query)
    saturated_tf = (term_freq * (1 + k1)) / (term_freq + k1 * length_norm)
    return saturated_tf * inverse_df


def BM25_docVec(docVec, D, k1=0.8, b=0.4):
    """Turn the [tf, df] entries of a document vector into Okapi BM25 weights.

    The dictionary is modified in place. The document length `D` is
    normalised against the module-level `avgdl_collection`.

    Args:
        docVec (dict): ``{term: [tf, df]}``.
        D: document length.
        k1 (float): term-frequency saturation parameter.
        b (float): length-normalisation parameter.

    Returns:
        dict: the same `docVec` object, now ``{term: weight}``.
    """
    corpus_size = indexReader.numDocs()       # total number of docs in the corpus
    for token, stats in docVec.items():
        # only existing keys are reassigned, so iterating .items() stays valid
        term_freq, doc_freq = stats[0], stats[1]
        inverse_df = math.log(1 + ((corpus_size - doc_freq + 0.5) / (doc_freq + 0.5)))
        length_norm = (1 - b) + (b * D / avgdl_collection)
        docVec[token] = ((term_freq * (1 + k1)) / (term_freq + k1 * length_norm)) * inverse_df

    return docVec


def getDocumentVector(luceneDocid, weightScheme):
    """Build the normalised term-weight vector of one indexed document.

    The stored term vector of field FIELDNAME is read from the module-level
    index reader, every term is weighted with the requested scheme, and the
    weights are then divided by their sum.

    Args:
        luceneDocid (int): Lucene-internal document number.
        weightScheme (str): 'TFIDF' or 'BM25'.

    Returns:
        dict: ``{term: weight}``. Empty when the reader returns no term vector
        for the document. When the weights sum to zero they are returned
        unnormalised instead of dividing by zero.

    Raises:
        ValueError: if `weightScheme` is neither 'TFIDF' nor 'BM25'.
    """
    if weightScheme not in ('TFIDF', 'BM25'):
        # Fail early with a clear message; otherwise the raw [tf, df] pairs
        # would reach the normalisation step and fail with a TypeError.
        raise ValueError(f"unsupported weightScheme: {weightScheme!r} (expected 'TFIDF' or 'BM25')")

    terms = indexReader.getTermVector(luceneDocid, FIELDNAME)
    if terms is None:
        # no term vector available for this document/field
        return {}

    docVec = {}
    D = 0                           # doc length, i.e., total no. of tokens in the doc
    iterator = terms.iterator()
    for term in BytesRefIterator.cast_(iterator):
        t = term.utf8ToString()
        tf = iterator.totalTermFreq()                           # termFreq of term t
        df = indexReader.docFreq(Term(FIELDNAME, t))            # docFreq of term t
        D += tf
        docVec[t] = [tf, df]

    if weightScheme == 'TFIDF':
        docVec = tf_idf_docVec(docVec, D)
    else:
        docVec = BM25_docVec(docVec, D)

    # Compute the normaliser once instead of once per term.
    total = sum(docVec.values())
    if total == 0:
        return docVec
    return {key: value / total for key, value in docVec.items()}


def rocchio_TRF(query, qid, top_k_docs, searcher, N, alpha, beta, gamma, weightScheme):
    """Expand a query with Rocchio feedback driven by true relevance judgments.

    Each document in `top_k_docs` is looked up in the module-level
    `relJudgeDict`. Documents judged relevant feed the positive centroid,
    documents judged non-relevant feed the negative centroid, and unjudged
    documents are ignored. Candidate terms are those of the original query
    plus those of the relevant documents; each is scored as
    ``alpha*Q0 + beta*rel_centroid - gamma*nonrel_centroid``.

    Args:
        query (org.apache.lucene.search.Query): parsed initial query.
        qid (str): query id used to look up relevance judgments.
        top_k_docs: scoreDocs returned by the initial search.
        searcher (IndexSearcher): used to fetch the stored 'ID' field of a hit.
        N (int): maximum number of terms kept in the modified query.
        alpha (float): weight of the original query vector.
        beta (float): weight of the relevant-documents centroid.
        gamma (float): weight of the non-relevant-documents centroid.
        weightScheme (str): 'TFIDF' or 'BM25' term weighting.

    Returns:
        org.apache.lucene.search.BooleanQuery: SHOULD-clauses of boosted term
        queries for the top-N terms. Terms whose final weight is negative are
        left out, because a boost must not be negative.
    """
    # The parsed query prints as "FIELD:term FIELD:term ..."; drop the
    # "FIELD:" prefix to recover the bare terms.
    prefix_len = len(FIELDNAME) + 1
    query_terms = [term.strip()[prefix_len:] for term in query.toString().split()]

    # original query vector Q0
    Q0_vector = {}
    for term in query_terms:
        if weightScheme == 'TFIDF':
            Q0_vector[term] = tf_idf_query(term, query_terms)
        elif weightScheme == 'BM25':
            Q0_vector[term] = BM25_query(term, query_terms)

    # normalise Q0 so its weights sum to 1 (skipped when the sum is zero)
    q0_total = sum(Q0_vector.values())
    if q0_total:
        Q0_vector = {key: value / q0_total for key, value in Q0_vector.items()}

    # Rel for Relevant, NRel for Non-relevant
    sumRelDocsVector, sumNRelDocsVector = {}, {}
    numRel, numNRel = 0, 0
    for scoreDoc in top_k_docs:
        docid = searcher.doc(scoreDoc.doc).get('ID')
        relevant = isTrueRelevant(qid, docid, relJudgeDict)
        nonRelevant = isTrueNonRelevant(qid, docid, relJudgeDict)
        if not (relevant or nonRelevant):
            # unjudged document: contributes to neither centroid, so its
            # vector does not need to be built
            continue

        docVec = getDocumentVector(scoreDoc.doc, weightScheme)
        if relevant:
            numRel += 1
            # in-place vector addition: sumRelDocsVector += docVec
            for term, weight in docVec.items():
                sumRelDocsVector[term] = sumRelDocsVector.get(term, 0) + weight
        if nonRelevant:
            numNRel += 1
            # in-place vector addition: sumNRelDocsVector += docVec
            for term, weight in docVec.items():
                sumNRelDocsVector[term] = sumNRelDocsVector.get(term, 0) + weight

    if numRel == 0:
        # diagnostic: no judged-relevant document among the feedback docs
        print(f'rel_vec_{sumRelDocsVector}')

    # centroids (both dicts are empty when the matching counter is 0, so no
    # division by zero can happen here)
    r = {term: total / numRel for term, total in sumRelDocsVector.items()}
    nr = {term: total / numNRel for term, total in sumNRelDocsVector.items()}

    # Final Rocchio formula for Qm. Terms that occur only in non-relevant
    # documents are deliberately not candidates.
    expanded_query = [
        [term, alpha * Q0_vector.get(term, 0) + beta * r.get(term, 0) - gamma * nr.get(term, 0)]
        for term in set(Q0_vector) | set(r)
    ]

    # Highest score first; ties are broken alphabetically so that the selected
    # terms do not depend on set iteration order.
    expanded_query.sort(key=lambda item: (-item[1], item[0]))
    Qm_with_scores = expanded_query[:int(N)]     # top N expanded query terms

    # Raise the clause limit once, before any clause is added.
    BooleanQuery.setMaxClauseCount(4096)
    booleanQuery = BooleanQuery.Builder()
    for term, weight in Qm_with_scores:
        if weight < 0:
            # a negative boost is not accepted by BoostQuery; drop the term
            continue
        boostedTermQuery = BoostQuery(TermQuery(Term(FIELDNAME, term)), weight)
        booleanQuery.add(boostedTermQuery, BooleanClause.Occur.SHOULD)

    return booleanQuery.build()   # modified query

### LMJM + Rocchio Retrieval

In [23]:
def lmjm_rocchio(numPRD, N, alpha, beta, gamma, weightScheme='TFIDF'):
    """Run LM-JM retrieval with Rocchio true-relevance-feedback expansion.

    For every topic in the module-level `topics`, the title is searched with
    LM Jelinek-Mercer similarity, the top `numPRD` hits are passed to
    rocchio_TRF(), and the top 1000 results of the expanded query are written
    to a TREC-format .res file.

    Args:
        numPRD (int): number of top-ranked documents used as feedback docs.
        N (int): number of terms kept in the expanded query.
        alpha, beta, gamma (float): Rocchio model parameters.
        weightScheme (str): 'TFIDF' or 'BM25' term weighting; any other value
            prints a warning and falls back to 'TFIDF'.

    Returns:
        None
    """
    import os   # local import: only needed to create the output directory

    LAMBDA = 0.4   # LM-JM baseline lambda parameter
    similarityModel = LMJelinekMercerSimilarity(LAMBDA)

    if weightScheme not in ('BM25', 'TFIDF'):
        print('Warning: weightScheme entered not a valid parameter value. Taking default weightScheme: TFIDF')
        weightScheme = 'TFIDF'

    # change result file path below
    rocchioOutputPath = f"./Rocchio_output/TRF/{weightScheme}_with_gamma/TREC6_LMJM_Rocchio_numPRD={numPRD}_N={N}_alpha={alpha}_beta={beta}_gamma={gamma}_{weightScheme}.res"
    # create the output directory if needed so open() does not fail
    os.makedirs(os.path.dirname(rocchioOutputPath), exist_ok=True)

    qidField = 'num'
    queryKeywordsField = 'title'     # other fields are 'desc' and 'narr'
    k = 1000                         # number of results written per query

    # setting up the searcher
    analyzer = EnglishAnalyzer()    # used same analyzer as indexer
    directory = FSDirectory.open(File(index_path).toPath())
    reader = DirectoryReader.open(directory)
    try:
        searcher = IndexSearcher(reader)
        searcher.setSimilarity(similarityModel)

        with open(rocchioOutputPath, 'w') as f:
            for topic in topics:
                qid = topic.find(qidField).text.strip()
                q = topic.find(queryKeywordsField).text.strip()

                # a few titles have '/' in them, which cannot be parsed
                # without escaping those special characters
                escaped_q = QueryParser(FIELDNAME, analyzer).escape(q)
                query = QueryParser(FIELDNAME, analyzer).parse(escaped_q)

                # top-ranked docs of the initial query, used as feedback docs
                scoreDocs = searcher.search(query, numPRD).scoreDocs

                # Rocchio expanded query retrieval
                modified_query = rocchio_TRF(query, qid, top_k_docs=scoreDocs, searcher=searcher, N=N,
                                             alpha=alpha, beta=beta, gamma=gamma, weightScheme=weightScheme)
                scoreDocs = searcher.search(modified_query, k).scoreDocs

                # one TREC-format line per result, written in a single call
                lines = [
                    f"{qid}\tQ0\t{searcher.doc(scoreDoc.doc).get('ID')}\t{rank}\t{scoreDoc.score}\tlmjm_{LAMBDA}-rocchio_{alpha}_{beta}_{gamma}\n"
                    for rank, scoreDoc in enumerate(scoreDocs, start=1)
                ]
                f.write(''.join(lines))
    finally:
        # release the index resources opened by this call, even on error
        reader.close()
        directory.close()

### Finding max MAP for LMJM+Rocchio-TFIDF

In [39]:
# Parameter sweep for LM-JM + Rocchio (TFIDF weighting): one result file is
# written per (beta, gamma) combination listed below.
numPRD = 35
N = 3
alpha = 1
for beta in [20]:
    for gamma in [1]:

        # BM25-weighted variant (disabled):
        # lmjm_rocchio(numPRD=numPRD,N=N,alpha=alpha,beta=beta, weightScheme='BM25')
        lmjm_rocchio(numPRD=numPRD, N=N, alpha=alpha, beta=beta,gamma=gamma, weightScheme='TFIDF')

rel_vec_{}
rel_vec_{}
rel_vec_{}
rel_vec_{}
rel_vec_{}
rel_vec_{}
rel_vec_{}
rel_vec_{}
rel_vec_{}
rel_vec_{}
rel_vec_{}
rel_vec_{}
rel_vec_{}
rel_vec_{}
rel_vec_{}
rel_vec_{}
rel_vec_{}


In [None]:
# from tqdm import tqdm
# alphas = [i/100 for i in range(25,201,25)]
# betas = [i/100 for i in range(25,201,25)]

# for numTRD in tqdm(range(10,31,5)):
#     for N in tqdm(range(50,121,10)):
#         for alpha in tqdm(alphas):
#             for beta in tqdm(betas):
#                 lmjm_rocchio(numPRD=numTRD, N=N, alpha=alpha, beta=beta, gamma=gamma, weightScheme='TFIDF')


In [1]:
# highest MAP value and corresponding params
# LMJM with Rocchio TFIDF
# [35, 115, 1.0, 20.0, 0.2834]
# max MAP = 0.2834, for numPRD = 35, N = 115, alpha = 1.0, beta = 20.0

### BM25 +  Rocchio Retrieval

In [30]:
def bm25_rocchio(numPRD, N, alpha, beta, weightScheme='TFIDF'):
    """Run BM25 retrieval with Rocchio pseudo-relevance-feedback expansion.

    For every topic in the module-level `topics`, the title is searched with
    BM25 similarity, the top `numPRD` hits are passed to rocchio_PRF(), and
    the top 1000 results of the expanded query are written to a TREC-format
    .res file.

    Args:
        numPRD (int): number of pseudo relevant docs.
        N (int): number of expansion terms.
        alpha, beta (float): Rocchio model parameters.
        weightScheme (str): 'TFIDF' or 'BM25' term weighting; any other value
            prints a warning and falls back to 'TFIDF'.

    Returns:
        None
    """
    import os   # local import: only needed to create the output directory

    k1 = 0.8
    b = 0.4
    similarityModel = BM25Similarity(k1, b)

    if weightScheme not in ('BM25', 'TFIDF'):
        print('Warning: weightScheme entered not a valid parameter value. Taking default weightScheme: TFIDF')
        weightScheme = 'TFIDF'

    # change result file path below
    rocchioOutputPath = f"./RESFILE/PRF/{weightScheme}/TREC6_BM25_Rocchio_numPRD={numPRD}_N={N}_alpha={alpha}_beta={beta}_{weightScheme}.res"
    # create the output directory if needed so open() does not fail
    os.makedirs(os.path.dirname(rocchioOutputPath), exist_ok=True)

    qidField = 'num'
    queryKeywordsField = 'title'     # other fields are 'desc' and 'narr'
    k = 1000                         # number of results written per query

    # setting up the searcher
    analyzer = EnglishAnalyzer()    # used same analyzer as indexer
    directory = FSDirectory.open(File(index_path).toPath())
    reader = DirectoryReader.open(directory)
    try:
        searcher = IndexSearcher(reader)
        searcher.setSimilarity(similarityModel)

        print('\nRetrieving ...')

        with open(rocchioOutputPath, 'w') as f:
            for topic in topics:
                qid = topic.find(qidField).text.strip()
                q = topic.find(queryKeywordsField).text.strip()

                # a few titles have '/' in them, which cannot be parsed
                # without escaping those special characters
                escaped_q = QueryParser(FIELDNAME, analyzer).escape(q)
                query = QueryParser(FIELDNAME, analyzer).parse(escaped_q)

                print(f'Rocchio {weightScheme}, numPRD = {numPRD}, N = {N}, alpha = {alpha}, beta = {beta}; qid = {qid}, retrieving & writing ...', end=' ')

                # getting the top pseudo relevant docs using the searcher
                scoreDocs = searcher.search(query, numPRD).scoreDocs

                # Rocchio expanded query retrieval
                # NOTE(review): rocchio_PRF is not defined in the visible part
                # of this file (only rocchio_TRF is) -- confirm it is provided
                # elsewhere before running, otherwise this raises NameError.
                modified_query = rocchio_PRF(query, scoreDocs, N=N, alpha=alpha, beta=beta, weightScheme=weightScheme)
                scoreDocs = searcher.search(modified_query, k).scoreDocs

                # one TREC-format line per result, written in a single call
                lines = [
                    f"{qid}\tQ0\t{searcher.doc(scoreDoc.doc).get('ID')}\t{rank}\t{scoreDoc.score}\tBM25_{k1}-{b}-rocchio_{alpha}_{beta}\n"
                    for rank, scoreDoc in enumerate(scoreDocs, start=1)
                ]
                f.write(''.join(lines))

                print('complete!')
    finally:
        # release the index resources opened by this call, even on error
        reader.close()
        directory.close()

    print('Search completed! Search results exported to a .res file in the current directory.\n')

In [None]:
# Single BM25 + Rocchio run with fixed parameters. These assignments also set
# the module-level alpha and beta for any cell executed afterwards.
numPRD = 20
N = 70
alpha = 1
beta = 30

# LM-JM variant with BM25 weighting (disabled):
# lmjm_rocchio(numPRD=numPRD,N=N,alpha=alpha,beta=beta, weightScheme='BM25')
bm25_rocchio(numPRD=numPRD, N=N, alpha=alpha, beta=beta, weightScheme='TFIDF')

In [None]:
# Grid over the number of feedback docs (d) and the number of expansion
# terms (t). alpha and beta are not set here: they take whatever values
# earlier cells left in the notebook namespace.
for d in [10,20,30,40,50]:
    for t in [10,20,30,40,50,60,70,80]:
        bm25_rocchio(numPRD=d, N=t, alpha=alpha, beta=beta, weightScheme='TFIDF')        