# Evaluating different Information Retrieval Ranking Models

For this assignment, [Pyndri](https://github.com/cvangysel/pyndri) [[1](https://arxiv.org/abs/1701.00749)], a Python interface for [Indri](https://www.lemurproject.org/indri.php), was used.

In [None]:
import copy
import re
import sys
from collections import Counter
from math import log

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyndri

In [None]:
# Open the index stored in the 'index/' directory (path relative to the working directory)
index = pyndri.Index('index/')
# Three vocabulary lookups; judging by their names and later use:
# token -> term id, term id -> token, term id -> document frequency
token2id, id2token, id2df = index.get_dictionary()
# Collection frequency (used per term id as id2cf[term_id] further down)
id2cf = index.get_term_frequencies()

In [None]:
# Reading in the query file
# Every kept line becomes a tuple
#   (text before the first ';', second ','-separated field without '\n').
# NOTE(review): the two fields are split on different delimiters. The query
# number is later recovered with query[0].split(',')[0], so the lines
# presumably look like "<number>,<title>" -- confirm against the data file.
with open('/home/student/assignment/ap_88_89/topics_titles_validation.csv', 'r') as file:
    querylist = file.readlines()
# Blank lines (for example a trailing empty line at the end of the file) have
# no second field and used to raise IndexError, so they are skipped.
queries = [(i.split(';')[0], i.split(',')[1].strip('\n')) for i in querylist if i.strip()]

In [None]:
# Defining functions

# Function to convert the query to a list of word_ids
def query_preprocess(query):
    """Turn a raw query string into a list of in-vocabulary term ids.

    Digits and the characters ``! @ # / $ ; ( ) "`` are replaced by spaces,
    the text is lower-cased and split on whitespace, and every token is
    looked up in the module-level ``token2id`` mapping.  Tokens that are
    missing from the mapping, or whose id is not positive, are left out.
    """
    cleaned = re.sub('[!@#/$1234567890;()"]',' ',query)
    term_ids = []
    for token in cleaned.lower().split():
        term_id = token2id.get(token, 0)
        if term_id > 0:
            term_ids.append(term_id)
    return term_ids



# Function to calculate the tf 
def tf(document_id,query_term):
    """Count how often ``query_term`` occurs in document ``document_id``.

    The count is taken over the second element of
    ``index.document(document_id)``.
    """
    return index.document(document_id)[1].count(query_term)


# Function to calculate the idf of a query term
def idf(query_term):
    """Return the natural logarithm of N / df for ``query_term``.

    N is ``index.maximum_document() - index.document_base()`` and df is the
    ``id2df`` entry of the term, converted with ``int``.
    """
    document_frequency = int(id2df[query_term])
    collection_size = index.maximum_document() - index.document_base()
    return log(collection_size / document_frequency)



# Function that combines the query term scores into a score for the whole query
def query_dict(query_term_dict):
    """Combine per-term document scores into a ranking for every query.

    Args:
        query_term_dict: mapping ``{term_id: {document_id: score}}``.

    Returns:
        ``{query_number: [(document_id, summed_score), ...]}`` holding, for
        each entry of the module-level ``queries`` list, at most the 1000
        highest-scoring documents in descending score order.  A query
        without any in-vocabulary term gets an empty list.

    Raises:
        KeyError: if a query term id has no entry in ``query_term_dict``.
    """
    rankings = {}

    for raw_id, title in queries:
        query_nr = raw_id.split(',')[0]
        # Start from an empty Counter: seeding it with {0: 0} made a query
        # without any known term return the bogus ranking [(0, 0)].
        totals = Counter()
        for query_term in query_preprocess(title):
            # Counter addition sums the scores per document and keeps only
            # documents whose running total is strictly positive.
            totals = totals + Counter(query_term_dict[query_term])
        rankings[query_nr] = totals.most_common(1000)

    return rankings

# Function that combines query term scores where a LOWER total is better
def query_dictlogs(query_term_dict):
    """Combine per-term document scores and keep the lowest totals per query.

    Args:
        query_term_dict: mapping ``{term_id: {document_id: score}}``.

    Returns:
        ``{query_number: [(document_id, summed_score), ...]}`` holding, for
        each entry of the module-level ``queries`` list, at most the 1000
        lowest-scoring documents in ascending score order.

    Raises:
        KeyError: if a query term id has no entry in ``query_term_dict``.

    NOTE(review): Counter addition discards totals that are not strictly
    positive, so this only works for positive scores where smaller is better
    (presumably absolute values of log-probabilities) -- confirm the intended
    score type before using it with genuinely negative scores.
    """
    rankings = {}

    for raw_id, title in queries:
        query_nr = raw_id.split(',')[0]
        # An empty start value avoids the bogus (0, 0) entry that a {0: 0}
        # seed produced for queries without any in-vocabulary term.
        totals = Counter()
        for query_term in query_preprocess(title):
            # Fixed: add the scores of this term only.  The whole nested
            # dictionary used to be passed to Counter, which cannot be summed.
            totals = totals + Counter(query_term_dict[query_term])
        # most_common() sorts in descending order; the reversed slice walks
        # that list from its end and so yields the 1000 smallest totals.
        rankings[query_nr] = totals.most_common()[:-1000-1:-1]

    return rankings

# Function to create a data frame from the query dictionary.  
def report(query2method, method = str()):
    """Build a TREC-run style data frame from a ranking dictionary.

    Args:
        query2method: ``{query_number: [(document_id, score), ...]}``.
        method: label written to the ``method`` column of every row.

    Returns:
        A ``pandas.DataFrame`` with one row per (query, document) pair and
        the columns ``querynr``, ``dont know2`` (constant ``'Q0'``),
        ``docnames``, ``rank``, ``scores`` and ``method``.
    """
    q_nums = []
    doc_nums = []
    score_x = []

    for query_nr, ranking in query2method.items():
        for doc_nr, score in ranking:
            q_nums.append(query_nr)
            doc_nums.append(doc_nr)
            score_x.append(score)

    results = pd.DataFrame({
        'querynr': q_nums,
        'doc numbers': doc_nums,
        'scores': score_x,
    })

    # Creating the column with document names (first element of index.document)
    results['docnames'] = [index.document(doc)[0] for doc in doc_nums]

    # Adding missing columns
    results['method'] = method
    results['dont know2'] = 'Q0'
    # method='first' gives every document of a query a distinct rank even when
    # scores tie (the default averages tied ranks into fractions), and the
    # cast turns 1.0, 2.0, ... into the integers 1, 2, ...
    results['rank'] = (
        results.groupby('querynr')['scores']
        .rank(method='first', ascending=False)
        .astype(int)
    )

    return results[['querynr', 'dont know2', 'docnames', 'rank', 'scores', 'method']]

In [None]:
# Collecting every distinct term id that occurs in any of the queries

unique_word_ids = set()
for q in queries:
    unique_word_ids.update(query_preprocess(q[1]))

all_word_ids = list(unique_word_ids)

In [None]:
# Making a dictionary with term frequencies {query_term_id:{document_id:tf}}
# Only documents that actually contain the term get an entry.
query_term2tf = {query_term_id: {} for query_term_id in all_word_ids}
for i, document_id in enumerate(range(index.document_base(),index.maximum_document())):
    # Progress indicator: one line per 10000 documents.
    if i % 10000 == 0:
        print(i)

    # Count every term id of the document once, then pick out the query terms.
    cnt = Counter(index.document(document_id)[1])

    for word_id in all_word_ids:
        if word_id not in cnt:
            continue
        # The count is stored directly; it used to be assigned to a
        # module-level variable named `tf`, which replaced the tf() function.
        query_term2tf[word_id][document_id] = cnt[word_id]

In [None]:
# One idf value per query term, in the order of all_word_ids
query_term_idf_scores = [idf(query_term) for query_term in all_word_ids]

# Pairing every term with its value: {query_term:idf_score}
query_term2idf = dict(zip(all_word_ids, query_term_idf_scores))

# TF-IDF 

In [None]:
# Making a dictionary {query_term:{doc_nr:tf_idf score}}
#   score = log2(1 + tf) * log2(document_count / df)
# (the first test used the plain product tf * query_term2idf[query_term])

document_count = index.document_count()
query_term2tfidf = {}
for query_term, doc2tf in query_term2tf.items():
    scores = query_term2tfidf[query_term] = {}
    if not doc2tf:
        # No document contains the term: nothing to score.
        continue
    # The idf factor depends on the term only, so it is computed once per
    # term instead of once per (term, document) pair.
    term_idf = np.log2(document_count / id2df.get(query_term))
    for key, term_count in doc2tf.items():
        scores[key] = np.log2(1 + term_count) * term_idf

In [None]:
# Creating a dictionary {query_number: [(document, summed tf-idf score), ...]}
query2tfidf = query_dict(query_term2tfidf)

In [None]:
# Making a report labelled "tfidf" and printing its first rows as a quick check
tfidfresults = report(query2tfidf,"tfidf")
print (tfidfresults.head())

In [None]:
# Creating a report file for trec_eval: tab-separated, without index column or header row
# NOTE(review): `tfidf` receives the return value of to_csv, which is presumably None
# when a file path is given -- confirm
tfidf = tfidfresults.to_csv("tfidf2", sep = '\t', index = False, header = False)

# BM25

In [None]:
# Calculating the average length of a document
N = (index.maximum_document() - index.document_base())

# Fixed: the length of every document i is added up.  The loop used to look
# up a stale `key` variable on each pass, so one document's length was counted
# N times; it also stored the total in a variable that shadowed the builtin sum.
total_length = sum(
    index.document_length(i)
    for i in range(index.document_base(),index.maximum_document())
)

lenave = float(total_length)/float(N)
print (lenave)

In [None]:
# Making a dictionary {query_term:{doc_nr: bm25 score}}
#   score = idf * tf * (k1 + 1) / (tf + k1 * ((1 - b) + b * doc_length / lenave))

k1 = 1.5
b = 0.75

query_term2bm25 = {}
for query_term, doc2tf in query_term2tf.items():
    scores = query_term2bm25[query_term] = {}
    if not doc2tf:
        # No document contains the term: nothing to score.
        continue
    # The idf depends on the term only; compute it once per term.
    term_idf = idf(query_term)
    for key, term_count in doc2tf.items():
        # Fixed: only the document length is divided by the average length.
        # A misplaced parenthesis used to divide the whole
        # (1 - b) + b * doc_length sum by lenave.
        length_norm = (1 - b) + b * index.document_length(key) / lenave
        scores[key] = term_idf * term_count * (k1 + 1) / (term_count + k1 * length_norm)


In [None]:
# Creating a dictionary {query_number: [(document, summed BM25 score), ...]}
query2bm25 = query_dict(query_term2bm25)

In [None]:
# Creating a report data frame labelled "bm25"
bm25results = report(query2bm25,"bm25")

In [None]:
# Creating a report file for trec_eval: tab-separated, without index column or header row
# NOTE(review): `bm25` receives the return value of to_csv, which is presumably None
# when a file path is given -- confirm
bm25 = bm25results.to_csv("bm25ver2", sep = '\t', index = False, header = False)

# Comparing Tf-idf with BM25

In [None]:
# Grouped bar chart comparing the per-query results of Tf-idf and BM25.
TF_MAP = (0.3132, 0.5399, 0.0192, 0.0037, 0.7578,
          0.0834, 0.0274, 0.1631, 0.0012, 0.0895,
          0.0021, 0.0018, 0.2607, 0.2303, 0.2875,
          0.0048, 0.0190, 0.2998, 0.0001, 0.0326,
          0.1025, 0.0010, 0.0013, 0.0583, 0.0404,
          1.0000, 0.5681, 0.2795, 0.0192, 0.4809)

BM25 = (0.2634, 0.5121, 0.1233, 0.0011, 0.8352,
        0.3336, 0.0483, 0.4170, 0.0119, 0.3899,
        0.0015, 0.0006, 0.1729, 0.6019, 0.2168,
        0.0513, 0.0670, 0.1963, 0.0000, 0.0169,
        0.2772, 0.0004, 0.0057, 0.2050, 0.1533,
        0.5889, 0.6141, 0.208, 0.2048, 0.3746)

# One tick label per group, in the same order as the two score tuples
query_labels = ("53", "57", "69", "74", "78",
                "86", "89", "90", "92", "93",
                "94", "95", "103", "111", "114",
                "120", "123", "135", "143", "144",
                "151", "155", "158", "165", "167",
                "170", "173", "180", "182", "192")

# Derived from the data so the group count cannot drift from the tuples
N = len(TF_MAP)
ind = np.arange(N)  # the x locations for the groups

# Two bars of width 0.4 per group leave a gap before the next group
# (with 0.5 neighbouring groups touched each other).
width = 0.4

# The font is configured before the figure is created so that it is already
# in effect when the figure's texts are built.
font = {'family' : 'monospace',
        'weight' : 'normal',
        'size'   : 16}

matplotlib.rc('font', **font)

fig, ax = plt.subplots()
rects1 = ax.bar(ind, TF_MAP, width, color="#2b8cbe")
rects2 = ax.bar(ind + width, BM25, width, color="#e34a33")

# add some text for labels, title and axes ticks
ax.set_ylabel('Scores')
# The chart shows both models, so the title names both.
ax.set_title('Tf-idf vs BM25 score per query')
# Each tick sits midway between the two bars of its group.
ax.set_xticks(ind + width / 2)
ax.set_xticklabels(query_labels)

ax.legend((rects1[0], rects2[0]), ('Tfidf', 'BM25'))

fig.set_size_inches(18.5, 10.5, forward=True)

plt.show()

# Jelinek-Mercer Language Model

In [None]:
# Jelinek-Mercer language model
#   score = lamda * tf / doc_length + (1 - lamda) * cf / total_terms
lamda = 0.5 # Was varied 0.05, 0.1, 0.2, 0.5, 0.9

# Making a dictionary {query_term:{doc_nr:jelinek score}}
# (a variant that took np.abs(np.log(score)) of every score was tried and is disabled)
total_terms = index.total_terms()
query_term2jelinek = {}

for query_term, doc2tf in query_term2tf.items():
    scores = query_term2jelinek[query_term] = {}
    if not doc2tf:
        # No document contains the term: nothing to score.
        continue
    # The collection part depends on the term only; compute it once per term
    # instead of once per (term, document) pair.
    background = (1-lamda)*(id2cf[query_term]/total_terms)
    for doc_nr, term_count in doc2tf.items():
        scores[doc_nr] = lamda * term_count / index.document_length(doc_nr) + background

In [None]:
# Creating a dictionary {query_number: [(document, summed Jelinek-Mercer score), ...]}
# The former `query_dict = {}` statement was removed: it rebound the name of
# the query_dict() function to a dict, so every later query_dict(...) call
# failed with "'dict' object is not callable".
query2jelinek = query_dict(query_term2jelinek)

In [None]:
# Creating a report data frame labelled "jelinek" and printing its first rows as a quick check
jelinekresults = report(query2jelinek, "jelinek")
print (jelinekresults.head())

In [None]:
# Creating a report file for trec_eval: tab-separated, without index column or header row
# NOTE(review): `jelinek` receives the return value of to_csv, which is presumably None
# when a file path is given -- confirm
jelinek = jelinekresults.to_csv("jelinek05", sep = '\t', index = False, header = False)

# Dirichlet Prior Language Model

In [None]:
# Dirichlet Prior language model
#   score = (tf + mu * cf / total_terms) / (doc_length + mu)
# mu = 0 switched the prior off entirely (the score collapsed to tf / doc_length).
# 1500 matches the "dirichlet1500" run file written further down.
mu = 1500 # Was varied 500, 1000, 1500, 2000

# Making a dictionary {query_term:{doc_nr:dirichlet score}}
total_terms = index.total_terms()
query_term2dirichlet = {}

for query_term, doc2tf in query_term2tf.items():
    scores = query_term2dirichlet[query_term] = {}
    if not doc2tf:
        # No document contains the term: nothing to score.
        continue
    # The collection probability depends on the term only; compute it once.
    collection_prob = id2cf[query_term] / total_terms
    for key, term_count in doc2tf.items():
        # Algebraically the same as
        #   |d|/(|d|+mu) * tf/|d| + mu/(mu+|d|) * p(w|C)
        # with a single document_length lookup per pair.
        doc_length = index.document_length(key)
        scores[key] = (term_count + mu * collection_prob) / (doc_length + mu)


In [None]:
# Creating a dictionary {query_number: [(document, summed Dirichlet score), ...]}
query2dirichlet = query_dict(query_term2dirichlet)

In [None]:
# Creating a report data frame labelled "dirichlet"
dirichletresults = report(query2dirichlet,"dirichlet")

In [None]:
# Creating a report file for trec_eval: tab-separated, without index column or header row
# NOTE(review): `dirichlet` receives the return value of to_csv, which is presumably None
# when a file path is given -- confirm
dirichlet = dirichletresults.to_csv("dirichlet1500", sep = '\t', index = False, header = False)

# Absolute Discounting

In [None]:
# Absolute discounting
#   score = max(tf - beta, 0) / doc_length
#           + beta * unique_terms / doc_length * cf / total_terms
beta = 0.2

# Making a dictionary {query_term:{doc_nr:absdisc score}}
total_terms = index.total_terms()
# Number of distinct term ids per document, cached because the same document
# shows up for several query terms and reading its token list is costly.
unique_term_counts = {}
query_term2absdisc = {}

# Iterates the term-frequency dictionary itself; the loop used to walk the
# Dirichlet dictionary and so only worked after that cell had been run.
for query_term, doc2tf in query_term2tf.items():
    scores = query_term2absdisc[query_term] = {}
    if not doc2tf:
        # No document contains the term: nothing to score.
        continue
    collection_prob = id2cf[query_term] / total_terms
    for key, term_count in doc2tf.items():
        doc_length = index.document_length(key)
        if key not in unique_term_counts:
            unique_term_counts[key] = len(set(index.document(key)[1]))
        discounted = max(term_count - beta, 0) / doc_length
        backoff_weight = beta * unique_term_counts[key] / doc_length
        # Fixed: the collection probability weights the back-off term only.
        # It used to multiply the discounted term as well.
        scores[key] = discounted + backoff_weight * collection_prob

In [None]:
# Creating a dictionary {query_number: [(document, summed absolute-discounting score), ...]}
query2absdisc = query_dict(query_term2absdisc)

In [None]:
# Creating a report data frame labelled "absdisc"
# (the label was missing, which left the method column of the run file empty)
absdiscresults = report(query2absdisc, "absdisc")

In [None]:
# Creating a report file for trec_eval: tab-separated, without index column or header row
# NOTE(review): `absdisc` receives the return value of to_csv, which is presumably None
# when a file path is given -- confirm

absdisc = absdiscresults.to_csv("absdisc05", sep = '\t', index = False, header = False)

# Language model plots showing NDCG@10 with varying values of the parameters.

In [None]:
#Jelinek
NDCG_scores = [0.1903, 0.1903, 0.1925, 0.1903, 0.1944]
# Kept under its own name so the scalar `lamda` of the model cell is not
# replaced by a list.
lambda_values = [0.9, 0.5, 0.1, 0.2, 0.05]

# A fresh figure: labels and points go onto the same axes instead of the
# labels landing on the axes of an earlier chart.
fig, ax = plt.subplots()
ax.scatter(lambda_values, NDCG_scores)
ax.set_xlabel('lambda')
ax.set_ylabel('NDCG@10')
ax.set_title('Jelinek: NDCG@10 for different lambda values')
plt.show()

In [None]:
#Dirichlet
NDCG_scores = [0.2487, 0.2448, 0.2638, 0.2471]
Mu_values = [1000, 2000, 500, 1500]

# A fresh figure: labels and points go onto the same axes instead of the
# labels landing on the axes of an earlier chart.
fig, ax = plt.subplots()
ax.scatter(Mu_values, NDCG_scores)
ax.set_xlabel('mu')
ax.set_ylabel('NDCG@10')
ax.set_title('Dirichlet: NDCG@10 for different mu-values')
plt.show()

In [None]:
# Absolute discounting

NDCG_scores = [0.0656, 0.0609, 0.06680]
Beta_values = [0.5, 0.9, 0.1]

# A fresh figure: labels and points go onto the same axes instead of the
# labels landing on the axes of an earlier chart.
fig, ax = plt.subplots()
ax.scatter(Beta_values, NDCG_scores)
ax.set_xlabel('beta')
ax.set_ylabel('NDCG@10')
# The varied parameter is beta; the title used to say "mu-values".
ax.set_title('Absolute discounting: NDCG@10 for different beta values')
plt.show()