In [1]:
import sklearn as sk
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy as sp
import matplotlib.patches as mpatches

# Read data and preprocess

In [3]:
bm=pd.read_csv('BM25Ranking_moodl.csv')
q=pd.read_csv('QueryLikelihoodRanking_moodl.csv')
ti=pd.read_csv('TFIDFRanking_moodl.csv')

In [4]:
def change_binary(data):
    non_rel=[]
    for x in data.relevance:
        if x>0:
            non_rel.append(1)
        else: 
            non_rel.append(0)
    data['b_relevance']=non_rel
    return data
q=change_binary(q)
bm=change_binary(bm)
ti=change_binary(ti)

In [5]:
frames = [bm,q,ti]
comb = pd.concat(frames)

In [6]:
comb.relevance.value_counts()

0    84
2     6
Name: relevance, dtype: int64

In [7]:
cl_comb=comb.drop_duplicates()

In [8]:
cl_comb.relevance.value_counts()

0    44
2     2
Name: relevance, dtype: int64

there are overall 46 links, where 44 zeros, 0 ones and 2 twos. 

4 non-relevant docs, 2 relevant docs.

# calculate metrics

In [9]:
def pr_curve(retrieved_list,num_pos):
    a=retrieved_list
    #recall= retrieved relevant/retrieved relevant+not retrieved relevant
    recall=[]
    #precision = retrieved relevant/total retrieved
    precision=[]
    #num_pos=sum(t)
    for i in range(1,len(a)):
        recall.append(sum(a[:i])/num_pos)
        precision.append(sum(a[:i])/i)
    a=plt.plot(recall,precision,'-')
    #plt.xlabel('recall')
    #plt.ylabel('precision')
    #plt.show()
    return a


In [12]:
a=pr_curve(bm.b_relevance,10),
b=pr_curve(q.b_relevance,10),
c=pr_curve(ti.b_relevance,10)

In [None]:
plt.show(b)
plt.xlabel('recall')
plt.ylabel('precision')
plt.title('precision against recall plot')
green_patch = mpatches.Patch(color='green', label='tf-idf')
red_patch = mpatches.Patch(color='orange', label='QueryLikelihoodRanking')
blue_patch = mpatches.Patch(color='blue', label='bm25')
plt.legend(handles=[red_patch,green_patch,blue_patch])


# recall, precision, F1

In [51]:

def f1_pr_rc(retrieved_list,num_pos):
    a=retrieved_list
    r=sum(a)/num_pos #recall
    p=sum(a)/len(a) #precision
    F1=2*r*p/(r+p)
    return r,p,F1


In [58]:
q_rpf=f1_pr_rc(q.b_relevance,10)
bm_rpf=f1_pr_rc(bm.b_relevance,10)
ti_rpf=f1_pr_rc(ti.b_relevance,10)

In [62]:
bm_rpf,q_rpf,ti_rpf

((1.0, 0.33333333333333331, 0.5),
 (0.69999999999999996, 0.23333333333333334, 0.34999999999999998),
 (0.80000000000000004, 0.26666666666666666, 0.40000000000000002))

# Average Precision( AP)

In [64]:
def average_precision(retrieved_list):
    a=retrieved_list
    #a =[1,0,1,1,0,0,1,0,0,1]
    p2=[]
    for i,ai in enumerate(a):
        if ai == 1:
            p2.append(sum(a[:i+1])/(i+1))
    ap=sum(p2)/len(p2)
    return ap

In [70]:
q_ap=average_precision(q.b_relevance)
bm_ap=average_precision(bm.b_relevance)
ti_ap=average_precision(ti.b_relevance)

In [71]:
q_ap,
bm_ap,
ti_ap

1.0

# precision at rank k(1,5,10)

In [72]:
def rank_precision(rank_k,retrieved_list):
    precision=sum(retrieved_list[:rank_k])/rank_k
    return precision

In [75]:
q_k=[rank_precision(1,q.b_relevance),rank_precision(5,q.b_relevance),rank_precision(10,q.b_relevance)]
bm_k=[rank_precision(1,bm.b_relevance),rank_precision(5,bm.b_relevance),rank_precision(10,bm.b_relevance)]
ti_k=[rank_precision(1,ti.b_relevance),rank_precision(5,ti.b_relevance),rank_precision(10,ti.b_relevance)]

In [77]:
q_k,bm_k,ti_k

([1.0, 1.0, 0.69999999999999996],
 [1.0, 1.0, 1.0],
 [1.0, 1.0, 0.80000000000000004])

# R rank precision(1,3,5) and reciprocal rank (r=1)

In [82]:
def r_rank(num_rd,retrieved_list):
    a_trans=np.cumsum(retrieved_list)
    a_l = list(a_trans)
    r_precision=num_rd/(a_l.index(num_rd)+1)
    return r_precision

In [83]:
q_r=[r_rank(1,q.b_relevance),r_rank(3,q.b_relevance),r_rank(5,q.b_relevance)]
bm_r=[r_rank(1,bm.b_relevance),r_rank(3,bm.b_relevance),r_rank(5,bm.b_relevance)]
ti_r=[r_rank(1,ti.b_relevance),r_rank(3,ti.b_relevance),r_rank(5,ti.b_relevance)]

In [84]:
q_r,bm_r,ti_r

([1.0, 1.0, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0, 1.0])

# Search length

all ones

# DCG

In [145]:
def DCG(k,retrieved_list):
    return retrieved_list[0] + np.sum(retrieved_list[1:k] / np.log2(np.arange(2, k + 1)))


ValueError: operands could not be broadcast together with shapes (0,) (2,) 

In [148]:
DCG(30,bm.relevance),DCG(30, q.relevance),DCG(30,ti.relevance)

(7.2544945117704573, 6.3046663059874142, 6.6379996393207472)

# Significant level test 