## Correlation between Findability and Retrievability

##### Imports

In [1]:
# imports
import csv
import pickle

import matplotlib.pyplot as plt
import numpy as np
import rbo
import scipy
# Explicit submodule import: the cells below call scipy.stats.*, and a bare
# `import scipy` does not guarantee the submodule is loaded on every version.
import scipy.stats

##### Load data

In [3]:

def _unpickle(path):
    """Return the object stored in the pickle file at ``path``.

    NOTE: unpickling can execute arbitrary code, so only point this at
    files that were produced by trusted experiment runs.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# For each corpus: the findability pickle (fd_*) followed by the
# retrievability pickle (allrd_*), read in the same order as before.
fd_robust04 = _unpickle('./Findability-experiments/Robust04/fd_bm25_1.20_0.75.pickle')
allrd_robust04 = _unpickle('./Retrievability-experiments/allrd_robust04.pickle')

fd_wt10g = _unpickle('./Findability-experiments/WT10g/fd_bm25_1.20_0.75.pickle')
allrd_wt10g = _unpickle('./Retrievability-experiments/allrd_WT10g.pickle')

fd_msmarco = _unpickle('./Findability-experiments/MSMARCO/fd_bm25_1.20_0.75.pickle')
allrd_msmarco = _unpickle('./Retrievability-experiments/allrd_MSMARCO.pickle')

#### Pearson's Correlation Coefficient

In [4]:

print("\nPearson's Correlation Coefficient calculation\n")

# One (corpus label, findability scores, retrievability results) entry per
# collection, so the computation below is written once instead of being
# copy-pasted for every corpus.
corpora = (
    ('Robust04', fd_robust04, allrd_robust04),
    ('WT10g', fd_wt10g, allrd_wt10g),
    ('MSMARCO', fd_msmarco, allrd_msmarco),
)

for corpus, fd, allrd in corpora:
    # presumably r(d) under BM25 at cutoff c = 100 (matches the printed
    # label) -- TODO confirm against the script that wrote the pickle
    rd = dict(allrd['rd_bm25_100'])

    # Only documents that have both an f(d) and an r(d) value can be paired.
    common_lucenepageids = set(fd) & set(rd)
    if len(common_lucenepageids) < 2:
        # Fail with a readable message instead of the opaque unpacking /
        # SciPy error an empty or single-document overlap would trigger.
        raise ValueError(
            f'{corpus}: need at least 2 documents with both f(d) and r(d), '
            f'found {len(common_lucenepageids)}'
        )

    # Sort the (f(d), r(d)) pairs in descending order. The correlation itself
    # does not depend on the order of the pairs; sorting only fixes the
    # sequence handed to SciPy so the floating-point result is the same on
    # every run. Because the pair sort fully determines that sequence, the
    # ids do not need to be sorted beforehand.
    fd_list, rd_list = zip(*sorted(
        ((fd[pageid], rd[pageid]) for pageid in common_lucenepageids),
        reverse=True,
    ))

    # Pearson's correlation computation
    rho, pval = scipy.stats.pearsonr(fd_list, rd_list)
    print(f'For {corpus}, Findability f(d) vs Retrievability r(d) for c = 100:\t rho = {rho:.4f}\t\tp-value = {pval}')


Pearson's Correlation Coefficient calculation

For Robust04, Findability f(d) vs Retrievability r(d) for c = 100:	 rho = -0.0944		p-value = 0.0
For WT10g, Findability f(d) vs Retrievability r(d) for c = 100:	 rho = -0.0088		p-value = 8.369595764235922e-30
For MSMARCO, Findability f(d) vs Retrievability r(d) for c = 100:	 rho = 0.0115		p-value = 5.766135566854784e-250


#### Kendall Rank Correlation Coefficient

In [5]:
print("\nKendall's Correlation Coefficient calculation\n")

# One (corpus label, findability scores, retrievability results) entry per
# collection, so the computation below is written once instead of being
# copy-pasted for every corpus.
corpora = (
    ('Robust04', fd_robust04, allrd_robust04),
    ('WT10g', fd_wt10g, allrd_wt10g),
    ('MSMARCO', fd_msmarco, allrd_msmarco),
)

for corpus, fd, allrd in corpora:
    # presumably r(d) under BM25 at cutoff c = 100 (matches the printed
    # label) -- TODO confirm against the script that wrote the pickle
    rd = dict(allrd['rd_bm25_100'])

    # Only documents that have both an f(d) and an r(d) value can be paired.
    common_lucenepageids = set(fd) & set(rd)
    if len(common_lucenepageids) < 2:
        # Fail with a readable message instead of the opaque unpacking error
        # an empty overlap would trigger in the zip(*...) below.
        raise ValueError(
            f'{corpus}: need at least 2 documents with both f(d) and r(d), '
            f'found {len(common_lucenepageids)}'
        )

    # Sort the (f(d), r(d)) pairs in descending order. Kendall's tau does not
    # depend on the order of the pairs; sorting only gives SciPy the same
    # input sequence on every run. Because the pair sort fully determines
    # that sequence, the ids do not need to be sorted beforehand.
    fd_list, rd_list = zip(*sorted(
        ((fd[pageid], rd[pageid]) for pageid in common_lucenepageids),
        reverse=True,
    ))

    # Kendall's correlation computation
    corr, pval = scipy.stats.kendalltau(fd_list, rd_list)
    print(f'For {corpus}, Findability f(d) vs Retrievability r(d) for c = 100:\t tau = {corr:.4f}\t\tp-value = {pval}')


Kendall's Correlation Coefficient calculation

For Robust04, Findability f(d) vs Retrievability r(d) for c = 100:	 tau = -0.0518		p-value = 0.0


  (2 * xtie * ytie) / m + x0 * y0 / (9 * m * (size - 2)))


For WT10g, Findability f(d) vs Retrievability r(d) for c = 100:	 tau = 0.0084		p-value = 2.1520587221635985e-57
For MSMARCO, Findability f(d) vs Retrievability r(d) for c = 100:	 tau = 0.0307		p-value = 0.0


##### Retrievability on Findability's Known-item queries

In [7]:
def _unpickle(path):
    """Return the object stored in the pickle file at ``path``.

    NOTE: unpickling can execute arbitrary code, so only point this at
    files that were produced by trusted experiment runs.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# NOTE: this cell rebinds the allrd_* names to the pickles under
# Known-item-queries-rd/, so any correlation cell executed after it reads
# these files rather than the ones under ./Retrievability-experiments/ directly.
# The fd_* pickles are read again from the same findability paths.
fd_robust04 = _unpickle('./Findability-experiments/Robust04/fd_bm25_1.20_0.75.pickle')
allrd_robust04 = _unpickle('./Retrievability-experiments/Known-item-queries-rd/allrd_Robust04.pickle')

fd_wt10g = _unpickle('./Findability-experiments/WT10g/fd_bm25_1.20_0.75.pickle')
allrd_wt10g = _unpickle('./Retrievability-experiments/Known-item-queries-rd/allrd_WT10g.pickle')

fd_msmarco = _unpickle('./Findability-experiments/MSMARCO/fd_bm25_1.20_0.75.pickle')
allrd_msmarco = _unpickle('./Retrievability-experiments/Known-item-queries-rd/allrd_MSMARCO.pickle')

In [8]:
# One (corpus label, findability scores, retrievability results) entry per
# collection, so each statistic is written once instead of being copy-pasted
# for every corpus.
corpora = (
    ('Robust04', fd_robust04, allrd_robust04),
    ('WT10g', fd_wt10g, allrd_wt10g),
    ('MSMARCO', fd_msmarco, allrd_msmarco),
)

# corpus label -> (f(d) values, r(d) values), aligned element by element.
# Filled during the Pearson pass and reused for Kendall so the pairing and
# sorting is done once per corpus rather than once per statistic.
paired_scores = {}

print("\nPearson's Correlation Coefficient calculation\n")

for corpus, fd, allrd in corpora:
    # presumably r(d) under BM25 at cutoff c = 100 (matches the printed
    # label) -- TODO confirm against the script that wrote the pickle
    rd = dict(allrd['rd_bm25_100'])

    # Only documents that have both an f(d) and an r(d) value can be paired.
    common_lucenepageids = set(fd) & set(rd)
    if len(common_lucenepageids) < 2:
        # Fail with a readable message instead of the opaque unpacking /
        # SciPy error an empty or single-document overlap would trigger.
        raise ValueError(
            f'{corpus}: need at least 2 documents with both f(d) and r(d), '
            f'found {len(common_lucenepageids)}'
        )

    # Sort the (f(d), r(d)) pairs in descending order. Neither coefficient
    # depends on the order of the pairs; sorting only gives SciPy the same
    # input sequence on every run. Because the pair sort fully determines
    # that sequence, the ids do not need to be sorted beforehand.
    fd_list, rd_list = zip(*sorted(
        ((fd[pageid], rd[pageid]) for pageid in common_lucenepageids),
        reverse=True,
    ))
    paired_scores[corpus] = (fd_list, rd_list)

    # Pearson's correlation computation
    rho, pval = scipy.stats.pearsonr(fd_list, rd_list)
    print(f'For {corpus}, Findability f(d) vs Retrievability r(d) for c = 100:\t rho = {rho:.4f}\t\tp-value = {pval}')

############################################
############################################

print("\nKendall's Correlation Coefficient calculation\n")

for corpus, (fd_list, rd_list) in paired_scores.items():
    # Kendall's correlation computation
    corr, pval = scipy.stats.kendalltau(fd_list, rd_list)
    print(f'For {corpus}, Findability f(d) vs Retrievability r(d) for c = 100:\t tau = {corr:.4f}\t\tp-value = {pval}')


Pearson's Correlation Coefficient calculation

For Robust04, Findability f(d) vs Retrievability r(d) for c = 100:	 rho = -0.1292		p-value = 0.0
For WT10g, Findability f(d) vs Retrievability r(d) for c = 100:	 rho = -0.0256		p-value = 2.1440237772913133e-238
For MSMARCO, Findability f(d) vs Retrievability r(d) for c = 100:	 rho = 0.0388		p-value = 0.0

Kendall's Correlation Coefficient calculation

For Robust04, Findability f(d) vs Retrievability r(d) for c = 100:	 tau = -0.1053		p-value = 0.0


  (2 * xtie * ytie) / m + x0 * y0 / (9 * m * (size - 2)))


For WT10g, Findability f(d) vs Retrievability r(d) for c = 100:	 tau = -0.0287		p-value = 0.0
For MSMARCO, Findability f(d) vs Retrievability r(d) for c = 100:	 tau = 0.0269		p-value = 0.0
