In [2]:
import json
from ranx import Qrels, Run, evaluate, compare
import gzip

In [3]:
def read_json(filename):    
    
    with gzip.open(filename, "rt") as f:
        obj = json.load(f)
    
    return obj

In [4]:
label = 'EXP7'
struc_dir = f'./Dataset/SO/{label}/structures/'

In [5]:
qrel = Qrels(read_json(struc_dir + 'qrels_dict'))

In [None]:
baselines = ['BC', 'BM25', 'TUEFNB', 'TUEFLIN', 'TUEFSL', 'TUEFCB', 'TUEFNORW']
runs = []
for b in baselines:
    path = f'./Dataset/SO/EXP7/Baselines/{b}/structures/'
    runs.append(Run(read_json(path + 'run_dict')))
runs.append(Run(read_json(struc_dir + 'run_dict')))

In [7]:
# Compare different runs and perform Two-sided Paired Student's t-Test
report = compare(
    qrels=qrel,
    runs=runs,
    metrics=["precision@1", "ndcg@3", "recall@100", "mrr"],
    max_p=0.05  # P-value threshold,
)

In [8]:
report

#    Model    P@1          NDCG@3        Recall@100    MRR
---  -------  -----------  ------------  ------------  ------------
a    run_1    0.020        0.033         0.076         0.033
b    run_2    0.234ᵃᶜ      0.356ᵃᶜ       0.808ᵃᶜᵍ      0.369ᵃᶜ
c    run_3    0.066ᵃ       0.087ᵃ        0.213ᵃ        0.093ᵃ
d    run_4    0.264ᵃᵇᶜ     0.360ᵃᶜ       0.874ᵃᵇᶜᵉᶠᵍ   0.383ᵃᵇᶜ
e    run_5    0.436ᵃᵇᶜᵈ    0.560ᵃᵇᶜᵈ     0.826ᵃᵇᶜᵍ     0.559ᵃᵇᶜᵈ
f    run_6    0.447ᵃᵇᶜᵈᵉ   0.573ᵃᵇᶜᵈᵉᵍ   0.849ᵃᵇᶜᵉᵍ    0.572ᵃᵇᶜᵈᵉᵍ
g    run_7    0.443ᵃᵇᶜᵈ    0.561ᵃᵇᶜᵈ     0.754ᵃᶜ       0.552ᵃᵇᶜᵈ
h    run_8    0.453ᵃᵇᶜᵈᵉᵍ  0.578ᵃᵇᶜᵈᵉᶠᵍ  0.874ᵃᵇᶜᵉᶠᵍ   0.579ᵃᵇᶜᵈᵉᶠᵍ