In [1]:
from randomized_tukey_hsd import *

In [2]:
def get_system_metrics(fname, metric, n_topics=50):
    """Read one per-topic metric column from a system's result file.

    The first line of the file is skipped as a header. Each of the next
    ``n_topics`` lines is split on commas and the column selected by
    ``metric`` is parsed as a float. Any further lines are ignored.

    Args:
        fname: Path of the comma-separated result file.
        metric: ``'NDCG20'`` (column index 2) or ``'ERR20'`` (column index 3).
        n_topics: Number of data rows to read and to require. Defaults to 50,
            which is the row count the function originally hard-coded.

    Returns:
        A list of ``n_topics`` floats, in file order.

    Raises:
        ValueError: If ``metric`` is not one of the supported names.
        AssertionError: If the file holds fewer than ``n_topics`` data rows.
    """
    metric_columns = {'NDCG20': 2, 'ERR20': 3}
    if metric not in metric_columns:
        raise ValueError('Only `NDCG20` or `ERR20` can be used.')
    column = metric_columns[metric]

    res = []
    with open(fname) as f:
        next(f, None)  # skip the header line (no-op on an empty file)
        for line in f:
            # Stop once enough rows are collected; trailing lines are ignored.
            if len(res) == n_topics:
                break
            res.append(float(line.split(',')[column].strip()))

    # Raised explicitly (rather than via `assert`) so the check still runs
    # under `python -O`; the exception type is unchanged for callers.
    if len(res) != n_topics:
        raise AssertionError(
            'Expected {} data rows in {!r}, found {}.'.format(n_topics, fname, len(res))
        )
    return res

In [3]:
def create_system_matrix(*arr):
    """Transpose per-system score lists into a topic-by-run matrix.

    Each positional argument is one system's sequence of per-topic scores.
    Row ``i`` of the result is a list holding the ``i``-th score of every
    system, in argument order. As with ``zip``, rows stop at the shortest
    input.
    """
    matrix = []
    for topic_scores in zip(*arr):
        matrix.append(list(topic_scores))
    return matrix

We compare 5 systems in this example.
1. `bm25new`: standard BM25 with `k1=1.6` and `b=0.2`.
2. `bm25def`: standard BM25 with `k1=0.9` and `b=0.4`.
3. `bm25rm3`: BM25+RM3 reranking with `k1=1.2` and `b=0.3`, and `fbt=1`, `fbd=10` and `w=0.9`.
4. `bm25rm3def`: BM25+RM3 reranking with `k1=0.9` and `b=0.4`, and `fbt=10`, `fbd=10` and `w=0.5`.
5. `bm25prf`: BM25PRF with `k1=1.2` and `b=0.3`, and `fbt=5`, `fbd=10`, `w=0.2`, `k1=1.2` and `b=0.3`.

In [4]:
# Per-topic NDCG20 scores for each of the five systems (topics 251-300 result files).
bm25new = get_system_metrics(
    fname='../results/bm25var/output/res_k1.6_b0.2_251-300.txt',
    metric='NDCG20',
)
bm25def = get_system_metrics(
    fname='../results/bm25var/output/res_k0.9_b0.4_251-300.txt',
    metric='NDCG20',
)
bm25rm3 = get_system_metrics(
    fname='../results/bm25rm3/output/res_k1.2_b0.3_fbt1_fbd10_w0.9_251-300.txt',
    metric='NDCG20',
)
bm25rm3def = get_system_metrics(
    fname='../results/bm25rm3/output/res_k0.9_b0.4_fbt10_fbd10_w0.5_251-300.txt',
    metric='NDCG20',
)
bm25prf = get_system_metrics(
    fname='../results/bm25prf/output/res_k1.2_b0.3_fbt5_fbd10_w0.2_251-300.txt',
    metric='NDCG20',
)

In [5]:
# Transpose the per-system score lists into one row per topic and one column
# per system (column order: bm25new, bm25def, bm25rm3, bm25rm3def, bm25prf).
sys_mat = create_system_matrix(bm25new, bm25def, bm25rm3, bm25rm3def, bm25prf)

In [6]:
# Run the imported randomized_tukey_hsd helper on the topic-by-system matrix.
# NOTE(review): 5000 is presumably the number of randomization trials —
# confirm against the helper's signature.
# NOTE(review): no random seed is set in this notebook, so the reported
# p-values may vary slightly between runs — confirm how the helper draws
# its random numbers.
p_vals = randomized_tukey_hsd(sys_mat, 5000)

In [7]:
# Print one p-value per system pair. The pair numbers in the output appear to
# be 1-based positions in the argument order given to create_system_matrix
# (1 = bm25new, ..., 5 = bm25prf) — verify against print_p_vals.
print_p_vals(p_vals)

Systems (1, 2): p value 0.9834
Systems (1, 3): p value 1.0
Systems (1, 4): p value 0.229
Systems (1, 5): p value 0.0026
Systems (2, 3): p value 0.989
Systems (2, 4): p value 0.532
Systems (2, 5): p value 0.0262
Systems (3, 4): p value 0.2516
Systems (3, 5): p value 0.0034
Systems (4, 5): p value 0.6532


Thus, at `alpha=0.05`, there are significant differences between the system pairs (1, 5), (2, 5), and (3, 5), which are:

- bm25new - bm25prf
- bm25def - bm25prf
- bm25rm3 - bm25prf