In [1]:
import collections
import numpy as np
import pickle


In [10]:
# TREC run file of every retrieval experiment (placeholders: set these before running)
experiments = dict(
    BM25='TO_BE_SET',
    BERT_Base='TO_BE_SET',
)

# Pickled document bias values, one file per measurement variant
docs_bias_paths = dict(
    tc="data/msmarco_passage/docs_bias_tc.pkl",
    tf="data/msmarco_passage/docs_bias_tf.pkl",
    bool="data/msmarco_passage/docs_bias_bool.pkl",
)

# Rank cut-offs at which the ranking bias is computed
at_ranklist = [5, 10, 20, 30, 40]

# CSV whose first column holds the ids of the queries to keep
queries_gender_annotated_path = "resources/queries_gender_annotated.csv"

# Directory prefix for the result pickles written at the end of the notebook
save_path_base = "data/msmarco_passage"

In [3]:
# Load the pickled document bias values of every measurement variant.
# NOTE: pickle.load can execute arbitrary code -- only point these paths at trusted files.
docs_bias = {}
for _method, _pkl_path in docs_bias_paths.items():
    print (_method)
    with open(_pkl_path, 'rb') as fr:
        docs_bias[_method] = pickle.load(fr)


tc
tf
bool


In [11]:
# Collect the ids of the annotated queries: the first comma-separated
# field of every line, parsed as an integer and de-duplicated in a set.
with open(queries_gender_annotated_path, 'r') as fr:
    qryids_filter = {int(row.strip().split(',')[0]) for row in fr}

print (len(qryids_filter))

3750


In [17]:
#Loading run files

# runs_docs_bias[exp_name][_method][qryid] is the list of document bias values of
# that query's retrieved documents, in the order the documents appear in the run file.
# NOTE(review): the metrics computed later treat list position as rank, so each query's
# rows are presumably sorted by rank in the run file -- confirm for new run files.
runs_docs_bias = {}

for exp_name in experiments:

    run_path = experiments[exp_name]
    runs_docs_bias[exp_name] = {_method: {} for _method in docs_bias_paths}

    with open(run_path) as fr:
        for line in fr:
            # Split on any run of whitespace: run files delimited by tabs or by several
            # spaces would otherwise fail the 6-column check and be dropped silently.
            vals = line.split()
            if len(vals) != 6:
                # Not a 6-column run line; skipped.
                continue

            qryid = int(vals[0])
            docid = int(vals[2])

            # Keep only the annotated queries.
            if qryid not in qryids_filter:
                continue

            for _method in docs_bias_paths:
                # Create the query's list the first time the query is seen. The previous
                # "current query" sentinel (initialised to 0) raised KeyError for a query
                # id of 0 and wiped a query's list whenever its rows were not contiguous.
                # A docid without a bias entry still raises KeyError on purpose.
                runs_docs_bias[exp_name][_method].setdefault(qryid, []).append(docs_bias[_method][docid])

    for _method in docs_bias_paths:
        print ("Number of effective queries in %s using %s : %d" % (exp_name, _method, len(runs_docs_bias[exp_name][_method])))
    print ()
print ('done!')



Number of effective queries in BM25 using tc : 215
Number of effective queries in BM25 using tf : 215
Number of effective queries in BM25 using bool : 215

Number of effective queries in BERT_Base using tc : 215
Number of effective queries in BERT_Base using tf : 215
Number of effective queries in BERT_Base using bool : 215

done!


In [18]:
def calc_RaB_q(bias_list, at_rank):
    """Average each bias component over the first `at_rank` entries of a ranked list.

    bias_list: sequence of 3-element entries; positions 0, 1 and 2 are averaged
        independently (the original naming suggests overall, female and male bias).
    at_rank: cut-off; when the list is shorter, all available entries are used.

    Returns a 3-tuple with the mean of position 0, position 1 and position 2.
    """
    top_entries = bias_list[:at_rank]
    return tuple(np.mean([entry[pos] for entry in top_entries]) for pos in range(3))
       
    
def calc_ARaB_q(bias_list, at_rank):
    """Average the `calc_RaB_q` results over every cut-off from 1 up to `at_rank`.

    Cut-offs larger than the length of `bias_list` are left out, so a short
    list is averaged over the cut-offs 1..len(bias_list) only.

    Returns a 3-tuple: the mean over cut-offs of each `calc_RaB_q` component.
    """
    deepest_cutoff = min(at_rank, len(bias_list))
    rab_per_cutoff = [calc_RaB_q(bias_list, cutoff) for cutoff in range(1, deepest_cutoff + 1)]

    return tuple(np.mean([rab[pos] for rab in rab_per_cutoff]) for pos in range(3))

# Small hand-made ranked list used to eyeball both metrics at cut-off 10
_test = [
    (0.0, 0.0, 0.0),
    (3, 3, 0.0),
    (0, 0, 0.0),
    (0, 0, 0.0),
    (0, 0, 0.0),
    (0, 0, 0.0),
    (0, 0.0, 0.0),
    (-5, 0.0, 5),
    (0, 0.0, 0.0),
    (-2, 0.0, 2),
]

for _label, _metric in (('RaB_q', calc_RaB_q), ('ARaB_q', calc_ARaB_q)):
    print (_label, _metric(_test, 10))


RaB_q (-0.4, 0.3, 0.7)
ARaB_q (0.3906349206349206, 0.5786904761904761, 0.18805555555555556)


In [21]:
# Both result dicts are indexed as [exp_name][_method][at_rank][qry_id] and hold
# the 3-tuple returned by calc_RaB_q / calc_ARaB_q for that query and cut-off.
qry_bias_RaB = {}
qry_bias_ARaB = {}

print ('Calculating ranking bias ...')

for exp_name in experiments:
    qry_bias_RaB[exp_name] = {}
    qry_bias_ARaB[exp_name] = {}

    for _method in docs_bias_paths:
        print (exp_name, _method)

        # Ranked bias lists of this experiment/method, keyed by query id
        _ranked_bias = runs_docs_bias[exp_name][_method]

        qry_bias_RaB[exp_name][_method] = {
            _cutoff: {_qid: calc_RaB_q(_bias, _cutoff) for _qid, _bias in _ranked_bias.items()}
            for _cutoff in at_ranklist
        }
        qry_bias_ARaB[exp_name][_method] = {
            _cutoff: {_qid: calc_ARaB_q(_bias, _cutoff) for _qid, _bias in _ranked_bias.items()}
            for _cutoff in at_ranklist
        }

print ('done!')


Calculating ranking bias ...
BM25 tc
BM25 tf
BM25 bool
BERT_Base tc
BERT_Base tf
BERT_Base bool
done!


In [23]:
# Write one RaB and one ARaB pickle per (experiment, method) pair
for exp_name in experiments:
    for _method in docs_bias_paths:
        save_path = save_path_base + "/run_bias_%s_%s" % (exp_name, _method)

        print (save_path)

        for _suffix, _results in (('_RaB.pkl', qry_bias_RaB), ('_ARaB.pkl', qry_bias_ARaB)):
            with open(save_path + _suffix, 'wb') as fw:
                pickle.dump(_results[exp_name][_method], fw)

    

data/msmarco_passage/run_bias_BM25_tc
data/msmarco_passage/run_bias_BM25_tf
data/msmarco_passage/run_bias_BM25_bool
data/msmarco_passage/run_bias_BERT_Base_tc
data/msmarco_passage/run_bias_BERT_Base_tf
data/msmarco_passage/run_bias_BERT_Base_bool
