In [88]:
import json

In [89]:
# Load the affected queries: a JSON object mapping each affected query id to the
# empty datasets that appear in that query's ranking.
# NOTE(review): the path is absolute and machine-specific; consider a configurable base dir.
with open("/home/manuel/Tesi/Codebase/ARC/files/affected_queries.json", "r") as affected_queries_json:
    # the context manager closes the handle even if the JSON is malformed
    affected_queries_with_datasets = json.load(affected_queries_json)

affected_queries = set(affected_queries_with_datasets)
print(f"Affected queries: {len(affected_queries)}")

# Union of all the empty datasets listed under any affected query
empty_datasets_in_affected_queries = set()
for empty in affected_queries_with_datasets.values():
    empty_datasets_in_affected_queries.update(empty)

empty_datasets = set(empty_datasets_in_affected_queries)
print(f"Number of empty datasets in affected queries: {len(empty_datasets)}")

Affected queries: 210
Number of empty datasets in affected queries: 341


In [90]:
"""
@param path_run: string with the path of the run file
@return dict, key: affected query id, value: rank of the query (ordered list)
"""
def returnRankAffectedQueries(path_run: str, queries=None) -> dict:
    """
    Collect, for every affected query, the datasets retrieved in a run file.

    Each line is split on tabs: field 0 is read as the query id and field 2 as the
    retrieved dataset id (presumably a TREC-style run file -- TODO confirm the layout).
    If a query id shows up in several non-adjacent groups of lines, only the last
    group is kept.

    @param path_run: string with the path of the run file
    @param queries: container of query ids to keep; when None, the notebook-level
                    `affected_queries` set is used
    @return dict, key: affected query id, value: set of the dataset ids retrieved
            for that query (a set, so the ranking order is not preserved)
    @raise IndexError: if a line of a kept query has fewer than three tab-separated fields
    """
    if queries is None:
        # fall back to the set built by the loading cell of the notebook
        queries = affected_queries

    ranks = dict()
    previous_query = None

    # the context manager guarantees the file is closed even if a line is malformed
    with open(path_run, "r") as run:
        for line in run:
            fields = line.split("\t")
            query_id = fields[0]

            if query_id in queries:
                if query_id != previous_query:
                    # a new contiguous group of lines starts for this query
                    ranks[query_id] = set()
                ranks[query_id].add(fields[2])

            previous_query = query_id

    return ranks


In [91]:
# Build, for every affected query, the set of datasets retrieved by ACORDAR
# and the set retrieved by our own run.

path_acordar_run = "/home/manuel/Tesi/Codebase/ARC/run/ACORDAR/BM25F[d].txt"
acordar_ranks = returnRankAffectedQueries(path_acordar_run)

path_run = "/home/manuel/Tesi/Codebase/ARC/run/EDS/BM25[c].txt"
our_ranks = returnRankAffectedQueries(path_run)

# number of affected queries found in each run
print(len(acordar_ranks))
print(len(our_ranks))


210
210


In [92]:
# For each affected query, keep only the empty datasets that ACORDAR actually
# returned for it (its retrieved set intersected with the known empty datasets).

empty_datasets_in_acordar_queries = dict()

i = 0  # queries for which ACORDAR returned at least one empty dataset
d = 0  # total number of empty datasets returned over those queries
for query, rank in acordar_ranks.items():
    empty_datasets_returned = rank & empty_datasets
    if not empty_datasets_returned:
        continue

    print(f"ACORDAR returns empty datasets in query: {query}, datasets: {empty_datasets_returned}")
    empty_datasets_in_acordar_queries[query] = empty_datasets_returned
    i += 1
    d += len(empty_datasets_returned)

print("Numer of queries:" + str(i))
print("Numer of empty datasets:" + str(d))


ACORDAR returns empty datasets in query: 3, datasets: {'2175', '15636'}
ACORDAR returns empty datasets in query: 22, datasets: {'2690'}
ACORDAR returns empty datasets in query: 28, datasets: {'24355'}
ACORDAR returns empty datasets in query: 108, datasets: {'63268'}
ACORDAR returns empty datasets in query: 134, datasets: {'85975'}
ACORDAR returns empty datasets in query: 148, datasets: {'10658'}
ACORDAR returns empty datasets in query: 175, datasets: {'4032'}
ACORDAR returns empty datasets in query: 194, datasets: {'24751'}
ACORDAR returns empty datasets in query: 1004, datasets: {'15771', '15779', '15786', '15753'}
ACORDAR returns empty datasets in query: 1053, datasets: {'13255', '15234'}
ACORDAR returns empty datasets in query: 1055, datasets: {'12515'}
ACORDAR returns empty datasets in query: 1065, datasets: {'13349', '14509'}
ACORDAR returns empty datasets in query: 1078, datasets: {'15803'}
ACORDAR returns empty datasets in query: 1095, datasets: {'12515'}
ACORDAR returns empty d

In [93]:
# For every query where ACORDAR returned empty datasets, find which of those
# datasets are missing from our own result set.

not_returned_datasets = dict()
d = 0  # total number of empty datasets we do not return

for query, empty_datasets_acordar in empty_datasets_in_acordar_queries.items():
    retrieved_by_us = our_ranks[query]
    datasets = {
        empty_dataset
        for empty_dataset in empty_datasets_acordar
        if empty_dataset not in retrieved_by_us
    }

    # nothing missing: every empty dataset returned by ACORDAR is in our run too
    if not datasets:
        continue

    not_returned_datasets[query] = datasets
    d += len(datasets)
    print(f"Query: {query}, not returned: {datasets}")

print(f"Number of queries where i don't return empty datasets: {len(not_returned_datasets)}")
print(f"Not returned datasets: {d}")

Query: 3, not returned: {'2175', '15636'}
Query: 22, not returned: {'2690'}
Query: 28, not returned: {'24355'}
Query: 108, not returned: {'63268'}
Query: 134, not returned: {'85975'}
Query: 148, not returned: {'10658'}
Query: 175, not returned: {'4032'}
Query: 194, not returned: {'24751'}
Query: 1004, not returned: {'15771', '15779', '15786', '15753'}
Query: 1053, not returned: {'13255', '15234'}
Query: 1055, not returned: {'12515'}
Query: 1065, not returned: {'13349', '14509'}
Query: 1078, not returned: {'15803'}
Query: 1095, not returned: {'12515'}
Query: 1170, not returned: {'10436'}
Query: 1175, not returned: {'69526'}
Query: 1179, not returned: {'11285'}
Query: 50, not returned: {'15636'}
Query: 87, not returned: {'28732'}
Query: 197, not returned: {'73056'}
Query: 199, not returned: {'7885', '73725', '73674', '3106'}
Query: 238, not returned: {'9219'}
Query: 248, not returned: {'83391'}
Query: 1012, not returned: {'14735'}
Query: 1103, not returned: {'8722'}
Query: 1132, not retu

In [94]:
# Show the most affected queries of our run first: (query, missing datasets) pairs
# ordered by decreasing number of empty datasets that we do not return.

not_returned_datasets_sorted = sorted(
    not_returned_datasets.items(),
    key=lambda item: len(item[1]),
    reverse=True,
)
print(not_returned_datasets_sorted)

[('158', {'14735', '15250', '13815', '63268', '3106'}), ('1004', {'15771', '15779', '15786', '15753'}), ('199', {'7885', '73725', '73674', '3106'}), ('51', {'73079', '73048', '73029', '73133'}), ('1201', {'15781', '15715', '15767', '15803'}), ('213', {'7885', '73725', '8092'}), ('1037', {'3106', '2126', '29330'}), ('1093', {'13349', '14509', '13043'}), ('1052', {'41092', '15636', '9776'}), ('1241', {'11789', '13349', '14509'}), ('3', {'2175', '15636'}), ('1053', {'13255', '15234'}), ('1065', {'13349', '14509'}), ('152', {'11285', '29436'}), ('1021', {'13349', '14509'}), ('1106', {'41092', '9776'}), ('1109', {'2678', '49816'}), ('1126', {'2175', '15803'}), ('1234', {'24751', '69820'}), ('57', {'73056', '85524'}), ('75', {'3106', '29330'}), ('252', {'13907', '15270'}), ('1063', {'13349', '14509'}), ('26', {'13349', '14509'}), ('1027', {'13255', '15234'}), ('22', {'2690'}), ('28', {'24355'}), ('108', {'63268'}), ('134', {'85975'}), ('148', {'10658'}), ('175', {'4032'}), ('194', {'24751'})

In [95]:
# Histogram: how many queries miss exactly 1 empty dataset, exactly 2, and so on.

buckets = dict()

for query, empty_datasets_not_returned in not_returned_datasets_sorted:
    # group the query ids by the number of empty datasets not returned
    buckets.setdefault(len(empty_datasets_not_returned), []).append(query)

for bucket, query_list in buckets.items():
    print(f"Number of empty datasets not returned: {bucket} Number of queries: {len(query_list)}")


Number of empty datasets not returned: 5 Number of queries: 1
Number of empty datasets not returned: 4 Number of queries: 4
Number of empty datasets not returned: 3 Number of queries: 5
Number of empty datasets not returned: 2 Number of queries: 15
Number of empty datasets not returned: 1 Number of queries: 49


In [96]:
"""
@param trec_file file to a P@10 info file
@return dict: key: query id value: P@10
"""
def recoverAllP10(trec_file):
    """
    Read a P@10 info file and return the P@10 of every query it lists.

    Each non-blank line is split on whitespace; field 1 is read as the query id and
    field 2 as the P@10 value. Blank lines are skipped.
    NOTE(review): no row is filtered out, so a summary row (if the file has one)
    ends up in the result like any other query -- confirm against the input file.

    @param trec_file: open text file (or file-like object) with the P@10 info
    @return dict: key: query id, value: P@10 as the string found in the file
    @raise IndexError: if a non-blank line has fewer than three fields
    """
    query_p10 = dict()

    for line in trec_file:
        fields = line.split()

        if not fields:
            # blank / whitespace-only line (e.g. a trailing newline): nothing to read
            continue

        query_p10[fields[1]] = fields[2]

    return query_p10
    

In [97]:
"""
@param trec_file file to a P@10 info file
@parma affected_queries set of queries ids (144 queries where I don't return some empty datasets)
@return dict: key: query id value: P@10
"""
def recoverP10AffectedFromTrec(trec_file, affected_queries):
    """
    Read a P@10 info file and return the P@10 of the requested queries only.

    Each non-blank line is split on whitespace; field 1 is read as the query id and
    field 2 as the P@10 value. Blank lines are skipped.

    @param trec_file: open text file (or file-like object) with the P@10 info
    @param affected_queries: container of query ids to keep (membership is tested with `in`)
    @return dict: key: query id, value: P@10 as the string found in the file
    @raise IndexError: if a non-blank line has fewer than three fields
    """
    query_p10 = dict()

    for line in trec_file:
        fields = line.split()

        if not fields:
            # blank / whitespace-only line (e.g. a trailing newline): nothing to read
            continue

        query = fields[1]
        p10 = fields[2]

        if query in affected_queries:
            query_p10[query] = p10

    return query_p10
    

In [98]:
# For every query where we miss some empty dataset, recover the P@10 obtained by
# the ACORDAR run and by our run, and store them next to the number of missed datasets.

# context managers make sure both P@10 files are closed once they have been read
with open("/home/manuel/Tesi/Codebase/ARC/run/ACORDAR/BM25F[d]_P10.txt", "r") as acordar_trec:
    acordar_P10 = recoverP10AffectedFromTrec(acordar_trec, not_returned_datasets.keys())

with open("/home/manuel/Tesi/Codebase/ARC/run/EDS/BM25[c]_P10.txt", "r") as run_trec:
    run_P10 = recoverP10AffectedFromTrec(run_trec, not_returned_datasets.keys())

comparison = dict()

i = 0  # number of queries compared
for query, p10 in acordar_P10.items():
    features = dict()
    features["P10 (A)"] = float(p10)              # P@10 of the ACORDAR run
    features["P10 (E)"] = float(run_P10[query])   # P@10 of our run
    features["len"] = len(not_returned_datasets[query])  # empty datasets we do not return

    print(f"Query ID: {query} P10 (A): {p10} P10 (E): {run_P10[query]} Not returned empty datasets: {len(not_returned_datasets[query])}")
    comparison[query] = features
    i += 1
print(i)

Query ID: 1004 P10 (A): 0.6000 P10 (E): 0.6000 Not returned empty datasets: 4
Query ID: 1012 P10 (A): 0.2000 P10 (E): 0.2000 Not returned empty datasets: 1
Query ID: 1021 P10 (A): 0.2000 P10 (E): 0.0000 Not returned empty datasets: 2
Query ID: 1024 P10 (A): 0.0000 P10 (E): 0.0000 Not returned empty datasets: 1
Query ID: 1027 P10 (A): 0.2000 P10 (E): 0.2000 Not returned empty datasets: 2
Query ID: 1037 P10 (A): 0.2000 P10 (E): 0.0000 Not returned empty datasets: 3
Query ID: 1039 P10 (A): 0.0000 P10 (E): 0.0000 Not returned empty datasets: 1
Query ID: 1052 P10 (A): 0.2000 P10 (E): 0.5000 Not returned empty datasets: 3
Query ID: 1053 P10 (A): 0.2000 P10 (E): 0.0000 Not returned empty datasets: 2
Query ID: 1055 P10 (A): 0.1000 P10 (E): 0.0000 Not returned empty datasets: 1
Query ID: 1063 P10 (A): 0.2000 P10 (E): 0.0000 Not returned empty datasets: 2
Query ID: 1065 P10 (A): 0.2000 P10 (E): 0.0000 Not returned empty datasets: 2
Query ID: 1078 P10 (A): 0.0000 P10 (E): 0.0000 Not returned empt

In [99]:
# Load the P@10 of every query of our run (not only the affected ones).
# The context manager closes the file as soon as it has been read.
with open("/home/manuel/Tesi/Codebase/ARC/run/EDS/BM25[c]_P10.txt", "r") as run_trec:
    run_all_P10 = recoverAllP10(run_trec)

In [100]:
# List the queries where our P@10, increased by one hit for every empty dataset we
# do not return, would still not exceed the P@10 of ACORDAR.
i = 0
for query, features in comparison.items():
    # P@10 is a multiple of 0.1, so compare integer hit counts: with floats,
    # 0.1 + 2 / 10 <= 0.3 is False and a query on the boundary would be lost.
    hits_ours = round(features["P10 (E)"] * 10)
    hits_acordar = round(features["P10 (A)"] * 10)

    if hits_ours + features["len"] <= hits_acordar:
        print(f"Query ID: {query} P10 (A): {features['P10 (A)']} P10 (E): {features['P10 (E)']} Not returned empty datasets: {features['len']}")
        i += 1

print(i)

Query ID: 1021 P10 (A): 0.2 P10 (E): 0.0 Not returned empty datasets: 2
Query ID: 1053 P10 (A): 0.2 P10 (E): 0.0 Not returned empty datasets: 2
Query ID: 1055 P10 (A): 0.1 P10 (E): 0.0 Not returned empty datasets: 1
Query ID: 1063 P10 (A): 0.2 P10 (E): 0.0 Not returned empty datasets: 2
Query ID: 1065 P10 (A): 0.2 P10 (E): 0.0 Not returned empty datasets: 2
Query ID: 108 P10 (A): 0.1 P10 (E): 0.0 Not returned empty datasets: 1
Query ID: 1083 P10 (A): 0.4 P10 (E): 0.0 Not returned empty datasets: 1
Query ID: 1086 P10 (A): 0.1 P10 (E): 0.0 Not returned empty datasets: 1
Query ID: 1095 P10 (A): 0.1 P10 (E): 0.0 Not returned empty datasets: 1
Query ID: 1109 P10 (A): 0.4 P10 (E): 0.1 Not returned empty datasets: 2
Query ID: 1138 P10 (A): 0.7 P10 (E): 0.1 Not returned empty datasets: 1
Query ID: 1164 P10 (A): 0.8 P10 (E): 0.6 Not returned empty datasets: 1
Query ID: 117 P10 (A): 0.1 P10 (E): 0.0 Not returned empty datasets: 1
Query ID: 1170 P10 (A): 0.1 P10 (E): 0.0 Not returned empty datase

In [101]:
# Calculate the new P@10 of our run if the missing empty datasets were returned:
# each of them is counted as one extra hit in the top 10, but only for the queries
# where the resulting P@10 would not exceed the one of ACORDAR.

new_run_all_P10 = run_all_P10.copy()

for query, features in comparison.items():
    # P@10 is a multiple of 0.1, so work on integer hit counts: with floats,
    # 0.1 + 2 / 10 <= 0.3 is False and a query on the boundary would be skipped.
    hits_ours = round(features["P10 (E)"] * 10)
    hits_acordar = round(features["P10 (A)"] * 10)
    recovered_hits = hits_ours + features["len"]

    if recovered_hits <= hits_acordar:
        new_run_all_P10[query] = recovered_hits / 10


In [102]:
# Mean of the recomputed P@10 values.
# NOTE(review): 493 is hard-coded -- presumably the total number of queries; confirm
# that it is meant to differ from len(new_run_all_P10) when queries are missing.
total_p10 = 0.0
for p10_value in new_run_all_P10.values():
    # values are strings read from the file, or floats for the recomputed queries
    total_p10 += float(p10_value)
new_P10 = total_p10 / 493

In [103]:
# display the recomputed mean P@10 of our run
new_P10

0.07945354969574055