In [38]:
import json

In [39]:
# Load the affected queries: a JSON object whose keys are the affected query ids
# and whose values list the empty datasets found in that query's ranking.
# NOTE(review): absolute, machine-specific path — consider a configurable base directory.

# The context manager closes the file as soon as the JSON has been parsed.
with open("/home/manuel/Tesi/Codebase/ARC/files/affected_queries.json", "r") as affected_queries_json:
    affected_queries_with_datasets = json.load(affected_queries_json)

affected_queries = set(affected_queries_with_datasets)
print(f"Affected queries: {len(affected_queries)}")

# Collect every empty dataset listed for at least one affected query.
empty_datasets_in_affected_queries = set()
for empty in affected_queries_with_datasets.values():
    empty_datasets_in_affected_queries.update(empty)

# Independent copy of the set under the shorter name `empty_datasets`.
empty_datasets = set(empty_datasets_in_affected_queries)
print(f"Number of empty datasets in affected queries: {len(empty_datasets)}")

Affected queries: 210
Number of empty datasets in affected queries: 341


In [40]:
"""
@param path_run: string with the path of the run file
@return dict, key: affected query id, value: rank of the query (ordered list)
"""
def returnRankAffectedQueries(path_run: str) -> dict: 
    run = open(path_run, "r")

    ranks = dict()

    while True:
        line = run.readline()

        if not line:
            break
        
        split = line.split("\t")

        query_id = split[0]

        if query_id in affected_queries:
            rank = set()
            rank.add(split[2])
        
            while True:
                previous_position = run.tell()

                ln = run.readline()

                if not ln:
                    break

                splt = ln.split("\t")
                actual_query = splt[0]

                if actual_query == query_id:
                    rank.add(splt[2])
                else:
                    run.seek(previous_position)
                    break

            ranks[query_id] = rank 

    run.close()

    return ranks


In [55]:
# Build, for the ACORDAR run and for our run, the mapping
# affected query id -> datasets returned for that query.

path_acordar_run = "/home/manuel/Tesi/Codebase/ARC/run/ACORDAR/BM25F.txt"
path_run = "/home/manuel/Tesi/Codebase/ARC/run/EDS/BM25[m+c].txt"

acordar_ranks, our_ranks = (
    returnRankAffectedQueries(path_acordar_run),
    returnRankAffectedQueries(path_run),
)

# Number of affected queries found in each run.
print(len(acordar_ranks))
print(len(our_ranks))


210
210


In [56]:
# Reduce every ACORDAR ranking to the empty datasets it contains, keeping only
# the queries for which ACORDAR returned at least one empty dataset.

empty_datasets_in_acordar_queries = dict()

for query, rank in acordar_ranks.items():
    empty_datasets_returned = rank.intersection(empty_datasets)

    # Nothing to record when the ranking contains no empty dataset.
    if not empty_datasets_returned:
        continue

    print(f"ACORDAR returns empty datasets in query: {query}, datasets: {empty_datasets_returned}")
    empty_datasets_in_acordar_queries[query] = empty_datasets_returned

# One dict entry was stored per reported query, so the size is the count.
i = len(empty_datasets_in_acordar_queries)
print(i)


ACORDAR returns empty datasets in query: 3, datasets: {'25039', '25054'}
ACORDAR returns empty datasets in query: 22, datasets: {'2690'}
ACORDAR returns empty datasets in query: 28, datasets: {'24355'}
ACORDAR returns empty datasets in query: 80, datasets: {'88119', '88100'}
ACORDAR returns empty datasets in query: 108, datasets: {'11285'}
ACORDAR returns empty datasets in query: 112, datasets: {'63268', '29436', '63269', '11285'}
ACORDAR returns empty datasets in query: 134, datasets: {'85975', '55976'}
ACORDAR returns empty datasets in query: 175, datasets: {'11789'}
ACORDAR returns empty datasets in query: 194, datasets: {'24751'}
ACORDAR returns empty datasets in query: 206, datasets: {'28924', '28732', '28671'}
ACORDAR returns empty datasets in query: 1004, datasets: {'15779', '15753', '15771', '15786'}
ACORDAR returns empty datasets in query: 1028, datasets: {'41454'}
ACORDAR returns empty datasets in query: 1036, datasets: {'2206'}
ACORDAR returns empty datasets in query: 1041, 

In [57]:
# For each query where ACORDAR returned empty datasets, record the ones that
# are missing from our ranking for the same query.

not_returned_datasets = dict()

for query, empty_datasets_acordar in empty_datasets_in_acordar_queries.items():
    returned_by_us = our_ranks[query]

    # Empty datasets that ACORDAR returned and we did not.
    datasets = {ds for ds in empty_datasets_acordar if ds not in returned_by_us}

    # We returned all of them: the query is not recorded.
    if not datasets:
        continue

    not_returned_datasets[query] = datasets
    print(f"Query: {query}, not returned: {datasets}")

print(f"Number of queries where i don't return empty datasets: {len(not_returned_datasets)}")

Query: 22, not returned: {'2690'}
Query: 28, not returned: {'24355'}
Query: 80, not returned: {'88119', '88100'}
Query: 108, not returned: {'11285'}
Query: 112, not returned: {'63268', '63269', '11285', '29436'}
Query: 134, not returned: {'55976'}
Query: 175, not returned: {'11789'}
Query: 194, not returned: {'24751'}
Query: 206, not returned: {'28924', '28732'}
Query: 1004, not returned: {'15786', '15753', '15779', '15771'}
Query: 1028, not returned: {'41454'}
Query: 1036, not returned: {'2206'}
Query: 1041, not returned: {'55459'}
Query: 1065, not returned: {'14509', '13349'}
Query: 1074, not returned: {'13290'}
Query: 1078, not returned: {'15803', '3786'}
Query: 1095, not returned: {'12515'}
Query: 1124, not returned: {'77345'}
Query: 1149, not returned: {'2513'}
Query: 1160, not returned: {'2187', '2171', '2124', '2065', '2263'}
Query: 1170, not returned: {'9461', '10436', '43154'}
Query: 1175, not returned: {'69526'}
Query: 1179, not returned: {'41076'}
Query: 1213, not returned: 

In [60]:
# Order the queries by how many empty datasets our run does not return
# (largest count first) and show the resulting list of (query, datasets) pairs.

not_returned_datasets_sorted = list(not_returned_datasets.items())
not_returned_datasets_sorted.sort(key=lambda entry: len(entry[1]), reverse=True)
print(not_returned_datasets_sorted)

[('1182', {'2079', '2187', '1998', '2171', '2270', '2124', '2048', '2065', '2567', '2263'}), ('10', {'85047', '82746', '82631', '83073', '82816', '85313', '83577'}), ('1106', {'11831', '41132', '41092', '41136', '9776', '41073', '7542'}), ('1238', {'2243', '3108', '42796', '2945', '42845', '24005', '2987'}), ('1160', {'2187', '2171', '2124', '2065', '2263'}), ('1042', {'15786', '15743', '15771', '15788', '15779'}), ('1203', {'969', '1103', '961', '4053', '1859'}), ('37', {'3100', '3143', '43652', '2754', '44450'}), ('158', {'15250', '3106', '13815', '63268', '14735'}), ('112', {'63268', '63269', '11285', '29436'}), ('1004', {'15786', '15753', '15779', '15771'}), ('1219', {'25119', '85767', '25048', '1986'}), ('50', {'2218', '24610', '15636', '24483'}), ('243', {'10076', '42120', '42278', '10040'}), ('1039', {'49190', '2520', '49055', '2307'}), ('57', {'84877', '83073', '82746', '85524'}), ('1052', {'41087', '41136', '41132', '11795'}), ('1170', {'9461', '10436', '43154'}), ('189', {'83

In [61]:
# Group the queries by the number of empty datasets that are not returned:
# buckets[n] is the list of queries missing exactly n empty datasets.

buckets = dict()

for query, missing in not_returned_datasets_sorted:
    # setdefault creates the bucket the first time a size is seen.
    buckets.setdefault(len(missing), []).append(query)

for size, queries in buckets.items():
    print(f"Number of empty datasets not returned: {size} Number of queries: {len(queries)}")


Number of empty datasets not returned: 10 Number of queries: 1
Number of empty datasets not returned: 7 Number of queries: 3
Number of empty datasets not returned: 5 Number of queries: 5
Number of empty datasets not returned: 4 Number of queries: 8
Number of empty datasets not returned: 3 Number of queries: 12
Number of empty datasets not returned: 2 Number of queries: 30
Number of empty datasets not returned: 1 Number of queries: 64


In [62]:
"""
@param trec_file file to a P@10 info file
@return dict: key: query id value: P@10
"""
def recoverAllP10(trec_file):
    query_p10 = dict()

    while True:
        line = trec_file.readline()

        if not line:
            break

        split = line.split()
        query = split[1]
        p10 = split[2]

        query_p10[query] = p10
    
    return query_p10
    

In [63]:
"""
@param trec_file file to a P@10 info file
@parma affected_queries set of queries ids (144 queries where I don't return some empty datasets)
@return dict: key: query id value: P@10
"""
def recoverP10AffectedFromTrec(trec_file, affected_queries):
    query_p10 = dict()

    while True:
        line = trec_file.readline()

        if not line:
            break

        split = line.split()
        query = split[1]
        p10 = split[2]

        if query in affected_queries:
            query_p10[query] = p10
    
    return query_p10
    

In [64]:
# For every query where we miss some empty datasets, recover the P@10 obtained
# by ACORDAR and by our run, and store them with the number of missing datasets.

# Each file is opened in a context manager so the handle is closed once read.
with open("/home/manuel/Tesi/Codebase/ARC/run/ACORDAR/BM25_P10.txt", "r") as acordar_trec:
    acordar_P10 = recoverP10AffectedFromTrec(acordar_trec, not_returned_datasets.keys())

with open("/home/manuel/Tesi/Codebase/ARC/run/EDS/BM25_P10.txt", "r") as run_trec:
    run_P10 = recoverP10AffectedFromTrec(run_trec, not_returned_datasets.keys())

# comparison[query] = {"P10 (A)": ACORDAR P@10, "P10 (E)": our P@10,
#                      "len": number of empty datasets we do not return}
comparison = dict()

for query, p10 in acordar_P10.items():
    features = dict()
    features["P10 (A)"] = float(p10)
    # Raises KeyError if our P@10 file has no entry for this query.
    features["P10 (E)"] = float(run_P10[query])
    features["len"] = len(not_returned_datasets[query])

    print(f"Query ID: {query} P10 (A): {p10} P10 (E): {run_P10[query]} Not returned empty datasets: {len(not_returned_datasets[query])}")
    comparison[query] = features

# One entry is stored per printed query.
i = len(comparison)
print(i)

Query ID: 10 P10 (A): 0.7000 P10 (E): 0.0000 Not returned empty datasets: 7
Query ID: 1004 P10 (A): 0.7000 P10 (E): 0.6000 Not returned empty datasets: 4
Query ID: 1006 P10 (A): 0.6000 P10 (E): 0.1000 Not returned empty datasets: 1
Query ID: 1007 P10 (A): 1.0000 P10 (E): 0.4000 Not returned empty datasets: 2
Query ID: 1012 P10 (A): 0.3000 P10 (E): 0.2000 Not returned empty datasets: 2
Query ID: 1013 P10 (A): 0.6000 P10 (E): 0.4000 Not returned empty datasets: 1
Query ID: 1014 P10 (A): 1.0000 P10 (E): 0.7000 Not returned empty datasets: 2
Query ID: 1020 P10 (A): 0.2000 P10 (E): 0.0000 Not returned empty datasets: 2
Query ID: 1024 P10 (A): 0.3000 P10 (E): 0.0000 Not returned empty datasets: 3
Query ID: 1025 P10 (A): 0.1000 P10 (E): 0.0000 Not returned empty datasets: 1
Query ID: 1028 P10 (A): 0.8000 P10 (E): 0.6000 Not returned empty datasets: 1
Query ID: 1033 P10 (A): 1.0000 P10 (E): 0.2000 Not returned empty datasets: 3
Query ID: 1034 P10 (A): 0.3000 P10 (E): 0.0000 Not returned empty 

In [65]:
# Read our P@10 file again, this time keeping the value of every query.
# The context manager closes the handle once the file has been read.
with open("/home/manuel/Tesi/Codebase/ARC/run/EDS/BM25_P10.txt", "r") as run_trec:
    run_all_P10 = recoverAllP10(run_trec)

In [67]:
# List the queries whose P@10, after adding 0.1 for every empty dataset we do
# not return, still does not exceed the ACORDAR P@10.
# NOTE(review): this assumes each missing empty dataset would count as one more
# relevant result in the top 10 — confirm.
i = 0
for query, features in comparison.items():
    # Rounded to 4 decimals: a float sum of tenths such as 0.2 + 0.1 gives
    # 0.30000000000000004, which would wrongly fail the <= test against 0.3.
    recover = round(features["P10 (E)"] + (features["len"] / 10), 4)

    if recover <= features["P10 (A)"]:
        print(f"Query ID: {query} P10 (A): {features['P10 (A)']} P10 (E): {features['P10 (E)']} Not returned empty datasets: {features['len']}")
        i += 1

print(i)

Query ID: 10 P10 (A): 0.7 P10 (E): 0.0 Not returned empty datasets: 7
Query ID: 1006 P10 (A): 0.6 P10 (E): 0.1 Not returned empty datasets: 1
Query ID: 1007 P10 (A): 1.0 P10 (E): 0.4 Not returned empty datasets: 2
Query ID: 1013 P10 (A): 0.6 P10 (E): 0.4 Not returned empty datasets: 1
Query ID: 1014 P10 (A): 1.0 P10 (E): 0.7 Not returned empty datasets: 2
Query ID: 1020 P10 (A): 0.2 P10 (E): 0.0 Not returned empty datasets: 2
Query ID: 1024 P10 (A): 0.3 P10 (E): 0.0 Not returned empty datasets: 3
Query ID: 1025 P10 (A): 0.1 P10 (E): 0.0 Not returned empty datasets: 1
Query ID: 1028 P10 (A): 0.8 P10 (E): 0.6 Not returned empty datasets: 1
Query ID: 1033 P10 (A): 1.0 P10 (E): 0.2 Not returned empty datasets: 3
Query ID: 1034 P10 (A): 0.3 P10 (E): 0.0 Not returned empty datasets: 1
Query ID: 1036 P10 (A): 0.6 P10 (E): 0.1 Not returned empty datasets: 1
Query ID: 1038 P10 (A): 1.0 P10 (E): 0.9 Not returned empty datasets: 1
Query ID: 1040 P10 (A): 0.7 P10 (E): 0.5 Not returned empty datase

In [68]:
# Calculate the new P@10 if the empty datasets were returned.
# Work on a copy so the values read from the file stay untouched.

new_run_all_P10 = run_all_P10.copy()

for query, features in comparison.items():
    # Rounded to 4 decimals: a float sum of tenths such as 0.2 + 0.1 gives
    # 0.30000000000000004, which would wrongly fail the <= test against 0.3.
    recover = round(features["P10 (E)"] + (features["len"] / 10), 4)

    # Only queries whose recovered value does not exceed the ACORDAR P@10 are
    # updated; every other query keeps the value read from the file.
    if recover <= features["P10 (A)"]:
        new_run_all_P10[query] = recover


In [69]:
# Mean of the updated P@10 values. Values may be strings (read from the file)
# or floats (recomputed), hence the float() conversion.
# NOTE(review): 493 is presumably the total number of queries of the test
# collection rather than len(new_run_all_P10) — confirm.
p10_total = 0
for value in new_run_all_P10.values():
    p10_total += float(value)
new_P10 = p10_total / 493

In [72]:
# Show the recomputed mean P@10 as the output of the cell.
new_P10

0.2808215010141983