In [1]:
# Path of the plain-text results file that the parsing cell opens and reads line by line.
FILE_PATH = "results_trec_covid.txt"

In [2]:
import pandas as pd
import re

def parse_results(lines):
    """Parse per-subset metric records from the lines of a results file.

    Lines are stripped and dispatched on their prefix:

    * ``Subset...``  - the first run of digits on the line is the subset number.
    * ``nDCG@10...`` / ``MAP...`` / ``MRR...`` - the text between the first and
      second colon (or end of line) is parsed as a float.

    The ``MRR`` line completes a record: the four values collected so far are
    appended as one row, and the per-record state is cleared so values can never
    leak from one record into the next. All other lines are ignored.

    Args:
        lines: Iterable of text lines.

    Returns:
        Dict mapping the column names "Subset", "nDCG@10", "MAP" and "MRR" to
        equally long lists of values, one entry per completed record.

    Raises:
        ValueError: If a ``Subset`` line contains no digits, or if an ``MRR``
            line is reached while the subset number, nDCG@10 or MAP of the
            current record has not been seen.
    """
    columns = {"Subset": [], "nDCG@10": [], "MAP": [], "MRR": []}
    subset = ndcg = map_val = None

    for line_no, raw_line in enumerate(lines, start=1):
        line = raw_line.strip()
        if line.startswith("Subset"):
            match = re.search(r'\d+', line)
            if match is None:
                raise ValueError(f"line {line_no}: no subset number found in {line!r}")
            subset = int(match.group())
        elif line.startswith("nDCG@10"):
            ndcg = float(line.split(":")[1].strip())
        elif line.startswith("MAP"):
            map_val = float(line.split(":")[1].strip())
        elif line.startswith("MRR"):
            mrr = float(line.split(":")[1].strip())

            # Compare against None explicitly: subset number 0 and a score of 0.0 are valid.
            missing = [
                name
                for name, value in (("Subset", subset), ("nDCG@10", ndcg), ("MAP", map_val))
                if value is None
            ]
            if missing:
                raise ValueError(
                    f"line {line_no}: incomplete record, missing {', '.join(missing)}"
                )

            columns["Subset"].append(subset)
            columns["nDCG@10"].append(ndcg)
            columns["MAP"].append(map_val)
            columns["MRR"].append(mrr)

            # Start the next record from a clean slate so a missing line is
            # detected instead of silently reusing this record's values.
            subset = ndcg = map_val = None

    return columns


# Read the file (explicit encoding so the result does not depend on the platform default)
with open(FILE_PATH, "r", encoding="utf-8") as f:
    lines = f.readlines()

# Parse the lines into column lists and convert them to a DataFrame
data = parse_results(lines)
df = pd.DataFrame(data)

# Print DataFrame
print(df)

      Subset   nDCG@10       MAP       MRR
0          0  0.242830  0.120394  0.453432
1          1  0.265432  0.124798  0.470772
2          2  0.266056  0.125051  0.489842
3          3  0.249523  0.119131  0.475298
4          4  0.269700  0.122864  0.507224
...      ...       ...       ...       ...
1018    1018  0.247282  0.119172  0.442737
1019    1019  0.237910  0.118469  0.448955
1020    1020  0.247793  0.119102  0.467599
1021    1021  0.241111  0.119199  0.435077
1022    1022  0.240623  0.118414  0.463519

[1023 rows x 4 columns]


In [3]:
import itertools

# Candidate instruction paraphrases. Their order here determines the numbering
# of the subsets enumerated below.
paraphrases = [
    "Improve the search effectiveness by suggesting expansion terms for the query",
    "Recommend expansion terms for the query to improve search results",
    "Improve the search effectiveness by suggesting useful expansion terms for the query",
    "Maximize search utility by suggesting relevant expansion phrases for the query",
    "Enhance search efficiency by proposing valuable terms to expand the query",
    "Elevate search performance by recommending relevant expansion phrases for the query",
    "Boost the search accuracy by providing helpful expansion terms to enrich the query",
    "Increase the search efficacy by offering beneficial expansion keywords for the query",
    "Optimize search results by suggesting meaningful expansion terms to enhance the query",
    "Enhance search outcomes by recommending beneficial expansion terms to supplement the query"
]

# Every non-empty combination of paraphrases as a tuple: all single-element
# subsets first, then all pairs, and so on up to the full set. Within one size
# the order is the one produced by itertools.combinations.
all_paraphrase_subsets = list(
    itertools.chain.from_iterable(
        itertools.combinations(paraphrases, size)
        for size in range(1, len(paraphrases) + 1)
    )
)

def subset_num_to_paraphrases(num):
    """Return the tuple of paraphrases that makes up subset number ``num``.

    Args:
        num: Zero-based position of the subset in ``all_paraphrase_subsets``.

    Returns:
        The tuple stored at that position.

    Raises:
        IndexError: If ``num`` is negative or past the end of the list.
            Negative values are rejected explicitly because plain list
            indexing would count from the end and silently return a
            different subset.
    """
    if num < 0:
        raise IndexError(f"subset number must be non-negative, got {num}")
    return all_paraphrase_subsets[num]

In [4]:
# Order the subsets from best to worst nDCG@10 and report the top one.
df_sorted = df.sort_values(by='nDCG@10', ascending=False)
best_row = df_sorted.head(1)

print('Highest nDCG@10 score:\n')
print(best_row)

# Translate the winning subset number back into its paraphrases.
print('\nSubset:')
best_subset_num = int(df_sorted['Subset'].iloc[0])
print(subset_num_to_paraphrases(best_subset_num))

Highest nDCG@10 score:

     Subset   nDCG@10       MAP       MRR
279     279  0.282856  0.121228  0.527583

Subset:
('Recommend expansion terms for the query to improve search results', 'Improve the search effectiveness by suggesting useful expansion terms for the query', 'Optimize search results by suggesting meaningful expansion terms to enhance the query', 'Enhance search outcomes by recommending beneficial expansion terms to supplement the query')


In [5]:
# Order the subsets from best to worst MAP and report the top one.
df_sorted = df.sort_values(by='MAP', ascending=False)
best_row = df_sorted.head(1)

print('Highest MAP score:\n')
print(best_row)

# Translate the winning subset number back into its paraphrases.
print('\nSubset:')
best_subset_num = int(df_sorted['Subset'].iloc[0])
print(subset_num_to_paraphrases(best_subset_num))

Highest MAP score:

   Subset   nDCG@10       MAP       MRR
2       2  0.266056  0.125051  0.489842

Subset:
('Improve the search effectiveness by suggesting useful expansion terms for the query',)


In [6]:
# Order the subsets from best to worst MRR and report the top one.
df_sorted = df.sort_values(by='MRR', ascending=False)
best_row = df_sorted.head(1)

print('Highest MRR score:\n')
print(best_row)

# Translate the winning subset number back into its paraphrases.
print('\nSubset:')
best_subset_num = int(df_sorted['Subset'].iloc[0])
print(subset_num_to_paraphrases(best_subset_num))

Highest MRR score:

     Subset   nDCG@10      MAP       MRR
588     588  0.266185  0.12184  0.571897

Subset:
('Improve the search effectiveness by suggesting useful expansion terms for the query', 'Maximize search utility by suggesting relevant expansion phrases for the query', 'Enhance search efficiency by proposing valuable terms to expand the query', 'Increase the search efficacy by offering beneficial expansion keywords for the query', 'Optimize search results by suggesting meaningful expansion terms to enhance the query')


In [17]:
# Avg per subset size

def filter_condition(subset_num, subset_size):
    """Tell whether subset number ``subset_num`` holds exactly ``subset_size`` paraphrases.

    Args:
        subset_num: Subset number, resolved through ``subset_num_to_paraphrases``.
        subset_size: Number of paraphrases the subset must contain.

    Returns:
        True when the subset's length equals ``subset_size``, otherwise False.
    """
    members = subset_num_to_paraphrases(subset_num)
    return subset_size == len(members)

# Mean of each metric over all subsets containing exactly `subset_size`
# paraphrases, one row per size. The largest size comes from `paraphrases`
# rather than a hard-coded bound, so the table stays complete if the list of
# paraphrases grows or shrinks.
all_averages = []

for subset_size in range(1, len(paraphrases) + 1):
    # Bind the current size as a default argument so the lambda carries its
    # own copy instead of looking up the loop variable when it is called.
    has_size = df["Subset"].apply(lambda num, size=subset_size: filter_condition(num, size))
    filtered_df = df[has_size]

    averages = {"Subset size": subset_size}
    for metric in ("nDCG@10", "MAP", "MRR"):
        averages[metric] = filtered_df[metric].mean()
    all_averages.append(averages)

print(pd.DataFrame(all_averages))

   Subset size   nDCG@10       MAP       MRR
0            1  0.257778  0.122727  0.478550
1            2  0.254407  0.120821  0.474098
2            3  0.252986  0.120016  0.473713
3            4  0.252247  0.119658  0.477231
4            5  0.250234  0.119356  0.473783
5            6  0.247159  0.119213  0.471845
6            7  0.244789  0.119017  0.463980
7            8  0.244294  0.118856  0.459407
8            9  0.242155  0.118635  0.452462
9           10  0.240623  0.118414  0.463519


In [20]:
# Avg per paraphrase

def filter_condition(subset_num, paraphrase):
    """Tell whether ``paraphrase`` is a member of subset number ``subset_num``.

    Args:
        subset_num: Subset number, resolved through ``subset_num_to_paraphrases``.
        paraphrase: Paraphrase to look for in that subset.

    Returns:
        True when the paraphrase is in the subset, otherwise False.
    """
    return paraphrase in subset_num_to_paraphrases(subset_num)

# Mean of each metric over all subsets that include a given paraphrase, one
# row per paraphrase. Iterating over `paraphrases` directly (instead of a
# hard-coded index range) keeps the table in step with the list's length.
all_averages = []

for paraphrase in paraphrases:
    # Bind the current paraphrase as a default argument so the lambda carries
    # its own copy instead of looking up the loop variable when it is called.
    contains_paraphrase = df["Subset"].apply(
        lambda num, text=paraphrase: filter_condition(num, text)
    )
    filtered_df = df[contains_paraphrase]

    averages = {"Paraphrase": paraphrase}
    for metric in ("nDCG@10", "MAP", "MRR"):
        averages[metric] = filtered_df[metric].mean()
    all_averages.append(averages)

print(pd.DataFrame(all_averages))

                                          Paraphrase   nDCG@10       MAP  \
0  Improve the search effectiveness by suggesting...  0.247514  0.118850   
1  Recommend expansion terms for the query to imp...  0.249153  0.119355   
2  Improve the search effectiveness by suggesting...  0.251727  0.119402   
3  Maximize search utility by suggesting relevant...  0.247432  0.118499   
4  Enhance search efficiency by proposing valuabl...  0.248368  0.119629   
5  Elevate search performance by recommending rel...  0.249150  0.118980   
6  Boost the search accuracy by providing helpful...  0.247564  0.119698   
7  Increase the search efficacy by offering benef...  0.247129  0.119961   
8  Optimize search results by suggesting meaningf...  0.248490  0.119679   
9  Enhance search outcomes by recommending benefi...  0.249647  0.119387   

        MRR  
0  0.470852  
1  0.465177  
2  0.476066  
3  0.473483  
4  0.473474  
5  0.468137  
6  0.460090  
7  0.469728  
8  0.483019  
9  0.468652  
