In [1]:
!pip install python-terrier
!pip install transformers

import pyterrier as pt
from pathlib import Path



In [2]:
# pt.init() is deprecated in current PyTerrier (see the warning captured below);
# start the JVM through pt.java instead, guarded so the cell can be re-run safely.
if not pt.java.started():
    pt.java.init()

Java started and loaded: pyterrier.java, pyterrier.terrier.java [version=5.11 (build: craig.macdonald 2025-01-13 21:29), helper_version=0.0.8]
java is now started automatically with default settings. To force initialisation early, run:
pt.java.init() # optional, forces java initialisation
  pt.init()


In [6]:
from pathlib import Path

# Dataset configuration (retrieved through PyTerrier's dataset registry).
#DATASET_NAME = "irds:beir/webis-touche2020"
DATASET_NAME = "trec-covid"
# Topics and qrels must come from the same TREC-COVID round, so the variant is
# named once and reused for both lookups.
TREC_COVID_ROUND = "round4"

# Load the dataset handle, its topics (queries) and relevance judgements (qrels)
dataset = pt.datasets.get_dataset(DATASET_NAME)
queries = dataset.get_topics(variant=TREC_COVID_ROUND)
qrels = dataset.get_qrels(variant=TREC_COVID_ROUND)

In [7]:
# Quick sanity check of what was loaded: topics first, then relevance judgements.
print(queries, qrels, sep="\n")

   qid                                              query
0    1  coronavirus origin what is the origin of covid...
1    2  coronavirus response to weather changes how do...
2    3  coronavirus immunity will sars cov2 infected p...
3    4  how do people die from the coronavirus what ca...
4    5  animal models of covid 19 what drugs have been...
5    6  coronavirus test rapid testing what types of r...
6    7  serological tests for coronavirus are there se...
7    8  coronavirus under reporting how has lack of te...
8    9  coronavirus in canada how has covid 19 affecte...
9   10  coronavirus social distancing impact has socia...
10  11  coronavirus hospital rationing what are the gu...
11  12  coronavirus quarantine what are best practices...
12  13  how does coronavirus spread what are the trans...
13  14  coronavirus super spreaders what evidence is t...
14  15  coronavirus outside body how long can the coro...
15  16  how long does coronavirus survive on surfaces ...
16  17  corona

In [8]:
# Alternative kept for reference: open an index from ./indices/webis_index
# (presumably a locally built index for the Touché dataset - verify before re-enabling).
#index = pt.IndexFactory.of(str(Path.cwd() / "indices" / "webis_index"))
# Use the "terrier_stemmed" index variant exposed by the dataset object.
index = dataset.get_index(variant="terrier_stemmed")
# BM25 retriever over that index; this is the only system passed to pt.Experiment below.
bm25 = pt.terrier.Retriever(index, wmodel="BM25")



In [6]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, set_seed

# Single source of truth for the checkpoint so tokenizer and model cannot drift apart.
MODEL_NAME = "google/flan-t5-large"

# Expansion terms are sampled (generate(..., do_sample=True)), so fix the RNG seed
# to let a fresh "Restart & Run All" reproduce the same expansions.
SEED = 42
set_seed(SEED)

tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)

  from .autonotebook import tqdm as notebook_tqdm
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [7]:
import re

# Cache of generated expansion terms keyed by (query, instruction), so every
# prompt is sent to the model at most once per kernel session.
storage = {}


def _parse_expansion_terms(generated_text):
    """Turn raw comma-separated model output into a list of cleaned terms."""
    cleaned = (
        piece.replace("?", "").replace('"', "").replace("'", "").strip()
        for piece in generated_text.split(",")
    )
    # Filter after cleaning so pieces made up only of removed characters are dropped too.
    return [term for term in cleaned if term]


def generate_expansion_terms(query, instructions):
    """Expand ``query`` with terms generated once per instruction.

    For each instruction a prompt is built from the instruction and the query,
    the global ``model``/``tokenizer`` generate a comma-separated answer, and the
    parsed terms are cached in ``storage``. Cached entries are reused instead of
    calling the model again.

    Parameters
    ----------
    query : str
        The original query text.
    instructions : iterable
        Instruction prompts; each one contributes its own list of terms.

    Returns
    -------
    str
        The query followed by all expansion terms, space-separated, with every
        character other than ASCII letters and whitespace removed.

    NOTE(review): the final filter also removes digits, so "covid 19" in the
    original query becomes "covid " - confirm that this is intended.
    """
    expansion_terms_list = []
    for instruction in instructions:
        # A tuple key cannot collide the way plain concatenation can
        # ("ab" + "c" and "a" + "bc" are the same string).
        cache_key = (query, str(instruction))

        if cache_key not in storage:
            prompt = f"{instruction}. Provide a comma-separated list of only expansion terms (single words) relevant to the query: {query}"
            inputs = tokenizer(prompt, return_tensors="pt")
            outputs = model.generate(**inputs,
                                     do_sample=True,
                                     top_p=0.92,
                                     top_k=200,
                                     repetition_penalty=1.2,
                                     temperature=1.0,
                                     max_new_tokens=50  # Reduce token limit to prevent full sentences
                                     )
            generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
            storage[cache_key] = _parse_expansion_terms(generated_text)

        expansion_terms_list.extend(storage[cache_key])

    # Keep only letters and whitespace in the text that is handed to the retriever.
    return re.sub(r"[^a-zA-Z\s]", "", query + " " + " ".join(expansion_terms_list))

In [8]:
def write_to_file(title, experiment_results, filename):
    """Append one experiment's headline metrics to ``<filename>.txt``.

    Parameters
    ----------
    title : str
        Heading line written before the metrics.
    experiment_results : pandas.DataFrame
        Result table with "nDCG@10", "AP" and "RR" columns; only the first row
        of each column is read.
    filename : str
        Output path without the ".txt" suffix. The file is opened in append mode.

    Raises
    ------
    KeyError
        If one of the metric columns is missing.
    IndexError
        If the table has no rows.
    """
    # Look up every metric before opening the file, so a failed lookup cannot
    # leave a title line without metrics in the append-only results file.
    ndcg = experiment_results["nDCG@10"].iloc[0]
    mean_ap = experiment_results["AP"].iloc[0]  # not named `map`: that would shadow the builtin
    mrr = experiment_results["RR"].iloc[0]

    record = [title, f'nDCG@10: {ndcg}', f'MAP: {mean_ap}', f'MRR: {mrr}']
    with open(f"{filename}.txt", "a", encoding="utf-8") as file:
        file.write('\n'.join(record) + '\n')

In [9]:
import itertools
import pandas as pd
from pyterrier.measures import MRR, nDCG, MAP

def evaluate_combinations(paraphrases):
    """Score BM25 on expanded queries for every non-empty subset of ``paraphrases``.

    For each subset, every topic in the global ``queries`` frame is expanded with
    ``generate_expansion_terms``, the expanded topics are evaluated against the
    global ``qrels`` with ``pt.Experiment`` (nDCG@10, MAP, MRR), and the result is
    appended to ``results_trec_covid.txt`` and printed. Returns nothing.
    """
    # Enumerate subsets by increasing size: singletons first, then pairs, and so on.
    instruction_subsets = []
    for subset_size in range(1, len(paraphrases) + 1):
        instruction_subsets.extend(itertools.combinations(paraphrases, subset_size))

    for subset_index, instruction_subset in enumerate(instruction_subsets):
        # One record per topic: the original qid paired with its expanded query text.
        expanded_topics = [
            {
                "qid": topic["qid"],
                "query": generate_expansion_terms(topic["query"], instruction_subset),
            }
            for _, topic in queries.iterrows()
        ]
        queries_df = pd.DataFrame(expanded_topics)

        subset_evaluation = pt.Experiment(
            [bm25],
            queries_df,
            qrels,
            eval_metrics=[nDCG@10, MAP, MRR],
        )

        # Persist first, then echo the same result in the notebook output.
        write_to_file(f'Subset {subset_index}', subset_evaluation, 'results_trec_covid')
        print(f"\nSubset {subset_index}")
        print(subset_evaluation)

In [10]:
# Manually input your paraphrased instructions here
# Each string becomes the instruction prefix of the prompt built in generate_expansion_terms.
# evaluate_combinations tries every non-empty subset of this list, so the 10 entries
# below produce 2**10 - 1 = 1023 evaluation runs.
paraphrases = [
    "Improve the search effectiveness by suggesting expansion terms for the query",
    "Recommend expansion terms for the query to improve search results",
    "Improve the search effectiveness by suggesting useful expansion terms for the query",
    "Maximize search utility by suggesting relevant expansion phrases for the query",
    "Enhance search efficiency by proposing valuable terms to expand the query",
    "Elevate search performance by recommending relevant expansion phrases for the query",
    "Boost the search accuracy by providing helpful expansion terms to enrich the query",
    "Increase the search efficacy by offering beneficial expansion keywords for the query",
    "Optimize search results by suggesting meaningful expansion terms to enhance the query",
    "Enhance search outcomes by recommending beneficial expansion terms to supplement the query"
]

In [11]:
# Run the full sweep: each subset's metrics are appended to results_trec_covid.txt and printed below.
evaluate_combinations(paraphrases)


Subset 0
                name  nDCG@10        AP        RR
0  TerrierRetr(BM25)  0.24283  0.120394  0.453432

Subset 1
                name   nDCG@10        AP        RR
0  TerrierRetr(BM25)  0.265432  0.124798  0.470772

Subset 2
                name   nDCG@10        AP        RR
0  TerrierRetr(BM25)  0.266056  0.125051  0.489842

Subset 3
                name   nDCG@10        AP        RR
0  TerrierRetr(BM25)  0.249523  0.119131  0.475298

Subset 4
                name  nDCG@10        AP        RR
0  TerrierRetr(BM25)   0.2697  0.122864  0.507224

Subset 5
                name  nDCG@10        AP       RR
0  TerrierRetr(BM25)  0.24636  0.121128  0.44644

Subset 6
                name   nDCG@10        AP        RR
0  TerrierRetr(BM25)  0.238524  0.123804  0.436492

Subset 7
                name   nDCG@10        AP        RR
0  TerrierRetr(BM25)  0.260708  0.124354  0.481272

Subset 8
                name  nDCG@10        AP        RR
0  TerrierRetr(BM25)  0.26432  0.122186  0.526689

S