In [1]:
!pip install python-terrier
!pip install transformers

import pyterrier as pt
from pathlib import Path



In [2]:
pt.init()

Java started and loaded: pyterrier.java, pyterrier.terrier.java [version=5.11 (build: craig.macdonald 2025-01-13 21:29), helper_version=0.0.8]
java is now started automatically with default settings. To force initialisation early, run:
pt.java.init() # optional, forces java initialisation
  pt.init()


In [3]:
from pathlib import Path

# Load BM25 Index using PyTerrier
DATASET_NAME = "irds:msmarco-passage/trec-dl-2019"

# Load Dataset
dataset = pt.datasets.get_dataset(DATASET_NAME)
queries = dataset.get_topics()
qrels = dataset.get_qrels()

[INFO] Please confirm you agree to the MSMARCO data usage agreement found at <http://www.msmarco.org/dataset.aspx>
[INFO] [starting] https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-test2019-queries.tsv.gz
[INFO] [finished] https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-test2019-queries.tsv.gz: [00:00] [4.28kB] [?B/s]


PermissionError: [WinError 32] The process cannot access the file because it is being used by another process: 'C:\\Users\\Nick Ouwerkerk\\.ir_datasets\\msmarco-passage\\trec-dl-2019\\queries.tsv.tmp5'

In [7]:
print(queries)
print(qrels)

   qid                                              query
0    1                         should teachers get tenure
1    2                   is vaping with e cigarettes safe
2    3                  should insider trading be allowed
3    4      should corporal punishment be used in schools
4    5               should social security be privatized
5    6                    is a college education worth it
6    7  should felons who have completed their sentenc...
7    8                           should abortion be legal
8    9       should students have to wear school uniforms
9   10       should any vaccines be required for children
10  11  should performance enhancing drugs be accepted...
11  12  should birth control pills be available over t...
12  13  can alternative energy effectively replace fos...
13  14          is sexual orientation determined at birth
14  15  should animals be used for scientific or comme...
15  16  should prescription drugs be advertised direct...
16  17        

In [None]:
index = pt.IndexFactory.of(str(Path.cwd() / "indices" / "msmarco_index"))
bm25 = pt.terrier.Retriever(index, wmodel="BM25")

In [9]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-large")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-large")

  from .autonotebook import tqdm as notebook_tqdm
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [10]:
import re

storage = {}

def generate_expansion_terms(query, instructions):
    expansion_terms_list = []
    for instruction in instructions:

        if query + str(instruction) in storage:
            expansion_terms = storage[query + str(instruction)]
            expansion_terms_list += expansion_terms
        else:
            prompt = f"{instruction}. Provide a comma-separated list of only expansion terms (single words) relevant to the query: {query}"
            inputs = tokenizer(prompt, return_tensors="pt")
            outputs = model.generate(**inputs,
                                     do_sample=True,
                                     top_p=0.92,
                                     top_k=200,
                                     repetition_penalty=1.2,
                                     temperature=1.0,
                                     max_new_tokens=50  # Reduce token limit to prevent full sentences
                                     )
            generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

            # Post-process output: split by commas, strip spaces, remove problematic characters
            expansion_terms = [term.strip().replace("?", "").replace('"', '').replace("'", "")
                               for term in generated_text.split(",") if term.strip()]
            
            storage[query + str(instruction)] = expansion_terms
            expansion_terms_list += expansion_terms

    #print(re.sub(r"[^a-zA-Z\s]", "", query + " " + " ".join(expansion_terms_list)))
    return re.sub(r"[^a-zA-Z\s]", "", query + " " + " ".join(expansion_terms_list))

In [None]:
def write_to_file(title, experiment_results, filename):
    with open(f"{filename}.txt", "a") as file:
        file.write(title + '\n')
        
        map = experiment_results["AP"].iloc[0]
        mrr = experiment_results["RR"].iloc[0]
        ndcg = experiment_results["nDCG@10"].iloc[0]
        
        file.write(f'nDCG@10: {ndcg}\n')
        file.write(f'MAP: {map}\n')
        file.write(f'MRR: {mrr}\n')

In [None]:
import itertools
import pandas as pd
from pyterrier.measures import MRR, nDCG, MAP

def evaluate_combinations(paraphrases):
    """Evaluate retrieval performance for all combinations of paraphrases."""

    # Generate all subsets of paraphrases
    all_paraphrase_subsets = [subset for L in range(1, len(paraphrases) + 1) for subset in itertools.combinations(paraphrases, L)]

    i = 0
    for paraphrase_subset in all_paraphrase_subsets:
        #subset_name = " | ".join(paraphrase_subset)  # For display
        #print(f"\n🔹 Evaluating Combination: {subset_name}")

        #add all reformed queries in a list with their qid
        all_reformed_queries_for_subset = []
        
        for _, row in queries.iterrows():
            query = row["query"]
            qid = row["qid"]

            # Generate reformulated queries using the current subset of paraphrases
            reformed_query = generate_expansion_terms(query, paraphrase_subset)
            all_reformed_queries_for_subset.append({"qid": qid, "query": reformed_query})

        queries_df = pd.DataFrame(all_reformed_queries_for_subset)
        subset_evaluation = pt.Experiment(
            [bm25],
            queries_df,
            qrels,
            eval_metrics=[nDCG@10(rel=2), MAP(rel=2), MRR(rel=2)]
        )
        
        write_to_file(f'Subset {i}', subset_evaluation, 'results_msmarco')
        
        print(f"\nSubset {i}")
        print(subset_evaluation)
        i += 1

In [12]:
# Manually input your paraphrased instructions here
paraphrases = [
    "Improve the search effectiveness by suggesting expansion terms for the query",
    "Recommend expansion terms for the query to improve search results",
    "Improve the search effectiveness by suggesting useful expansion terms for the query",
    "Maximize search utility by suggesting relevant expansion phrases for the query",
    "Enhance search efficiency by proposing valuable terms to expand the query",
    "Elevate search performance by recommending relevant expansion phrases for the query",
    "Boost the search accuracy by providing helpful expansion terms to enrich the query",
    "Increase the search efficacy by offering beneficial expansion keywords for the query",
    "Optimize search results by suggesting meaningful expansion terms to enhance the query",
    "Enhance search outcomes by recommending beneficial expansion terms to supplement the query"
]

In [None]:
evaluate_combinations(paraphrases)

baseline
                name   nDCG@10        AP        RR
0  TerrierRetr(BM25)  0.636044  0.309622  0.959184

Subset 0
                name   nDCG@10        AP        RR
0  TerrierRetr(BM25)  0.558978  0.282868  0.880057

Subset 1
                name   nDCG@10        AP        RR
0  TerrierRetr(BM25)  0.612914  0.295566  0.959184

Subset 2
                name  nDCG@10        AP        RR
0  TerrierRetr(BM25)  0.56958  0.275193  0.905649

Subset 3
                name   nDCG@10        AP       RR
0  TerrierRetr(BM25)  0.590145  0.289799  0.94898

Subset 4
                name   nDCG@10        AP        RR
0  TerrierRetr(BM25)  0.578107  0.288156  0.945578

Subset 5
                name   nDCG@10        AP        RR
0  TerrierRetr(BM25)  0.572174  0.281288  0.909945

Subset 6
                name  nDCG@10        AP        RR
0  TerrierRetr(BM25)  0.59248  0.284694  0.899093

Subset 7
                name   nDCG@10        AP        RR
0  TerrierRetr(BM25)  0.590668  0.290125  0.927721