In [1]:
import pyterrier as pt
from pathlib import Path

In [None]:
!export JAVA_HOME=/opt/homebrew/Cellar/openjdk/23.0.2/libexec/openjdk.jdk/Contents/Home
!export PATH=$JAVA_HOME/bin:$PATH
pt.java.set_java_home("/opt/homebrew/Cellar/openjdk/23.0.2/libexec/openjdk.jdk/Contents/Home")

In [None]:
pt.init()

Java started and loaded: pyterrier.java, pyterrier.terrier.java [version=5.11 (build: craig.macdonald 2025-01-13 21:29), helper_version=0.0.8]
java is now started automatically with default settings. To force initialisation early, run:
pt.java.init() # optional, forces java initialisation
  pt.init()


In [None]:
from pathlib import Path

# Load BM25 Index using PyTerrier
DATASET_NAME = "irds:msmarco-passage/trec-dl-2019"

# Load Dataset
dataset = pt.datasets.get_dataset(DATASET_NAME)
queries = dataset.get_topics()
qrels = dataset.get_qrels()

In [None]:
print(queries)
print(qrels)

         qid                                              query
0    1108939                  what slows down the flow of blood
1    1112389             what is the county for grand rapids mn
2     792752                                     what is ruclip
3    1119729  what do you do when you have a nosebleed from ...
4    1105095                  where is sugar lake lodge located
..       ...                                                ...
195   146187  difference between a mcdouble and a double che...
196   634428                            what does chs stand for
197  1121986     what are the effects of having low blood sugar
198   321441                 how much is a us postal stamp cost
199   532603                   university of dubuque enrollment

[200 rows x 2 columns]
          qid    docno  label iteration
0       19335  1017759      0        Q0
1       19335  1082489      0        Q0
2       19335   109063      0        Q0
3       19335  1160863      0        Q0
4       

In [None]:
index = pt.IndexFactory.of(str(Path.cwd() / "indices" / "msmarco_index"))
bm25 = pt.terrier.Retriever(index, wmodel="BM25")

16:05:00.279 [main] WARN org.terrier.structures.BaseCompressingMetaIndex -- Structure meta reading data file directly from disk (SLOW) - try index.meta.data-source=fileinmem in the index properties file. 2 GiB of memory would be required.


In [None]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Set up device for MPS if available, otherwise CPU
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
print(device)

tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-large")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-large")
model.to(device)  # Move the model to MPS or CPU

  from .autonotebook import tqdm as notebook_tqdm


mps


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


T5ForConditionalGeneration(
  (shared): Embedding(32128, 1024)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 1024)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=1024, out_features=1024, bias=False)
              (k): Linear(in_features=1024, out_features=1024, bias=False)
              (v): Linear(in_features=1024, out_features=1024, bias=False)
              (o): Linear(in_features=1024, out_features=1024, bias=False)
              (relative_attention_bias): Embedding(32, 16)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=1024, out_features=2816, bias=False)
              (wi_1): Linear(in_features=1024, out_features=2816, bias=False)
       

In [None]:
import re

storage = {}

def generate_expansion_terms(query, instructions):
    expansion_terms_list = []
    for instruction in instructions:

        if query + str(instruction) in storage:
            expansion_terms = storage[query + str(instruction)]
            expansion_terms_list += expansion_terms
        else:
            prompt = f"{instruction}. Provide a comma-separated list of only expansion terms (single words) relevant to the query: {query}"
            inputs = tokenizer(prompt, return_tensors="pt").to(device)
            outputs = model.generate(**inputs,
                                     do_sample=True,
                                     top_p=0.92,
                                     top_k=200,
                                     repetition_penalty=1.2,
                                     temperature=1.0,
                                     max_new_tokens=50  # Reduce token limit to prevent full sentences
                                     )
            generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

            # Post-process output: split by commas, strip spaces, remove problematic characters
            expansion_terms = [term.strip().replace("?", "").replace('"', '').replace("'", "")
                               for term in generated_text.split(",") if term.strip()]
            
            storage[query + str(instruction)] = expansion_terms
            expansion_terms_list += expansion_terms

    #print(re.sub(r"[^a-zA-Z\s]", "", query + " " + " ".join(expansion_terms_list)))
    return re.sub(r"[^a-zA-Z\s]", "", query + " " + " ".join(expansion_terms_list))

In [None]:
def write_to_file(title, experiment_results, filename):
    with open(f"{filename}.txt", "a") as file:
        file.write(title + '\n')
        
        map = experiment_results["AP(rel=2)"].iloc[0]
        mrr = experiment_results["RR(rel=2)"].iloc[0]
        ndcg = experiment_results["nDCG@10"].iloc[0]
        
        file.write(f'nDCG@10: {ndcg}\n')
        file.write(f'MAP: {map}\n')
        file.write(f'MRR: {mrr}\n')

In [None]:
import itertools
import pandas as pd
from pyterrier.measures import MRR, nDCG, MAP
from pyterrier.measures import *
from tqdm import tqdm

def evaluate_combinations(paraphrases):
    """Evaluate retrieval performance for all combinations of paraphrases."""
    
    # Generate all subsets of paraphrases
    all_paraphrase_subsets = [
        subset for L in range(1, len(paraphrases) + 1)
        for subset in itertools.combinations(paraphrases, L)
    ]
    
    # Use tqdm to track progress of evaluating each subset
    for i, paraphrase_subset in enumerate(tqdm(all_paraphrase_subsets, desc="Evaluating paraphrase combinations")):
        # Optionally, for display purposes:
        # subset_name = " | ".join(paraphrase_subset)
        # print(f"\n🔹 Evaluating Combination: {subset_name}")

        # Collect all reformed queries for the current subset
        all_reformed_queries_for_subset = []
        for _, row in queries.iterrows():
            query = row["query"]
            qid = row["qid"]

            # Generate reformulated queries using the current subset of paraphrases
            reformed_query = generate_expansion_terms(query, paraphrase_subset)
            all_reformed_queries_for_subset.append({"qid": qid, "query": reformed_query})

        queries_df = pd.DataFrame(all_reformed_queries_for_subset)
        subset_evaluation = pt.Experiment(
            [bm25],
            queries_df,
            qrels,
            eval_metrics=[nDCG@10, MAP(rel=2), MRR(rel=2)]
        )
        
        print(subset_evaluation)

        write_to_file(f'Subset {i}', subset_evaluation, 'results_msmarco')
        
        print(f"\nSubset {i}")
        print(subset_evaluation)

In [None]:
# Manually input your paraphrased instructions here
paraphrases = [
    "Improve the search effectiveness by suggesting expansion terms for the query",
    "Recommend expansion terms for the query to improve search results",
    "Improve the search effectiveness by suggesting useful expansion terms for the query",
    "Maximize search utility by suggesting relevant expansion phrases for the query",
    "Enhance search efficiency by proposing valuable terms to expand the query",
    "Elevate search performance by recommending relevant expansion phrases for the query",
    "Boost the search accuracy by providing helpful expansion terms to enrich the query",
    "Increase the search efficacy by offering beneficial expansion keywords for the query",
    "Optimize search results by suggesting meaningful expansion terms to enhance the query",
    "Enhance search outcomes by recommending beneficial expansion terms to supplement the query"
]

In [None]:
evaluate_combinations(paraphrases)

Evaluating paraphrase combinations:   0%|          | 0/1023 [01:33<?, ?it/s]

                name   nDCG@10  AP(rel=2)  RR(rel=2)
0  TerrierRetr(BM25)  0.478083   0.268203   0.662612





KeyError: 'MAP'