In [8]:
!pip install python-terrier
!pip install transformers

import torch
import pyterrier as pt
import requests
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, T5Tokenizer
from pathlib import Path



In [10]:
pt.init()

RuntimeError: You can only call <function legacy_init at 0x0000026E302E6980> before either you start using java or call pt.java.init()

In [11]:
from pathlib import Path

# Load BM25 Index using PyTerrier
DATASET_NAME = "irds:beir/webis-touche2020"

# Load Dataset
dataset = pt.datasets.get_dataset(DATASET_NAME)
queries = dataset.get_topics()
qrels = dataset.get_qrels()

There are multiple query fields available: ('text', 'description', 'narrative'). To use with pyterrier, provide variant or modify dataframe to add query column.


In [12]:
# index = pt.index.IterDictIndexer(
#     str(Path.cwd()),
#     meta={
#         "docno": 39,
#         "text": 131072,
#     },
#     type=pt.index.IndexingType.MEMORY,
# ).index(dataset.get_corpus_iter())

index = pt.IndexFactory.of(str(Path.cwd() / "indices" / "webis_index"))
bm25 = pt.BatchRetrieve(index, wmodel="BM25")


  bm25 = pt.BatchRetrieve(index, wmodel="BM25")


In [29]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-large")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-large")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [93]:
import re

storage = {}

def generate_expansion_terms(query, instructions):
    expansion_terms_list = []
    for instruction in instructions:

        if query + str(instruction) in storage:
            expansion_terms = storage[query + str(instruction)]
            expansion_terms_list += expansion_terms
        else:
            prompt = f"{instruction}. Provide a comma-separated list of only expansion terms (single words) relevant to the query: {query}"
            inputs = tokenizer(prompt, return_tensors="pt")
            outputs = model.generate(**inputs,
                                     do_sample=True,
                                     top_p=0.92,
                                     top_k=200,
                                     repetition_penalty=1.2,
                                     temperature=1.0,
                                     max_new_tokens=50  # Reduce token limit to prevent full sentences
                                     )
            generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

            # Post-process output: split by commas, strip spaces, remove problematic characters
            expansion_terms = [term.strip().replace("?", "").replace('"', '').replace("'", "")
                               for term in generated_text.split(",") if term.strip()]
            storage[query + str(instruction)] = expansion_terms
            expansion_terms_list += expansion_terms

    print(re.sub(r"[^a-zA-Z\s]", "", query + " " + " ".join(expansion_terms_list)))
    return re.sub(r"[^a-zA-Z\s]", "", query + " " + " ".join(expansion_terms_list))

In [94]:
# Step 2: Retrieve Documents with BM25
def retrieve_documents(query, k=10):
    """Retrieve top-K documents using BM25 in PyTerrier."""
    results = bm25.search(query, k)
    return results[["docno", "score"]].values.tolist()

In [116]:
import itertools
import pandas as pd

def evaluate_combinations(paraphrases, k=10):
    """Evaluate retrieval performance for all combinations of paraphrases."""

    results_list = []

    # Generate all subsets of paraphrases
    all_paraphrase_subsets = [subset for L in range(1, len(paraphrases) + 1) for subset in itertools.combinations(paraphrases, L)]

    i = 0
    for subset in all_paraphrase_subsets:
        subset_name = " | ".join(subset)  # For display
        print(f"\n🔹 Evaluating Combination: {subset_name}")
        print(f"\n Subset number {i}")
        i += 1

        # for _, row in queries.iterrows():
        for query in ["Is golf a sport?"]:
            # query = row["text"]
            # qid = row["qid"]
            qid = 1

            # Generate reformulated queries using the current subset of paraphrases
            reformed_query = generate_expansion_terms(query, subset)

            results = (retrieve_documents(reformed_query, k))

            # Aggregate scores
            aggregated_results = {}
            for docid, score in results:
                aggregated_results[docid] = aggregated_results.get(docid, 0) + score

            sorted_results = sorted(aggregated_results.items(), key=lambda x: x[1], reverse=True)[:k]

            # Store results for evaluation
            for rank, (docid, score) in enumerate(sorted_results):
                results_list.append({"qid": qid, "docno": docid, "rank": rank + 1, "score": score, "subset": subset_name})

    # Convert results to DataFrame
    results_df = pd.DataFrame(results_list)

    # Compute evaluation metrics for each subset
    print("\n📊 Evaluation Results Per Combination:")
    for subset in all_paraphrase_subsets:
        subset_name = " | ".join(subset)
        subset_df = results_df.loc[results_df["subset"] == subset_name].copy()
        eval_metrics = pt.Evaluate(subset_df, qrels, metrics=["map", "recip_rank", "ndcg_cut_10"])
        print(f"\n🔹 Results for '{subset_name}': {eval_metrics}")

In [106]:
# Manually input your paraphrased instructions here
paraphrases = [
    "Improve the search effectiveness by suggesting expansion terms for the query",
    "Recommend expansion terms for the query to improve search results",
    "Improve the search effectiveness by suggesting useful expansion terms for the query",
    "Maximize search utility by suggesting relevant expansion phrases for the query",
    "Enhance search efficiency by proposing valuable terms to expand the query",
    "Elevate search performance by recommending relevant expansion phrases for the query",
    "Boost the search accuracy by providing helpful expansion terms to enrich the query",
    "Increase the search efficacy by offering beneficial expansion keywords for the query",
    "Optimize search results by suggesting meaningful expansion terms to enhance the query",
    "Enhance search outcomes by recommending beneficial expansion terms to supplement the query"
]

In [117]:
evaluate_combinations(paraphrases)


🔹 Evaluating Combination: Improve the search effectiveness by suggesting expansion terms for the query

 Subset number 0
Is golf a sport Is golf a sport

🔹 Evaluating Combination: Recommend expansion terms for the query to improve search results

 Subset number 1
Is golf a sport Golf is a sport

🔹 Evaluating Combination: Improve the search effectiveness by suggesting useful expansion terms for the query

 Subset number 2
Is golf a sport golf sport

🔹 Evaluating Combination: Maximize search utility by suggesting relevant expansion phrases for the query

 Subset number 3
Is golf a sport Is golf a sport

🔹 Evaluating Combination: Enhance search efficiency by proposing valuable terms to expand the query

 Subset number 4
Is golf a sport List of governing body for golf

🔹 Evaluating Combination: Elevate search performance by recommending relevant expansion phrases for the query

 Subset number 5
Is golf a sport Is golf a sport

🔹 Evaluating Combination: Boost the search accuracy by providi