In [1]:
!pip install python-terrier
!pip install transformers

import torch
import pyterrier as pt
import requests
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from pathlib import Path

Collecting python-terrier
  Downloading python_terrier-0.13.0-py3-none-any.whl.metadata (11 kB)
Collecting ir-datasets>=0.3.2 (from python-terrier)
  Downloading ir_datasets-0.5.10-py3-none-any.whl.metadata (12 kB)
Collecting wget (from python-terrier)
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pyjnius>=1.4.2 (from python-terrier)
  Downloading pyjnius-1.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting ir-measures>=0.3.1 (from python-terrier)
  Downloading ir_measures-0.3.7-py3-none-any.whl.metadata (7.0 kB)
Collecting pytrec-eval-terrier>=0.5.3 (from python-terrier)
  Downloading pytrec_eval_terrier-0.5.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (777 bytes)
Collecting dill (from python-terrier)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting chest (from python-terrier)
  Downloading chest-0.2.3.tar.gz (9.6 kB)
  Preparing metadata (setup.py

In [2]:
pt.init()

terrier-assemblies 5.11 jar-with-dependencies not found, downloading to /root/.pyterrier...
Done
terrier-python-helper 0.0.8 jar not found, downloading to /root/.pyterrier...
Done


Java started and loaded: pyterrier.java, pyterrier.terrier.java [version=5.11 (build: craig.macdonald 2025-01-13 21:29), helper_version=0.0.8]
java is now started automatically with default settings. To force initialisation early, run:
pt.java.init() # optional, forces java initialisation
  pt.init()


In [3]:
from pathlib import Path

# Load BM25 Index using PyTerrier
dataset = pt.datasets.get_dataset("irds:antique/test/non-offensive")
index = pt.index.IterDictIndexer(
    str(Path.cwd()),
    meta={
        "docno": 32,
        "text": 131072,
    },
    type=pt.index.IndexingType.MEMORY,
).index(dataset.get_corpus_iter())
bm25 = pt.BatchRetrieve(index, wmodel="BM25")

antique/test/non-offensive documents:   0%|          | 0/403666 [00:00<?, ?it/s]

[INFO] Please confirm you agree to the authors' data usage agreement found at <https://ciir.cs.umass.edu/downloads/Antique/readme.txt>
[INFO] If you have a local copy of https://ciir.cs.umass.edu/downloads/Antique/antique-collection.txt, you can symlink it here to avoid downloading it again: /root/.ir_datasets/downloads/684f7015aff377062a758e478476aac8
[INFO] [starting] https://ciir.cs.umass.edu/downloads/Antique/antique-collection.txt

https://ciir.cs.umass.edu/downloads/Antique/antique-collection.txt: 0.0%| 0.00/93.6M [00:00<?, ?B/s][A
https://ciir.cs.umass.edu/downloads/Antique/antique-collection.txt: 0.0%| 8.19k/93.6M [00:00<41:53, 37.2kB/s][A
https://ciir.cs.umass.edu/downloads/Antique/antique-collection.txt: 0.0%| 41.0k/93.6M [00:00<16:53, 92.3kB/s][A
https://ciir.cs.umass.edu/downloads/Antique/antique-collection.txt: 0.1%| 98.3k/93.6M [00:00<10:35, 147kB/s] [A
https://ciir.cs.umass.edu/downloads/Antique/antique-collection.txt: 0.2%| 205k/93.6M [00:00<06:46, 230kB/s] [A
http

In [4]:
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-large")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [5]:

# def generate_reformulated_query(query, instructions):
#     reformulated_queries = []
#     for instruction in instructions:
#         prompt = f"{instruction}: {query}"
#         inputs = tokenizer(prompt, return_tensors="pt")
#         outputs = model.generate(**inputs, max_length=64, num_return_sequences=1)
#         reformulated_query = tokenizer.decode(outputs[0], skip_special_tokens=True)
#         reformulated_queries.append(reformulated_query)
#     return reformulated_queries

def generate_reformulated_query(query, instructions):
    reformulated_queries = []
    for instruction in instructions:
        prompt = f"{instruction}: {query}"
        inputs = tokenizer(prompt, return_tensors="pt")
        outputs = model.generate(**inputs, max_length=64, num_return_sequences=1)
        reformulated_query = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # *** Add this line to remove any potentially problematic colons: ***
        reformulated_query = reformulated_query.replace("?", "").replace('"', '').replace("'", "")
        print(reformulated_query)

        reformulated_queries.append(reformulated_query)
    return reformulated_queries

In [6]:
# Step 2: Retrieve Documents with BM25
def retrieve_documents(query, k=10):
    """Retrieve top-K documents using BM25 in PyTerrier."""
    results = bm25.search(query, k)
    return results[["docno", "score"]].values.tolist()

In [17]:
# Step 4: Ensemble Query Reformulations and Retrieve Documents
def ensemble_retrieve(query, instructions, k=10):
    """Combine multiple reformulated queries for retrieval."""
    reformulated_queries = generate_reformulated_query(query, instructions)
    print("Generated Queries:", reformulated_queries)
    all_results = []
    for reformulated_query in reformulated_queries:
        results = retrieve_documents(reformulated_query, k)
        # print("Retrieved Documents:", results)
        all_results.extend(results)

    aggregated_results = {}
    for docid, score in all_results:
        aggregated_results[docid] = aggregated_results.get(docid, 0) + score

    sorted_results = sorted(aggregated_results.items(), key=lambda x: x[1], reverse=True)
    return sorted_results[:k]

In [13]:
# Step 5: Ablation Study
def ablation_study(query, instructions, k=10):
    """Perform ablation by removing one instruction at a time and measuring impact."""
    full_results = ensemble_retrieve(query, instructions, k)
    full_score = sum(score for _, score in full_results)

    ablation_results = {}
    for i, removed_instruction in enumerate(instructions):
        reduced_instructions = [instr for j, instr in enumerate(instructions) if j != i]
        reduced_results = ensemble_retrieve(query, reduced_instructions, k)
        reduced_score = sum(score for _, score in reduced_results)

        ablation_results[removed_instruction] = full_score - reduced_score

    return ablation_results

def single_instruction_test(query, instructions, k=10):
    """Test each instruction separately and measure its individual impact."""
    instruction_scores = {}

    for instruction in instructions:
        print(f"Testing instruction: {instruction}")
        results = ensemble_retrieve(query, [instruction], k)
        total_score = sum(score for _, score in results)
        instruction_scores[instruction] = total_score

    return instruction_scores


In [11]:
# Example Usage
query = "i want to know how does quantum mechanics work please help i need it please thanks"

# Manually input your paraphrased instructions here
paraphrases = [
    "Improve the search effectiveness by suggesting expansion terms for the query",
    "Recommend expansion terms for the query to improve search results",
    "Improve the search effectiveness by suggesting useful expansion terms for the query",
    "Maximize search utility by suggesting relevant expansion phrases for the query",
    "Enhance search efficiency by proposing valuable terms to expand the query",
    "Elevate search performance by recommending relevant expansion phrases for the query",
    "Boost the search accuracy by providing helpful expansion terms to enrich the query",
    "Increase the search efficacy by offering beneficial expansion keywords for the query",
    "Optimize search results by suggesting meaningful expansion terms to enhance the query",
    "Enhance search outcomes by recommending beneficial expansion terms to supplement the query"
]

In [19]:
# Run the ablation study
# ablation_results = ablation_study(query, paraphrases, k=10)
test = single_instruction_test(query, paraphrases, k=10)
# print("Ablation Study Results:", ablation_results)
# print("Single Instruction Test Results:", test)
for score in test:
    print(score, test[score])

Testing instruction: Improve the search effectiveness by suggesting expansion terms for the query
how does quantum mechanics work
Generated Queries: ['how does quantum mechanics work']
Testing instruction: Recommend expansion terms for the query to improve search results
i want to know how does quantum mechanics work please help i need it please thanks
Generated Queries: ['i want to know how does quantum mechanics work please help i need it please thanks']
Testing instruction: Improve the search effectiveness by suggesting useful expansion terms for the query
how does quantum mechanics work
Generated Queries: ['how does quantum mechanics work']
Testing instruction: Maximize search utility by suggesting relevant expansion phrases for the query
how does quantum mechanics work
Generated Queries: ['how does quantum mechanics work']
Testing instruction: Enhance search efficiency by proposing valuable terms to expand the query
Quantum mechanics is the study of the properties of matter and th