# Information Retrieval Research Project Experiment

Group 14


Install all required dependencies

In [1]:
!pip install python-terrier==0.10.0
!pip install transformers
!pip install tokenizers
!pip install torch
!pip install nltk



Setup PyTerrier and other dependencies.

In [2]:
import datetime
import pyterrier as pt
from pyterrier.measures import RR, nDCG, MAP, P, R
import pandas as pd
import torch
import string
from transformers import BertTokenizer, BertForMaskedLM
from pathlib import Path

if not pt.started():
    pt.init(
        tqdm="notebook", boot_packages=["com.github.terrierteam:terrier-prf:-SNAPSHOT"]
    )

# Download NLTK stopwords
import nltk
nltk.download('stopwords')

# Define your BERT model and tokenizer
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForMaskedLM.from_pretrained(model_name)

# Stopwords and punctuation set
stopwords_set = set(nltk.corpus.stopwords.words('english')).union(set(['##s']))
punctuation = set(string.punctuation)

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/martijn/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequence

In [3]:
# Load VASWANI dataset
vaswani = pt.get_dataset("vaswani")
vaswani_index = vaswani.get_index(variant="terrier_stemmed")

In [4]:
# Load ANTIQUE dataset
antique = pt.get_dataset("antique")
antique_index_path = Path("antique_index").absolute()

if not (antique_index_path / "data.properties").is_file():
    antique_index = pt.index.IterDictIndexer(str(antique_index_path), meta={"docno": 32, "text": 131072},).index(antique.get_corpus_iter())
else:
    antique_index = pt.IndexRef.of(str(antique_index_path / "data.properties"))

In [5]:
# Load MSMARCO documents v2 DL 2021 dataset
# WARNING: This downloads several GB's of data!
msmarco_v2 = pt.get_dataset("irds:msmarco-document-v2/trec-dl-2021") # load qrels and the rest
msmarco_v2_index = pt.get_dataset('msmarcov2_document').get_index('terrier_stemmed') # load index

Our custom BERT-based contextual re-write functions

In [6]:
def bert_top_k(query: str, k):
    """
      Rewrite the query at once
    """
    # Tokenize the query. Adding [MASK] [PAD] seemed to give the best results.
    tokenized_query = tokenizer(query + "[MASK] [PAD]", return_tensors="pt", padding=True, truncation=True)
    input_ids = tokenized_query["input_ids"]

    # Find the position of the last token (excluding padding)
    last_token_position = len(input_ids[0]) - 1
    while input_ids[0][last_token_position] == tokenizer.pad_token_id or input_ids[0][
        last_token_position] == tokenizer.sep_token_id:
        last_token_position -= 1

    # Insert the [MASK] token at the position of the last token
    input_ids[0][last_token_position] = tokenizer.mask_token_id

    # Get the logits for the masked token
    with torch.no_grad():
        outputs = model(input_ids)
        predictions = outputs.logits[0, last_token_position]

    # Get the top k predicted tokens
    top_k_tokens = torch.topk(predictions, k + 6, dim=-1).indices.tolist()

    # Decode the tokens to get the expanded query
    expanded_query = query
    i = 0
    for token_id in top_k_tokens:
        if i >= k:
            break
        token = tokenizer.convert_ids_to_tokens(token_id)
        if (token.lower() in stopwords_set or token in punctuation or token in query):
            continue

        # Filter out problematic tokens which cause parser errors in the experiment
        if '#' in token:
            continue

        expanded_query = expanded_query + " " + token
        i += 1

    return expanded_query


def bert_top_k_incremental(query, k):
    """
      Incremental top-k rewriting
    """
    if k > 1:
        return bert_top_k_incremental(bert_top_k(query, 1), k - 1)
    elif k == 1:
        return bert_top_k(query, 1)
    else:
        return query

In [26]:
def bert_top_5(query: str):
  return bert_top_k(query, 5)

def run_experiment_on_dataset(dataset, index, variant=None, include_incremental_bert=False, parallel=False):
  """Run our complete experiment on a given dataset"""
  print(f"Starting experiment: {datetime.datetime.now()}")

  if parallel:
      bm25 = pt.BatchRetrieve(
          index,
          wmodel="BM25",
          # metadata=["docno", "text"],
          # properties={"termpipelines": ""},
          # controls={"qe": "off"}
      )
      rm3 = bm25 >> pt.rewrite.RM3(index) >> bm25
      bert_top_1 = (pt.apply.query(lambda q: bert_top_k(q["query"], 1)) >> bm25).parallel(4)
      bert_top_5 = (pt.apply.query(lambda q: bert_top_k(q["query"], 5)) >> bm25).parallel(4)
      # bert_top_k and bert_top_k_incremental with k=1 are the same
      bert_inc_5 = (pt.apply.query(lambda q: bert_top_k_incremental(q["query"], 3)) >> bm25).parallel(4)
  else:
      bm25 = pt.BatchRetrieve(
          index,
          wmodel="BM25",
          # metadata=["docno", "text"],
          # properties={"termpipelines": ""},
          # controls={"qe": "off"}
      )
      rm3 = bm25 >> pt.rewrite.RM3(index) >> bm25
      bert_top_1 = pt.apply.query(lambda q: bert_top_k(q["query"], 1)) >> bm25
      bert_top_5 = pt.apply.query(lambda q: bert_top_k(q["query"], 5)) >> bm25
      # bert_top_k and bert_top_k_incremental with k=1 are the same
      bert_inc_5 = pt.apply.query(lambda q: bert_top_k_incremental(q["query"], 5)) >> bm25

  if variant is None:
    topics = dataset.get_topics()
    qrels = dataset.get_qrels()
  else:
    topics = dataset.get_topics(variant=variant)
    qrels = dataset.get_qrels(variant=variant)

  if include_incremental_bert:
    experiments = [bm25, rm3, bert_top_1, bert_top_5, bert_inc_5]
    experiment_names = ["No rewriting", "RM3", "BERT k=1", "BERT k=5", "BERT k=5 (incremental)"]
  else:
    experiments = [bm25, rm3, bert_top_1, bert_top_5]
    experiment_names = ["No rewriting", "RM3", "BERT k=1", "BERT k=5"]

  exp = pt.Experiment(
    experiments,
    topics,
    qrels,
    names=experiment_names,
    eval_metrics=[nDCG @ 5, nDCG @ 10, nDCG @ 20, MAP, P @ 5, P @ 10, R @ 1000],
  )

  print(f"Experiment done at: {datetime.datetime.now()}")
  return exp


In [29]:
# Try out our experiment on the vaswani dataset
run_experiment_on_dataset(vaswani, vaswani_index, variant=None, include_incremental_bert=True, parallel=True)

Starting experiment: 2024-04-04 13:12:56.612958


PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.
  warn("Avoiding reinit of PyTerrier")


Experiment done at: 2024-04-04 13:13:38.238890


Unnamed: 0,name,nDCG@5,nDCG@10,nDCG@20,AP,P@5,P@10,R@1000
0,No rewriting,0.510038,0.446609,0.415377,0.296517,0.460215,0.352688,0.934607
1,RM3,0.483316,0.436146,0.403416,0.293326,0.453763,0.363441,0.935456
2,BERT k=1,0.49052,0.435451,0.41073,0.286219,0.443011,0.346237,0.933239
3,BERT k=5,0.433464,0.385795,0.369906,0.254868,0.393548,0.309677,0.922747
4,BERT k=5 (incremental),0.473046,0.427693,0.404811,0.280716,0.425806,0.343011,0.931279


In [28]:
run_experiment_on_dataset(antique, antique_index, variant="test", include_incremental_bert=True, parallel=True)

Starting experiment: 2024-04-04 13:01:46.962680


PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.
PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.
PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.
  warn("Avoiding reinit of PyTerrier")
PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.
  warn("Avoiding reinit of PyTerrier")
  warn("Avoiding reinit of PyTerrier")
  warn("Avoiding reinit of PyTerrier")
PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 1

Experiment done at: 2024-04-04 13:12:56.593127


Unnamed: 0,name,nDCG@5,nDCG@10,nDCG@20,AP,P@5,P@10,R@1000
0,No rewriting,0.529428,0.510402,0.478976,0.451778,0.836,0.747,0.788732
1,RM3,0.509113,0.488397,0.45584,0.422948,0.824,0.7145,0.780057
2,BERT k=1,0.498607,0.479944,0.452005,0.421057,0.781,0.6965,0.782713
3,BERT k=5,0.43672,0.417257,0.394777,0.347565,0.666,0.5865,0.766317
4,BERT k=5 (incremental),0.49425,0.475563,0.448105,0.416377,0.773,0.689,0.782574


In [31]:
run_experiment_on_dataset(msmarco_v2, msmarco_v2_index, variant=None, include_incremental_bert=True, parallel=True)

Starting experiment: 2024-04-04 13:43:29.126838
13:43:31.513 [main] WARN org.terrier.structures.BaseCompressingMetaIndex - Structure meta reading data file directly from disk (SLOW) - try index.meta.data-source=fileinmem in the index properties file. 483.7 MiB of memory would be required.
13:43:34.008 [main] WARN org.terrier.structures.BaseCompressingMetaIndex - Structure meta reading data file directly from disk (SLOW) - try index.meta.data-source=fileinmem in the index properties file. 483.7 MiB of memory would be required.


PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.
No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.
PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.
PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.
  warn("Avoiding reinit of PyTerrier")
  warn("Avoiding reinit of PyTerrier")
  warn("Avoiding reinit of PyTerrier")
  warn("Avoiding reinit of PyTerrier")


14:00:58.134 [main] WARN org.terrier.structures.BaseCompressingMetaIndex - Structure meta reading data file directly from disk (SLOW) - try index.meta.data-source=fileinmem in the index properties file. 483.7 MiB of memory would be required.
14:00:58.188 [main] WARN org.terrier.structures.BaseCompressingMetaIndex - Structure meta reading data file directly from disk (SLOW) - try index.meta.data-source=fileinmem in the index properties file. 483.7 MiB of memory would be required.
14:00:58.779 [main] WARN org.terrier.structures.BaseCompressingMetaIndex - Structure meta reading data file directly from disk (SLOW) - try index.meta.data-source=fileinmem in the index properties file. 483.7 MiB of memory would be required.
14:01:00.268 [main] WARN org.terrier.structures.BaseCompressingMetaIndex - Structure meta reading data file directly from disk (SLOW) - try index.meta.data-source=fileinmem in the index properties file. 483.7 MiB of memory would be required.
14:03:12.423 [main] WARN org.ter

Unnamed: 0,name,nDCG@5,nDCG@10,nDCG@20,AP,P@5,P@10,R@1000
0,No rewriting,0.556317,0.545501,0.534505,0.345606,0.775439,0.722807,0.692405
1,RM3,0.587513,0.582401,0.575937,0.419439,0.789474,0.757895,0.748634
2,BERT k=1,0.516885,0.505544,0.499639,0.325391,0.729825,0.677193,0.683929
3,BERT k=5,0.408838,0.405863,0.400804,0.248799,0.540351,0.524561,0.626486
4,BERT k=5 (incremental),0.486904,0.473154,0.471939,0.305974,0.680702,0.629825,0.671443
