# MIRACL Dataset
This notebook demonstrates how to set up the MIRACL dataset for use in _PyTerrier_. The dataset is available on Hugging Face and comprises two parts:

1. **[miracl/miracl-corpus](https://huggingface.co/datasets/miracl/miracl-corpus)**: Contains the _corpus_ data.

2. **[miracl/miracl](https://huggingface.co/datasets/miracl/miracl)**: Contains the _topics_ and _qrels_.


In [1]:
# Dependencies
# %pip install python-terrier   # PyTerrier
# %pip install datasets         # Hugging Face
# %pip install tqdm             # tqdm progress bars
# %pip install pandas           # pandas

# Upgrading jupyter and ipywidgets was necessary on macOS; it is unclear whether other operating systems need it
# %pip install --upgrade jupyter ipywidgets

In [2]:
# Libraries
import pyterrier as pt
import datasets
import pandas as pd
from tqdm import tqdm

In [3]:
# Initialize PyTerrier
# Skip initialization when PyTerrier reports that it has already been started,
# so re-running this cell in a live kernel is harmless.
pyterrier_ready = pt.started()
if not pyterrier_ready:
    pt.init()

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [4]:
# Load miracl datasets
# Both datasets are loaded for the same language configuration.
# NOTE(review): trust_remote_code=True presumably lets the dataset repository's
# loading script run locally — confirm this is acceptable in your environment.
lang = 'sw'   # choose language
miracl_corpus = datasets.load_dataset(
    'miracl/miracl-corpus',
    lang,
    trust_remote_code=True,
)  # splits: train
miracl_queries = datasets.load_dataset(
    'miracl/miracl',
    lang,
    trust_remote_code=True,
)  # splits: train, dev, testA, testB

In [5]:
# Limit the number of docs and queries (for testing with large datasets)
# def limited_miracl_corpus_iter(limit=1000):
#     count = 0
#     for doc in tqdm(miracl_corpus['train'], desc="Processing Corpus"):
#         if count >= limit:
#             break
#         yield {
#             'docno': doc['docid'], 
#             'title': doc['title'],
#             'text': doc['text']
#         }
#         count += 1

# Preparing queries and qrels for PyTerrier (limited variant; `split` must be defined before running this, it is set in the next cell)
# queries = []
# qrels = []
# for idx, data in enumerate(tqdm(miracl_queries[split], desc="Processing Queries and Qrels")):
#     if idx >= 10:  # Limit to first 10 queries
#         break
#     queries.append({'qid': data['query_id'], 'query': data['query']})
#     for entry in data['positive_passages']:
#         qrels.append({'qid': data['query_id'], 'docno': entry['docid'], 'label': 1})
#     for entry in data['negative_passages']:
#         qrels.append({'qid': data['query_id'], 'docno': entry['docid'], 'label': 0})


In [6]:
# Choose split for queries dataset
# The selected split supplies both the topics and the qrels built below.
# The corpus dataset only has a 'train' split; the queries dataset has four.
# NOTE(review): the test splits may not include relevance judgments — confirm
# before evaluating on them.
split = 'train'   # 'dev', 'train', 'testA', 'testB'

# Corpus iterator
def miracl_corpus_iter():
    """Stream the corpus 'train' split as document dicts for indexing.

    Reads the module-level ``miracl_corpus`` and wraps it in a tqdm progress
    bar labelled "Processing Corpus".

    Yields:
        dict: one record per corpus entry with the keys ``docno`` (taken from
        the entry's ``docid``), ``title`` and ``text``.
    """
    passages = tqdm(miracl_corpus['train'], desc="Processing Corpus")
    for passage in passages:
        # Rename 'docid' to 'docno'; the other fields are passed through as-is.
        yield dict(
            docno=passage['docid'],
            title=passage['title'],
            text=passage['text'],
        )

# Preparing queries and qrels for PyTerrier
# One pass over the chosen split collects the query text and, for each query,
# one qrel row per judged passage: label 1 for positives, label 0 for negatives.
queries = []
qrels = []
for topic in tqdm(miracl_queries[split], desc="Processing Queries and Qrels"):
    qid = topic['query_id']
    queries.append({'qid': qid, 'query': topic['query']})
    # Positives are emitted before negatives, preserving the row order per query.
    for passages_key, label in (('positive_passages', 1), ('negative_passages', 0)):
        qrels.extend(
            {'qid': qid, 'docno': passage['docid'], 'label': label}
            for passage in topic[passages_key]
        )


# Materialize both record lists as DataFrames in a single step each.
queries_df = pd.DataFrame(queries)
qrels_df = pd.DataFrame(qrels)

Processing Queries and Qrels: 100%|██████████| 1901/1901 [00:00<00:00, 22699.86it/s]


In [7]:
# Indexing
# Build the index under ./miracl_index from the streamed corpus dicts;
# overwrite=True replaces whatever index already exists at that path.
# NOTE(review): blocks=True presumably stores term positions, which plain BM25
# does not need — confirm whether it is required.
# NOTE(review): the iterator also yields 'title', but with the indexer's default
# field settings it may not be indexed — confirm against the PyTerrier docs.
index_path = "./miracl_index"
indexer = pt.IterDictIndexer(index_path, blocks=True, overwrite=True)
index_ref = indexer.index(miracl_corpus_iter())

Processing Corpus:   0%|          | 101/131924 [00:00<06:34, 334.31it/s]



Processing Corpus: 100%|██████████| 131924/131924 [00:05<00:00, 23669.25it/s]


14:47:30.068 [ForkJoinPool-1-worker-1] WARN org.terrier.structures.indexing.Indexer - Indexed 129 empty documents


In [8]:
# BM25 Single Query Testing/Debugging
# bm25 = pt.BatchRetrieve(index_ref, wmodel="BM25")
# test_query_df = pd.DataFrame([{'qid': 'test', 'query': "Testx"}])
# test_query_df['query'] = test_query_df['query'].str.replace("x", "omato")
# print(test_query_df)
# test_results = bm25.transform(test_query_df)

In [10]:
# BM25
bm25 = pt.BatchRetrieve(index_ref, wmodel="BM25")

# Preprocessing
# Strip punctuation that otherwise triggers a "Lexical Error" at retrieval time.
# The characters are removed as literals through a translation table, so the
# outcome does not depend on the pandas-version-specific `regex` default of
# `Series.str.replace` (and no regex metacharacter such as '?' is ever
# interpreted as a pattern).
# Add further characters to this string should you encounter any "Lexical Error".
QUERY_CHARS_TO_STRIP = "?'/!"   # question mark, apostrophe, slash, exclamation mark
queries_df['query'] = queries_df['query'].str.translate(
    str.maketrans('', '', QUERY_CHARS_TO_STRIP)
)

# Apply BM25 to preprocessed queries
results = bm25.transform(queries_df)

In [11]:
# Evaluation
# Score the BM25 run against the qrels built earlier and report one row per
# system with one column per metric.
eval_metrics = ['map', 'ndcg']
eval_results = pt.Experiment(
    retr_systems=[bm25],
    topics=queries_df,
    qrels=qrels_df,
    eval_metrics=eval_metrics,
    # perquery=True
)

print(eval_results)

       name       map      ndcg
0  BR(BM25)  0.211017  0.325033
