<a href="https://colab.research.google.com/github/marcospiau/ia368-dd-dl4ir/blob/main/aula05-doc2query/aula05_doc2query_indexing_eval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# This notebook indexes texts and evaluates retrieval results

# Installs and imports

In [None]:
# Report the memory and CPU details of the machine running this notebook.
!free -mh
!lscpu

              total        used        free      shared  buff/cache   available
Mem:           25Gi       589Mi        20Gi       1.0Mi       4.3Gi        24Gi
Swap:            0B          0B          0B
Architecture:                    x86_64
CPU op-mode(s):                  32-bit, 64-bit
Byte Order:                      Little Endian
Address sizes:                   48 bits physical, 48 bits virtual
CPU(s):                          4
On-line CPU(s) list:             0-3
Thread(s) per core:              2
Core(s) per socket:              2
Socket(s):                       1
NUMA node(s):                    1
Vendor ID:                       AuthenticAMD
CPU family:                      23
Model:                           49
Model name:                      AMD EPYC 7B12
Stepping:                        0
CPU MHz:                         2249.998
BogoMIPS:                        4499.99
Hypervisor vendor:               KVM
Virtualization type:             full
L1d cache:              

In [None]:
%%capture
# Fetch the Anserini install script (-nc: do not re-download if the file is
# already present), make it executable, then run it while timing the run.
!wget -nc https://raw.githubusercontent.com/marcospiau/ia368-dd-dl4ir/main/scripts/install_anserini.sh && chmod +x install_anserini.sh && time ./install_anserini.sh

In [None]:
%%capture
!pip install -q ftfy polars toolz cytoolz transformers datasets
!pip install -U t5[gcp,cache-tasks]==0.9.3
!pip install -U jaxlib
!sudo apt install -qq tree htop

# Download corpus and qrels

In [None]:
import datasets
import toolz
import multiprocessing as mp
import pandas as pd
from collections import Counter, defaultdict
import itertools

In [None]:
from transformers import T5Tokenizer

In [None]:
# Load the pretrained 't5-base' tokenizer; `encode` below uses it to count
# the tokens of each document.
tokenizer = T5Tokenizer.from_pretrained('t5-base')

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [None]:
def concat_title_and_text(ex):
    """Join an example's title and text into a single string.

    Args:
        ex: Mapping with 'title' and 'text' entries.

    Returns:
        Dict with the single key 'title_and_text', whose value is the title
        followed by '. ' and the text.
    """
    title, text = ex['title'], ex['text']
    merged = '{}. {}'.format(title, text)
    return dict(title_and_text=merged)

def encode(batch):
    """Compute token lengths for the 'title_and_text' entries of a batch.

    Args:
        batch: Mapping whose 'title_and_text' entry is passed to the
            module-level `tokenizer`.

    Returns:
        Dict with the single key 'lengths', holding the 'length' field of
        the tokenizer output.
    """
    tokenized = tokenizer(batch['title_and_text'], return_length=True)
    return dict(lengths=tokenized['length'])

In [None]:
# Load the 'corpus' split of BeIR/trec-covid and add the `title_and_text`
# column produced by `concat_title_and_text`.
ds_corpus = (
    datasets.load_dataset('BeIR/trec-covid', 'corpus')['corpus']
    .map(concat_title_and_text)
)



  0%|          | 0/1 [00:00<?, ?it/s]



In [None]:
# Token count per document, computed in batches with one worker process per CPU.
ds_lengths = ds_corpus.map(encode, batched=True, num_proc=mp.cpu_count())
token_lengths = pd.Series(ds_lengths['lengths'])



In [None]:
# Sanity check: tokenize only the first document and display the raw tokenizer output.
tokenizer(ds_corpus.select(range(1))['title_and_text'])

{'input_ids': [[14067, 753, 13, 1543, 18, 1409, 1926, 499, 509, 21178, 9, 30195, 15, 13315, 44, 2671, 28508, 9, 702, 172, 636, 4457, 6, 1022, 26, 26, 9, 107, 6, 11279, 13849, 5, 3, 10539, 683, 14196, 8087, 10, 100, 29825, 5059, 1132, 8788, 8, 29969, 1863, 11, 3739, 753, 13, 1283, 1221, 28, 1543, 18, 1409, 1926, 499, 509, 21178, 9, 30195, 15, 13315, 44, 2671, 28508, 9, 702, 172, 636, 4457, 6, 1022, 26, 26, 9, 107, 6, 11279, 13849, 5, 3, 24506, 6299, 3592, 10, 18027, 28, 1465, 283, 5, 30195, 15, 9757, 45, 19944, 19622, 7, 45, 1762, 6622, 190, 1882, 6260, 130, 4313, 190, 8, 5893, 6420, 6427, 3187, 5, 15054, 7, 13, 1221, 130, 9112, 5, 3, 12200, 4254, 4578, 10, 1283, 1221, 130, 4313, 6, 5400, 13642, 15967, 6210, 13, 4068, 831, 7209, 5, 1377, 13315, 14156, 15967, 6210, 130, 573, 18, 9, 75, 1169, 1271, 5, 37, 7952, 4161, 66, 1246, 1637, 68, 47, 167, 1017, 16, 9806, 7, 6918, 15967, 6210, 11, 554, 18, 6646, 502, 4743, 15967, 6210, 5, 94, 6935, 215, 18, 7775, 68, 47, 167, 1017, 16, 8, 1590, 41, 

In [None]:
# Summary statistics of the per-document token lengths, rounded to 2 decimals.
token_lengths.describe().to_frame().round(2)

Unnamed: 0,0
count,171332.0
mean,266.93
std,254.57
min,3.0
25%,57.0
50%,268.0
75%,406.0
max,42655.0


In [None]:
# Count documents with more than 512 tokens (True) versus at most 512 (False).
token_lengths.gt(512).value_counts().to_frame()

Unnamed: 0,0
False,154953
True,16379


## Convert qrels to TREC format

In [None]:
# Load the Hugging Face qrels dataset and save it in TREC format

In [None]:
# Load the 'test' split of the BeIR/trec-covid-qrels dataset and display its summary.
ds_qrels = datasets.load_dataset('BeIR/trec-covid-qrels')['test']
ds_qrels



  0%|          | 0/1 [00:00<?, ?it/s]

Dataset({
    features: ['query-id', 'corpus-id', 'score'],
    num_rows: 66336
})

In [None]:
# Sorted (query-id, corpus-id) string pairs from the loaded qrels; the Counter
# lists the 10 most frequent pairs, which reveals duplicated judgements if any.
qrels_df = ds_qrels.to_pandas()
a = sorted(
    (str(qid), str(docid))
    for qid, docid in zip(qrels_df['query-id'], qrels_df['corpus-id'])
)
Counter(a).most_common(10)

[(('1', '005b2j4b'), 1),
 (('1', '00fmeepz'), 1),
 (('1', '0194oljo'), 1),
 (('1', '021q9884'), 1),
 (('1', '02f0opkr'), 1),
 (('1', '02say5f1'), 1),
 (('1', '0376d6vf'), 1),
 (('1', '047xpt2c'), 1),
 (('1', '04ftw7k9'), 1),
 (('1', '05vx82oo'), 1)]

In [None]:
# Build `b` here (sorted (query-id, doc-id) string pairs from the qrels file
# under tools/topics-and-qrels) so this comparison does not rely on a later
# cell having been executed first.
b = sorted(
    pd.read_csv('tools/topics-and-qrels/qrels.beir-v1.0.0-trec-covid.test.txt',
                sep=' ', header=None)
    .apply(lambda x: (str(x[0]), str(x[2])), axis=1)
)
# Pairs found only in `a`, then pairs found only in `b`.
print(set(a) - set(b))
print(set(b) - set(a))

{('38', '9hbib8b3'), ('50', 'svo94kuo')}
set()


In [None]:
# Sorted (query-id, doc-id) string pairs taken from columns 0 and 2 of the
# space-separated, header-less qrels file.
b = sorted(
    pd.read_csv('tools/topics-and-qrels/qrels.beir-v1.0.0-trec-covid.test.txt',
                sep=' ', header=None)
    .apply(lambda row: (str(row[0]), str(row[2])), axis=1)
)


In [None]:
def convert_qrels_to_trec_format(output_file,
                                 dataset_name='BeIR/trec-covid-qrels',
                                 split='test'):
    """Write a qrels dataset to a text file in TREC qrels format.

    Each dataset row becomes one line: `<query-id> 0 <corpus-id> <score>`.

    Args:
        output_file (str): Path of the file to write (overwritten if it exists).
        dataset_name (str): Identifier passed to `datasets.load_dataset`.
        split (str): Split of the loaded dataset to export.
    """
    def to_line(ex):
        # qid 0 docid score
        return '{} 0 {} {}\n'.format(ex['query-id'], ex['corpus-id'],
                                     ex['score'])

    ds = datasets.load_dataset(dataset_name)[split]
    with open(output_file, 'w', encoding='utf-8') as f:
        # Iterating the dataset yields one dict per row, so the rows do not
        # need to be materialized as a Python list first.
        f.writelines(to_line(item) for item in ds)

# Output path for the qrels converted to TREC format.
QRELS_FILE = 'trec-covid-qrels_trec_format.txt'

convert_qrels_to_trec_format(QRELS_FILE)
# Sanity check: line count and first lines of the generated file.
!wc -l {QRELS_FILE}
!head {QRELS_FILE}



  0%|          | 0/1 [00:00<?, ?it/s]

66336 trec-covid-qrels_trec_format.txt
1 0 005b2j4b 2
1 0 00fmeepz 1
1 0 g7dhmyyo 2
1 0 0194oljo 1
1 0 021q9884 1
1 0 02f0opkr 1
1 0 047xpt2c 0
1 0 04ftw7k9 0
1 0 pl9ht0d0 0
1 0 05vx82oo 0


## Convert queries to TREC format

## Download pre-built index from pyserini

Following the Pyserini instructions, we will download the pre-built index `beir-v1.0.0-trec-covid.flat`.

In [None]:
from pyserini.index.lucene import IndexReader
from pyserini.search.lucene import LuceneSearcher
from pyserini.util import download_prebuilt_index

In [None]:
# index_reader = IndexReader.from_prebuilt_index('beir-v1.0.0-trec-covid.flat',
#                                                verbose=True)
# index_reader

Attempting to initialize pre-built index beir-v1.0.0-trec-covid.flat.
/root/.cache/pyserini/indexes/lucene-index.beir-v1.0.0-trec-covid.flat.20221116.505594.57b812594b11d064a23123137ae7dade already exists, skipping download.
Initializing beir-v1.0.0-trec-covid.flat...
{'total_terms': 20822821, 'documents': 171331, 'non_empty_documents': 171331, 'unique_terms': 202648}
Index passes consistency checks against pre-built index 'beir-v1.0.0-trec-covid.flat'!


<pyserini.index.lucene._base.IndexReader at 0x7fa0ee8d4910>

In [None]:
# Fetch the pre-built index and keep the value returned by
# `download_prebuilt_index`, which is then printed and listed with `ls`.
PREBUILT_INDEX_DIR = download_prebuilt_index('beir-v1.0.0-trec-covid.flat',
                                             verbose=True)
print(f'PREBUILT_INDEX_DIR is {PREBUILT_INDEX_DIR}')
!ls -lht {PREBUILT_INDEX_DIR}

/root/.cache/pyserini/indexes/lucene-index.beir-v1.0.0-trec-covid.flat.20221116.505594.57b812594b11d064a23123137ae7dade already exists, skipping download.
PREBUILT_INDEX_DIR is /root/.cache/pyserini/indexes/lucene-index.beir-v1.0.0-trec-covid.flat.20221116.505594.57b812594b11d064a23123137ae7dade
total 258M
-rw-rw-r-- 1 2537 2603  340 Nov 16 15:21 _0.fnm
-rw-rw-r-- 1 2537 2603  19M Nov 16 15:21 _0_Lucene90_0.doc
-rw-rw-r-- 1 2537 2603  22M Nov 16 15:21 _0_Lucene90_0.pos
-rw-rw-r-- 1 2537 2603 3.6M Nov 16 15:21 _0_Lucene90_0.tim
-rw-rw-r-- 1 2537 2603  81K Nov 16 15:21 _0_Lucene90_0.tip
-rw-rw-r-- 1 2537 2603  306 Nov 16 15:21 _0_Lucene90_0.tmd
-rw-rw-r-- 1 2537 2603  516 Nov 16 15:21 _0.si
-rw-rw-r-- 1 2537 2603  154 Nov 16 15:21 segments_1
-rw-rw-r-- 1 2537 2603  242 Nov 16 15:21 _0.fdm
-rw-rw-r-- 1 2537 2603 127M Nov 16 15:21 _0.fdt
-rw-rw-r-- 1 2537 2603  12K Nov 16 15:21 _0.fdx
-rw-rw-r-- 1 2537 2603 1.4M Nov 16 15:21 _0_Lucene90_0.dvd
-rw-rw-r-- 1 2537 2603  133 Nov 16 15:21 _0_Luc

# Validating Anserini regression results on BEIR TREC-COVID

We will first attempt to replicate Anserini's results for TREC-COVID. Essentially, we will follow the instructions provided in [Anserini Regressions: BEIR (v1.0.0) — TREC-COVID](https://github.com/castorini/anserini/blob/master/docs/regressions-beir-v1.0.0-trec-covid-flat.md).

Another important file is this YAML, which contains information about evaluation and index building: https://github.com/castorini/anserini/blob/master/src/main/resources/regression/beir-v1.0.0-trec-covid-flat.yaml

In [None]:
import shlex
import subprocess

TREC_EVAL_BIN_PATH = './tools/eval/trec_eval.9.0.4/trec_eval'
# This function was written using GitHub Copilot
def get_trec_eval_metrics(flags, qrels_path, results_path):
    """Runs trec_eval and returns the results as a dictionary.

    Args:
        flags (str): Flags to pass to trec_eval.
        qrels_path (str): Path to the qrels file.
        results_path (str): Path to the results file.

    Returns:
        Dict[str, str]: A dictionary mapping metric names to their values.
            Values are the raw strings printed by trec_eval; no numeric
            conversion is applied. If a metric name appears on several
            output lines, the last one wins.

    Raises:
        subprocess.CalledProcessError: If trec_eval exits with a non-zero
            status.
    """
    output = subprocess.check_output([
        TREC_EVAL_BIN_PATH,
        qrels_path,
        results_path,
        *shlex.split(flags)
    ]).decode('utf-8')
    metrics = {}
    for line in output.splitlines():
        # Each useful line has at least three whitespace-separated columns:
        # the first is the metric name and the third is its value.
        fields = line.split()
        if len(fields) < 3:
            # Blank or truncated line: skip it instead of raising IndexError.
            continue
        metrics[fields[0]] = fields[2]
    return metrics
# https://github.com/castorini/anserini/blob/master/docs/regressions-beir-v1.0.0-trec-covid-flat.md
# https://github.com/castorini/anserini/blob/master/src/main/resources/regression/beir-v1.0.0-trec-covid-flat.yaml



In [None]:
!anserini/target/appassembler/bin/SearchCollection \
-index /root/.cache/pyserini/indexes/lucene-index.beir-v1.0.0-trec-covid.flat.20221116.505594.57b812594b11d064a23123137ae7dade \
-topics anserini/src/main/resources/topics-and-qrels/topics.beir-v1.0.0-trec-covid.test.tsv.gz \
-topicreader TsvString \
-output runs/run.beir-v1.0.0-trec-covid-flat.bm25.topics.beir-v1.0.0-trec-covid.test.txt \
-bm25 -removeQuery -hits 1000

2023-04-09 05:27:03,275 INFO  [main] search.SearchCollection (SearchCollection.java:951) - Index: /root/.cache/pyserini/indexes/lucene-index.beir-v1.0.0-trec-covid.flat.20221116.505594.57b812594b11d064a23123137ae7dade
2023-04-09 05:27:03,539 INFO  [main] search.SearchCollection (SearchCollection.java:955) - Fields: []
2023-04-09 05:27:03,540 INFO  [main] search.SearchCollection (SearchCollection.java:695) - Using DefaultEnglishAnalyzer
2023-04-09 05:27:03,541 INFO  [main] search.SearchCollection (SearchCollection.java:696) - Stemmer: porter
2023-04-09 05:27:03,543 INFO  [main] search.SearchCollection (SearchCollection.java:697) - Keep stopwords? false
2023-04-09 05:27:03,545 INFO  [main] search.SearchCollection (SearchCollection.java:698) - Stopwords file: null
2023-04-09 05:27:03,583 INFO  [main] search.SearchCollection (SearchCollection.java:1230) - runtag: Anserini
2023-04-09 05:27:08,573 INFO  [pool-2-thread-1] search.SearchCollection$SearcherThread (SearchCollection.java:883) - ra

In [None]:
# Preview the first lines of the run file produced by the SearchCollection cell.
!head runs/run.beir-v1.0.0-trec-covid-flat.bm25.topics.beir-v1.0.0-trec-covid.test.txt

1 Q0 dv9m19yk 1 7.285900 Anserini
1 Q0 0paafp5j 2 6.214600 Anserini
1 Q0 96zsd27n 3 6.214599 Anserini
1 Q0 hmvo5b0q 4 6.163500 Anserini
1 Q0 1ij25a7u 5 5.838400 Anserini
1 Q0 5d7zien3 6 5.726000 Anserini
1 Q0 xqqn1t4e 7 5.661700 Anserini
1 Q0 fqs40ivc 8 5.650400 Anserini
1 Q0 iohvj16d 9 5.638100 Anserini
1 Q0 dckuhrlf 10 5.628400 Anserini


In [None]:
#  -c: Average over the complete set of queries in the relevance judgements
#      instead of the queries in the intersection of relevance judgements
#      and results.  Missing queries will contribute a value of 0 to all
#      evaluation measures (which may or may not be reasonable for a
#      particular evaluation measure, but is reasonable for standard TREC
#      measures.) Default is off.
# The qrels path comes from QRELS_FILE (the file written by
# convert_qrels_to_trec_format) instead of repeating the file name literal.
get_trec_eval_metrics(
    '-c -m ndcg_cut.10',
    QRELS_FILE,
    'runs/run.beir-v1.0.0-trec-covid-flat.bm25.topics.beir-v1.0.0-trec-covid.test.txt',
)

{'ndcg_cut_10': '0.5947'}

The `ndcg_cut_10` value obtained (0.5947) matches the expected one.