# T5 + doc2query - Indexing

Author: Monique Monteiro (moniquelouise@gmail.com)

In [None]:
# Mount Google Drive under /content/gdrive; the working folder used by later
# cells (main_dir) lives below /content/gdrive/MyDrive.
# force_remount=True asks for the mount to be redone even if one already exists.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


## Installing libraries

In [None]:
# Pyserini supplies the LuceneSearcher class and the `pyserini.search.lucene`
# command-line entry point used in later cells.
!pip install pyserini

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[0m

In [None]:
# Install PyTorch.
# NOTE(review): no visible cell imports torch directly — presumably it is needed
# indirectly (e.g. by pyserini); confirm it is still required.
!pip install torch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[0m

In [None]:
# condacolab is imported by the next cell; -q keeps pip's output short.
!pip install -q condacolab

[0m

In [None]:
# Set up conda in this runtime so that the `conda install` cell below can run.
# NOTE(review): condacolab reportedly restarts the kernel during install() —
# confirm, and re-run the earlier cells afterwards if that is the case.
import condacolab
condacolab.install()

✨🍰✨ Everything looks OK!


In [None]:
!conda install faiss-cpu -c pytorch

Collecting package metadata (current_repodata.json): - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / done
Solving environment: \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - done

# All requested packages already installed.



In [None]:
# `evaluate` provides load(), which the evaluation cell uses to obtain the
# "trec_eval" metric.
!pip install evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[0m

In [None]:
# Install trectools.
# NOTE(review): not imported anywhere in the visible notebook — presumably a
# runtime requirement of the "trec_eval" metric loaded later; confirm.
!pip install trectools

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[0m

## Index building

In [None]:
from pyserini.search.lucene import LuceneSearcher

In [None]:
# Fetch the prebuilt BEIR TREC-COVID "flat" Lucene index. The returned searcher
# is not kept; the call is made so the index ends up in pyserini's local cache
# (~/.cache/pyserini/indexes/, listed by the next cell).
LuceneSearcher.from_prebuilt_index('beir-v1.0.0-trec-covid-flat')

<pyserini.search.lucene._searcher.LuceneSearcher at 0x7fe2918d1a90>

In [None]:
# List pyserini's index cache to see the directory name of the downloaded index.
!ls ~/.cache/pyserini/indexes/

lucene-index.beir-v1.0.0-trec-covid-flat.20221116.505594.9ae06c30a7c352f18a5a8e75b88b9106


## Dataset download and preparation



In [None]:
# Download the gzipped TREC-COVID queries file (JSON Lines) from the
# BeIR/trec-covid dataset repository into the current working directory.
!wget https://huggingface.co/datasets/BeIR/trec-covid/resolve/main/queries.jsonl.gz

--2023-04-05 00:29:53--  https://huggingface.co/datasets/BeIR/trec-covid/resolve/main/queries.jsonl.gz
Resolving huggingface.co (huggingface.co)... 34.230.159.254, 34.206.0.154, 52.22.128.237, ...
Connecting to huggingface.co (huggingface.co)|34.230.159.254|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs.huggingface.co/repos/a8/10/a810e88b0e7b233be82b89c1fa6ec2d75efc6d55784c2ada9dcac8434a634f3a/9eadcc2cdf140addc9dae83648bb2c6611f5e4b66eaed7475fa5a0ca48eda371?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27queries.jsonl.gz%3B+filename%3D%22queries.jsonl.gz%22%3B&response-content-type=application%2Fgzip&Expires=1680913793&Policy=eyJTdGF0ZW1lbnQiOlt7IlJlc291cmNlIjoiaHR0cHM6Ly9jZG4tbGZzLmh1Z2dpbmdmYWNlLmNvL3JlcG9zL2E4LzEwL2E4MTBlODhiMGU3YjIzM2JlODJiODljMWZhNmVjMmQ3NWVmYzZkNTU3ODRjMmFkYTlkY2FjODQzNGE2MzRmM2EvOWVhZGNjMmNkZjE0MGFkZGM5ZGFlODM2NDhiYjJjNjYxMWY1ZTRiNjZlYWVkNzQ3NWZhNWEwY2E0OGVkYTM3MT9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0

In [None]:
# Working folder on the mounted Google Drive. Later cells keep the dataset files
# in {main_dir}/trec-covid and the retrieval runs in {main_dir}/runs.
main_dir = "/content/gdrive/MyDrive/Unicamp-aula-6-3"

In [None]:
!mv queries.jsonl.gz {main_dir}/trec-covid

In [None]:
!gunzip {main_dir}/trec-covid/queries.jsonl.gz

In [None]:
import json

# Convert the queries file (one JSON object per line, with "_id" and "text"
# keys) into the two-column "<query id>\t<query text>" TSV that is later handed
# to pyserini through --topics.
# Encodings are explicit so the result does not depend on the runtime's locale.
with open(f'{main_dir}/trec-covid/queries.tsv', 'w', encoding='utf-8') as f_out:
  with open(f'{main_dir}/trec-covid/queries.jsonl', 'r', encoding='utf-8') as f:
    for line in f:
      if not line.strip():
        continue  # tolerate blank lines instead of crashing in json.loads
      data = json.loads(line)
      query_id = data['_id']  # not named `id`, which would shadow the builtin
      # Collapse tabs/newlines inside the query so every query stays on one row.
      text = ' '.join(data['text'].split())
      f_out.write(f'{query_id}\t{text}\n')
        


In [None]:
# Dataset identifier; later cells interpolate it into the dataset-folder path
# and into the run-file name.
topics = "trec-covid"

In [None]:
# Download the test-split relevance judgments (qrels) TSV from the
# BeIR/trec-covid-qrels dataset repository into the current working directory.
!wget https://huggingface.co/datasets/BeIR/trec-covid-qrels/raw/main/test.tsv

In [None]:
!mv test.tsv {main_dir}/trec-covid

In [None]:
import pandas as pd

# Load the relevance judgments: the file's first row is skipped and fixed column
# names are applied instead, a constant "q0" column is appended, and the frame
# is turned into a dict mapping each column name to its list of values.
qrel = (
    pd.read_csv(
        f"{main_dir}/trec-covid/test.tsv",
        sep="\t",
        header=None,
        skiprows=1,
        names=["query", "docid", "rel"],
    )
    .assign(q0="q0")
    .to_dict(orient="list")
)

In [None]:
# Load the BM25 run file (whitespace-separated columns) as a dict of column lists.
# The file is the --output of the pyserini search cell in "Searching with BM25";
# on a fresh runtime that cell has to be executed before this one, otherwise the
# file does not exist yet.
# The separator is a raw string: "\s" inside a normal string literal is an
# invalid escape sequence, which newer Python versions warn about.
run = pd.read_csv(f"{main_dir}/runs/run.{topics}.bm25tuned.txt", sep=r"\s+",
                  names=["query", "q0", "docid", "rank", "score", "system"])
run = run.to_dict(orient="list")

## Searching with BM25

In [None]:
!python -m pyserini.search.lucene \
  --index ~/.cache/pyserini/indexes/lucene-index.beir-v1.0.0-trec-covid-flat.20221116.505594.9ae06c30a7c352f18a5a8e75b88b9106 \
  --topics {main_dir}/{topics}/queries.tsv \
  --output {main_dir}/runs/run.{topics}.bm25tuned.txt \
  --hits 1000 \
  --bm25 --k1 0.82 --b 0.68

Setting BM25 parameters: k1=0.82, b=0.68
Running /content/gdrive/MyDrive/Unicamp-aula-6-3/trec-covid/queries.tsv topics, saving to /content/gdrive/MyDrive/Unicamp-aula-6-3/runs/run.trec-covid.bm25tuned.txt...
100% 50/50 [00:05<00:00,  8.54it/s]


In [None]:
# Score the BM25 run against the relevance judgments with the "trec_eval"
# metric obtained through evaluate.load().
# `run` and `qrel` are the column-list dicts built in earlier cells; each is
# passed wrapped in a one-element list.
from evaluate import load
trec_eval = load("trec_eval")
results = trec_eval.compute(predictions=[run], references=[qrel])

In [None]:
# Display the NDCG@10 entry of the metrics computed in the previous cell.
results['NDCG@10']

0.5963435398557583