# SPLADE for Portuguese

Authors: Leonardo Ávila, Monique Monteiro

Inspired by https://github.com/naver/splade

This notebook contains evaluation code for a SPLADEv2 inverted index generated by a trained checkpoint.  Dataset: mRobust.

In [None]:
# Show the GPU attached to this runtime (driver/CUDA version, memory usage).
!nvidia-smi

Thu Jun 29 02:30:32 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P0    46W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
from google.colab import drive
# Mount Google Drive at /content/gdrive; the Drive paths used later are built on this mount point.
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [None]:
# Select whose Google Drive layout to use; Drive paths in later cells are built from main_dir.
user = "monique"
main_dir = (
    '/content/gdrive/MyDrive/Unicamp-projeto-final/'
    if user == "monique"
    else '/content/gdrive/MyDrive/Unicamp/IA368-DD/'
)

## Installing libraries

In [None]:
%%shell
# NOTE(review): none of these installs is version-pinned, so a re-run may pull different releases.
# pytrec_eval: IR evaluation measures (imported in the next cell).
pip install pytrec_eval
# SPLADE installed directly from the upstream repository; -q reduces pip output.
pip install git+https://github.com/naver/splade.git -q
# Upgrade hydra-core to the latest release (presumably required by splade's Hydra entry points -- confirm).
pip install hydra-core --upgrade

Collecting pytrec_eval
  Downloading pytrec_eval-0.5.tar.gz (15 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pytrec_eval
  Building wheel for pytrec_eval (setup.py) ... [?25l[?25hdone
  Created wheel for pytrec_eval: filename=pytrec_eval-0.5-cp310-cp310-linux_x86_64.whl size=293453 sha256=6711e17e522ae625c48c5cf440c44dd061ce7f6d486f36fab05271261d143d7e
  Stored in directory: /root/.cache/pip/wheels/51/3a/cd/dcc1ddfc763987d5cb237165d8ac249aa98a23ab90f67317a8
Successfully built pytrec_eval
Installing collected packages: pytrec_eval
Successfully installed pytrec_eval-0.5
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.0/4.0 MB[0m [31m61.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m74.7/74.7 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m112.4/112.4 kB[



In [None]:
import pytrec_eval
# Display the measure names pytrec_eval supports; also confirms the package imports correctly.
pytrec_eval.supported_measures

{'11pt_avg',
 'G',
 'P',
 'Rndcg',
 'Rprec',
 'Rprec_mult',
 'binG',
 'bpref',
 'gm_bpref',
 'gm_map',
 'infAP',
 'iprec_at_recall',
 'map',
 'map_cut',
 'ndcg',
 'ndcg_cut',
 'ndcg_rel',
 'num_nonrel_judged_ret',
 'num_q',
 'num_rel',
 'num_rel_ret',
 'num_ret',
 'recall',
 'recip_rank',
 'relative_P',
 'relstring',
 'runid',
 'set_F',
 'set_P',
 'set_map',
 'set_recall',
 'set_relative_P',
 'success',
 'utility'}

## Copying data files to the expected data structure

In [None]:
# Clone the SPLADE repository into ./splade; later cells copy configs and data into this tree.
!git clone https://github.com/naver/splade.git

Cloning into 'splade'...
remote: Enumerating objects: 703, done.[K
remote: Counting objects:   0% (1/236)[Kremote: Counting objects:   1% (3/236)[Kremote: Counting objects:   2% (5/236)[Kremote: Counting objects:   3% (8/236)[Kremote: Counting objects:   4% (10/236)[Kremote: Counting objects:   5% (12/236)[Kremote: Counting objects:   6% (15/236)[Kremote: Counting objects:   7% (17/236)[Kremote: Counting objects:   8% (19/236)[Kremote: Counting objects:   9% (22/236)[Kremote: Counting objects:  10% (24/236)[Kremote: Counting objects:  11% (26/236)[Kremote: Counting objects:  12% (29/236)[Kremote: Counting objects:  13% (31/236)[Kremote: Counting objects:  14% (34/236)[Kremote: Counting objects:  15% (36/236)[Kremote: Counting objects:  16% (38/236)[Kremote: Counting objects:  17% (41/236)[Kremote: Counting objects:  18% (43/236)[Kremote: Counting objects:  19% (45/236)[Kremote: Counting objects:  20% (48/236)[Kremote: Counting objects:  21% (5

In [None]:
import shutil
import os

# Copy the dataset archive from Drive to the local runtime disk, then extract it.
# The archive unpacks under content/splade_data/ relative to the working directory.
shutil.copyfile(f"{main_dir}Projeto Final/data.zip", "/content/data.zip")
!unzip data.zip

Archive:  data.zip
   creating: content/splade_data/
   creating: content/splade_data/.ipynb_checkpoints/
   creating: content/splade_data/m_robust/
   creating: content/splade_data/m_robust/.ipynb_checkpoints/
  inflating: content/splade_data/m_robust/queries.tsv  
  inflating: content/splade_data/m_robust/qrels.robust04.txt  
  inflating: content/splade_data/m_robust/corpus.tsv  
   creating: content/splade_data/m_marco/
  inflating: content/splade_data/m_marco/queries_train.tsv  
  inflating: content/splade_data/m_marco/queries_dev.tsv  
  inflating: content/splade_data/m_marco/corpus.tsv  
  inflating: content/splade_data/m_marco/queries_dev.full.tsv  


In [None]:
# Copy the project's top-level SPLADE config from Drive into the cloned repo's conf/ directory.
!cp "{main_dir}Projeto Final/Experimentos/splade/conf/config_splade_pt.yaml" splade/conf/

In [None]:
# Copy the training-data config (pt.yaml) into splade/conf/train/data.
!cp "{main_dir}Projeto Final/Experimentos/splade/conf/train/data/pt.yaml" splade/conf/train/data

### Copying full collection

For indexing, we'll validate the model on the mROBUST dataset, so its corpus is used as the full collection.

In [None]:
# Create the directory that will hold the full collection (raw.tsv is copied here in a later cell).
!mkdir -p splade/data/pt/full_collection

Preprocess the mROBUST dataset to replace its alphanumeric document IDs with integer ones.

In [None]:
import pandas as pd

# Map each original (alphanumeric) mROBUST document ID to its text.
# Every corpus line has the form "<doc_id>\t<text>\n".
map_id_to_text = dict()

# Explicit UTF-8: the corpus is Portuguese text, so decoding must not depend on the
# runtime's default locale.
with open("content/splade_data/m_robust/corpus.tsv", 'r', encoding='utf-8') as file:
  for line in file:
    # Split on the first tab only, so a tab inside the document text cannot truncate it.
    fields = line.split('\t', 1)
    doc_id = fields[0]
    # The text keeps its trailing newline; the cell that writes new_corpus.tsv relies on it.
    text = fields[1]
    map_id_to_text[doc_id] = text

In [None]:
# Keys view over the original document IDs, in the order they were inserted into map_id_to_text.
doc_ids = map_id_to_text.keys()

In [None]:
# Assign consecutive integers (0, 1, 2, ...) to the original document IDs, in iteration order.
map_id_to_int = dict(zip(doc_ids, range(len(doc_ids))))

In [None]:
# Spot check: look up the integer ID assigned to one known original document ID.
map_id_to_int['FT931-12718']

654

In [None]:
# Number of distinct document IDs read from the corpus.
len(map_id_to_text)

528032

In [None]:
# Re-key the corpus by integer ID: integer document ID -> document text.
new_map_id_to_text = dict(
    (map_id_to_int[original_id], doc_text)
    for original_id, doc_text in map_id_to_text.items()
)

In [None]:
# Sanity check: the re-keyed corpus should have as many entries as the original mapping.
len(new_map_id_to_text)

528032

In [None]:
# Write the re-keyed corpus as "<int_id>\t<text>" lines.
# Explicit UTF-8 keeps the Portuguese text independent of the runtime's default locale.
with open('new_corpus.tsv', 'w', encoding='utf-8') as file:
  for int_id, text in new_map_id_to_text.items():
    # The text normally still carries the newline it was read with. Add one only when it
    # is missing (e.g. a final corpus line without a terminator) so records never run together.
    line_end = '' if text.endswith('\n') else '\n'
    file.write(f"{int_id}\t{text}{line_end}")

In [None]:
# Peek at the first lines of the original corpus (alphanumeric IDs).
!head content/splade_data/m_robust/corpus.tsv

FT931-16655	930105 FT 05 JAN 93 / Redland concorda acordos europeus de 59 milhões de libras REDLAND, o grupo de materiais de construção que pagou 624 milhões de libras em ações pela Steetley no ano passado, arrecadou 100 milhões de libras por meio de alienações. Está reinvestindo 58,6 milhões de libras em suas atividades de azulejos e tijolos na Europa continental. A Redland está formando uma joint venture com a Koramic, uma empresa privada belga, que engloba as fábricas de tijolos das duas empresas na Holanda, Alemanha e Bélgica. A nova empresa será a maior fabricante de tijolos de fachada da Europa continental. Os lucros das atividades combinadas em 1992 teriam sido de 15,9 milhões de libras, dos quais a parte de Redland teria sido de 5,8 milhões de libras. A joint venture 50:50 deterá 70 por cento das atividades combinadas, dando a Redland uma participação econômica de 35 por cento, mas controle conjunto. Redland também receberá 17,3 milhões de libras em dinheiro. A Redland está aum

In [None]:
# Peek at the first lines of the re-keyed corpus (integer IDs).
!head new_corpus.tsv

0	930105 FT 05 JAN 93 / Redland concorda acordos europeus de 59 milhões de libras REDLAND, o grupo de materiais de construção que pagou 624 milhões de libras em ações pela Steetley no ano passado, arrecadou 100 milhões de libras por meio de alienações. Está reinvestindo 58,6 milhões de libras em suas atividades de azulejos e tijolos na Europa continental. A Redland está formando uma joint venture com a Koramic, uma empresa privada belga, que engloba as fábricas de tijolos das duas empresas na Holanda, Alemanha e Bélgica. A nova empresa será a maior fabricante de tijolos de fachada da Europa continental. Os lucros das atividades combinadas em 1992 teriam sido de 15,9 milhões de libras, dos quais a parte de Redland teria sido de 5,8 milhões de libras. A joint venture 50:50 deterá 70 por cento das atividades combinadas, dando a Redland uma participação econômica de 35 por cento, mas controle conjunto. Redland também receberá 17,3 milhões de libras em dinheiro. A Redland está aumentando su

In [None]:
# Line count of the original corpus, for comparison with the re-keyed file.
!wc -l content/splade_data/m_robust/corpus.tsv

528032 content/splade_data/m_robust/corpus.tsv


In [None]:
# Line count of the re-keyed corpus; it should match the original corpus.
!wc -l new_corpus.tsv

528032 new_corpus.tsv


In [None]:
# Install the re-keyed corpus as the full collection's raw.tsv inside the cloned repo.
!cp new_corpus.tsv splade/data/pt/full_collection/raw.tsv

In [None]:
# Confirm the copied collection file starts with integer-keyed records.
!head splade/data/pt/full_collection/raw.tsv

0	930105 FT 05 JAN 93 / Redland concorda acordos europeus de 59 milhões de libras REDLAND, o grupo de materiais de construção que pagou 624 milhões de libras em ações pela Steetley no ano passado, arrecadou 100 milhões de libras por meio de alienações. Está reinvestindo 58,6 milhões de libras em suas atividades de azulejos e tijolos na Europa continental. A Redland está formando uma joint venture com a Koramic, uma empresa privada belga, que engloba as fábricas de tijolos das duas empresas na Holanda, Alemanha e Bélgica. A nova empresa será a maior fabricante de tijolos de fachada da Europa continental. Os lucros das atividades combinadas em 1992 teriam sido de 15,9 milhões de libras, dos quais a parte de Redland teria sido de 5,8 milhões de libras. A joint venture 50:50 deterá 70 por cento das atividades combinadas, dando a Redland uma participação econômica de 35 por cento, mas controle conjunto. Redland também receberá 17,3 milhões de libras em dinheiro. A Redland está aumentando su

In [None]:
# Copy the index config (pt.yaml) into splade/conf/index.
!cp "{main_dir}Projeto Final/Experimentos/splade/conf/index/pt.yaml" splade/conf/index

### Retrieval and evaluation configuration

Here, we'll configure retrieval and evaluation for the mROBUST dataset.

In [None]:
# Create the directory that will hold the evaluation queries.
!mkdir -p splade/data/pt/dev_queries

In [None]:
# Use the mROBUST queries as the dev queries (raw.tsv) for retrieval.
!cp content/splade_data/m_robust/queries.tsv splade/data/pt/dev_queries/raw.tsv

Convert the QREL information to JSON format.

In [None]:
# TODO: should only the judgments with relevance 1 be dumped?
from collections import defaultdict
import json

# qrel[query_id][integer_doc_id] = relevance grade
qrel = defaultdict(dict)

with open("content/splade_data/m_robust/qrels.robust04.txt", 'r') as qrels_in:
  for judgment in qrels_in:
    # Whitespace-separated columns; index 0 = query ID, 2 = original document ID, 3 = relevance.
    columns = judgment.split()
    q_id = columns[0]
    # Judgments whose document is absent from the corpus mapping are skipped.
    if columns[2] not in map_id_to_int:
      continue
    doc_id = map_id_to_int[columns[2]]
    qrel[q_id][doc_id] = int(columns[3])

# Persist the judgments, keyed by the integer document IDs, as JSON.
with open('splade/data/pt/dev_qrel.json', 'w') as qrels_out:
    json.dump(qrel, qrels_out)

In [None]:
# Copy the retrieve/evaluate config (pt.yaml) into splade/conf/retrieve_evaluate.
!cp "{main_dir}Projeto Final/Experimentos/splade/conf/retrieve_evaluate/pt.yaml" splade/conf/retrieve_evaluate

### Flops estimation

We'll use the same test set (mROBUST) to estimate FLOPS.

In [None]:
# Copy the FLOPS config (pt.yaml) into splade/conf/flops.
!cp "{main_dir}Projeto Final/Experimentos/splade/conf/flops/pt.yaml" splade/conf/flops

## Indexing and retrieving

In [None]:
# Copy the model config (splade_bertimbau_base.yaml) into splade/conf/train/model/.
!cp "{main_dir}Projeto Final/Experimentos/splade/conf/train/model/splade_bertimbau_base.yaml" splade/conf/train/model/

In [None]:
# Re-copy the top-level config (the same file an earlier cell already copied) and add the training config.
!cp "{main_dir}Projeto Final/Experimentos/splade/conf/config_splade_pt.yaml" splade/conf/
!cp "{main_dir}Projeto Final/Experimentos/splade/conf/train/config/splade_pt.yaml" splade/conf/train/config

In [None]:
# Copy the archived experiment directory (trained checkpoint) from Drive to the local runtime disk.
shutil.copyfile(f"{main_dir}Projeto Final/experiments_ids.zip", "/content/experiments.zip")

'/content/experiments.zip'

In [None]:
# Extract the experiment archive (it unpacks under content/splade/experiments/ relative to the
# working directory), then move the experiments tree to /content/experiments.
!unzip /content/experiments.zip
!mv /content/content/splade/experiments /content/experiments

Archive:  /content/experiments.zip
   creating: content/splade/experiments/
   creating: content/splade/experiments/pt/
   creating: content/splade/experiments/pt/checkpoint/
  inflating: content/splade/experiments/pt/checkpoint/training_perf.txt  
   creating: content/splade/experiments/pt/checkpoint/tensorboard/
  inflating: content/splade/experiments/pt/checkpoint/tensorboard/events.out.tfevents.1687731900.0f580f3bff85.9183.0  
  inflating: content/splade/experiments/pt/checkpoint/tensorboard/events.out.tfevents.1687650276.575675fd9b16.9093.0  
  inflating: content/splade/experiments/pt/checkpoint/tensorboard/events.out.tfevents.1687705289.575675fd9b16.235074.0  
  inflating: content/splade/experiments/pt/checkpoint/config.yaml  
   creating: content/splade/experiments/pt/checkpoint/model/
  inflating: content/splade/experiments/pt/checkpoint/model/pytorch_model.bin  
  inflating: content/splade/experiments/pt/checkpoint/model/special_tokens_map.json  
  inflating: content/splade/ex

In [None]:
# Copy the extracted experiment directory into the cloned repo.
# NOTE(review): if splade/experiments does not exist yet, cp creates it as a copy of pt/, so
# checkpoint/ lands directly under splade/experiments. The next cell appears to compensate -- confirm.
!cp -r experiments/pt splade/experiments

In [None]:
# Ensure splade/experiments/pt exists and move checkpoint/ under it, giving
# splade/experiments/pt/checkpoint (the path passed as config.checkpoint_dir to the retrieval step).
!mkdir -p splade/experiments/pt
!mv splade/experiments/checkpoint splade/experiments/pt

In [None]:
# Copy the archived prebuilt index from Drive to the local runtime disk.
shutil.copyfile(f"{main_dir}Projeto Final/index_ids.zip", "/content/index.zip")

'/content/index.zip'

In [None]:
# Extract the prebuilt index; the archive unpacks directly to splade/experiments/pt/index/.
!unzip /content/index.zip
#!mv /content/content/splade/experiments /content/experiments

Archive:  /content/index.zip
   creating: splade/experiments/pt/index/
  inflating: splade/experiments/pt/index/doc_ids.pkl  
  inflating: splade/experiments/pt/index/index_dist.json  
  inflating: splade/experiments/pt/index/array_index.h5py  
 extracting: splade/experiments/pt/index/index_stats.json  


In [None]:
!mkdir -p experiments/pt/out/other_dataset

In [None]:
%%shell
# Run from inside the cloned repo and add it to PYTHONPATH (presumably so the local splade
# package and its conf/ directory are the ones used -- confirm).
cd splade/
export PYTHONPATH=$PYTHONPATH:$(pwd)
# Config file name for the run (presumably read by splade to select the Hydra config -- confirm).
export SPLADE_CONFIG_NAME="config_splade_pt.yaml"
# Retrieve with the trained checkpoint and the prebuilt index; all paths are relative to splade/.
python3 -m splade.retrieve \
  config.checkpoint_dir=experiments/pt/checkpoint \
  config.index_dir=experiments/pt/index \
  config.out_dir=experiments/pt/out

The version_base parameter is not specified.
Please specify a compatability version level, or None.
Will assume defaults for version 1.1
  @hydra.main(config_path=CONFIG_PATH, config_name=CONFIG_NAME)
The version_base parameter is not specified.
Please specify a compatability version level, or None.
Will assume defaults for version 1.1
  @hydra.main(config_path=CONFIG_PATH, config_name=CONFIG_NAME)
See https://hydra.cc/docs/1.2/upgrades/1.1_to_1.2/changes_to_job_working_dir/ for more information.
  ret = run_job(
config:
  lr: 2.0e-05
  seed: 123
  gradient_accumulation_steps: 1
  weight_decay: 0.01
  validation_metrics:
  - MRR@10
  - recall@100
  - recall@200
  - recall@500
  pretrained_no_yamlconfig: false
  nb_iterations: 150000
  train_batch_size: 16
  eval_batch_size: 16
  index_retrieve_batch_size: 16
  record_frequency: 10000
  train_monitoring_freq: 500
  warmup_steps: 6000
  max_length: 256
  fp16: false
  matching_type: splade
  monitoring_ckpt: MRR@10
  loss: InBatchPairwis



In [None]:
# Bundle the retrieval outputs (run and performance files) into a single archive.
!zip -r out_ids.zip splade/experiments/pt/out

  adding: splade/experiments/pt/out/ (stored 0%)
  adding: splade/experiments/pt/out/other_dataset/ (stored 0%)
  adding: splade/experiments/pt/out/other_dataset/run.json (deflated 61%)
  adding: splade/experiments/pt/out/other_dataset/stats/ (stored 0%)
  adding: splade/experiments/pt/out/other_dataset/stats/q_stats.json (stored 0%)
  adding: splade/experiments/pt/out/other_dataset/perf.json (deflated 59%)
  adding: splade/experiments/pt/out/perf_all_datasets.json (deflated 57%)


In [None]:
# Save the archived retrieval outputs back to Drive so they outlive the Colab runtime.
shutil.copyfile("out_ids.zip", f"{main_dir}Projeto Final/out_ids.zip")

'/content/gdrive/MyDrive/Unicamp-projeto-final/Projeto Final/out_ids.zip'