# SPLADE for Portuguese

Authors: Leonardo Ávila, Monique Monteiro

Inspired by https://github.com/naver/splade

This notebook contains the training code for a SPLADEv2 model for the Portuguese language. Dataset: mMARCO (MS MARCO automatically translated to Portuguese).

In [None]:
# Report the GPUs visible to this runtime (model, memory usage, utilization).
!nvidia-smi

Mon Jun 26 22:43:41 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   31C    P0    44W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## Mount Google Drive

In [None]:
# Mount Google Drive under /content/gdrive; force_remount=True requests a fresh
# mount even if the drive is already mounted.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


## Set main_dir

In [None]:
# Whose Google Drive layout to use: "monique" selects Monique's project folder,
# any other value (e.g. "leo") selects the IA368-DD folder.
user = "leo"

# Root Drive folder (with trailing slash) from which the Drive paths in the
# following cells are built.
main_dir = (
    '/content/gdrive/MyDrive/Unicamp-projeto-final/'
    if user == "monique"
    else '/content/gdrive/MyDrive/Unicamp/IA368-DD/'
)

## Libraries installation

In [None]:
%%shell
# NOTE(review): none of these installs is version-pinned, so a fresh runtime may
# resolve different versions than the ones this notebook was run with.
# pytrec_eval is imported in the "Libraries Import" cell.
pip install pytrec_eval
# SPLADE fork, installed straight from GitHub (-q reduces pip output).
pip install git+https://github.com/leobavila/splade.git -q
# Upgrade hydra-core to the newest available release.
pip install hydra-core --upgrade

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pytrec_eval
  Downloading pytrec_eval-0.5.tar.gz (15 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pytrec_eval
  Building wheel for pytrec_eval (setup.py) ... [?25l[?25hdone
  Created wheel for pytrec_eval: filename=pytrec_eval-0.5-cp310-cp310-linux_x86_64.whl size=293462 sha256=55a9031abd33b7bf82cb21d086834cbcfc254c1ac80fd0792fc1e5db2c15bc2a
  Stored in directory: /root/.cache/pip/wheels/51/3a/cd/dcc1ddfc763987d5cb237165d8ac249aa98a23ab90f67317a8
Successfully built pytrec_eval
Installing collected packages: pytrec_eval
Successfully installed pytrec_eval-0.5
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.0/4.0 MB[0m [31m58.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m74.7/74.7 kB[0m [31m10.5 MB/s[0m 



## Libraries Import

In [None]:
import os
import csv
import tqdm
import json
import shutil
import pytrec_eval
import pandas as pd
from collections import defaultdict

## Copying data files to the expected data structure

### Clone repository

In [None]:
# Clone the SPLADE fork into ./splade; later cells copy data and configs into
# this working tree and run training from it.
!git clone https://github.com/leobavila/splade.git

Cloning into 'splade'...
remote: Enumerating objects: 449, done.[K
remote: Counting objects: 100% (204/204), done.[K
remote: Compressing objects: 100% (108/108), done.[K
remote: Total 449 (delta 144), reused 97 (delta 96), pack-reused 245[K
Receiving objects: 100% (449/449), 3.08 MiB | 38.89 MiB/s, done.
Resolving deltas: 100% (211/211), done.


In [None]:
%%shell
# Bring the cloned working tree up to date with its remote.
cd splade/
git pull

Already up to date.




### Settings

In [None]:
# Create the directory layout used by the following cells. All paths are
# relative to the current working directory; `mkdir -p` creates missing parents
# and does not fail when a directory already exists.
!mkdir -p splade/data/pt/triplets
!mkdir -p splade/data/pt/val_retrieval/collection
!mkdir -p splade/data/pt/val_retrieval/queries
!mkdir -p splade/data/pt/full_collection
!mkdir -p splade/data/pt/dev_queries
# Folders that gen_val_collection() / gen_val_queries() write their raw.tsv into.
!mkdir -p "content/data/m_marco/val_retrieval/collection"
!mkdir -p "content/data/m_marco/val_retrieval/queries"

In [None]:
# Get datasets: copy the archive from Drive, extract it (it unpacks into a
# content/data/... tree), then move that tree to /content/data and remove the
# leftover /content/content folder.
# NOTE(review): `unzip data.zip` uses a relative path, so this assumes the
# working directory is /content - TODO confirm.
shutil.copyfile(f"{main_dir}Projeto Final/data.zip", "/content/data.zip")
!unzip data.zip
!mv /content/content/data /content/data
!rm -rf /content/content/

# Download the training triplet ids file from the mMARCO dataset repository.
!wget https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/triples.train.ids.small.tsv

# Get data_m_marco_val_retrieval: copy the archive from Drive and extract it.
shutil.copyfile(f"{main_dir}Projeto Final/data_m_marco_val_retrieval.zip", "/content/data_m_marco_val_retrieval.zip")
!unzip data_m_marco_val_retrieval.zip

Archive:  data.zip
  inflating: content/data/m_marco/queries_train.tsv  
   creating: content/data/m_marco/.ipynb_checkpoints/
  inflating: content/data/m_marco/triplets.tsv  
  inflating: content/data/m_marco/queries_dev.full.tsv  
  inflating: content/data/m_marco/queries_dev.tsv  
  inflating: content/data/m_marco/corpus.tsv  
   creating: content/data/m_robust/
  inflating: content/data/m_robust/qrels.robust04.txt  
   creating: content/data/m_robust/.ipynb_checkpoints/
  inflating: content/data/m_robust/queries.tsv  
  inflating: content/data/m_robust/corpus.tsv  
--2023-06-26 22:51:21--  https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/triples.train.ids.small.tsv
Resolving huggingface.co (huggingface.co)... 13.33.33.102, 13.33.33.20, 13.33.33.110, ...
Connecting to huggingface.co (huggingface.co)|13.33.33.102|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs.huggingface.co/datasets/unicamp-dl/mmarco/2263e5c34e97b0faf

In [None]:
# Copy (not move) the data files into the splade/data/pt layout created above.
# Training data: mMARCO corpus, training queries and the triplet ids file.
!cp /content/data/m_marco/corpus.tsv /content/splade/data/pt/triplets/corpus.tsv
!cp /content/data/m_marco/queries_train.tsv /content/splade/data/pt/triplets/queries_train.tsv
!cp /content/triples.train.ids.small.tsv /content/splade/data/pt/triplets/raw.tsv
# Full collection and dev queries come from the m_robust folder.
!cp /content/data/m_robust/corpus.tsv /content/splade/data/pt/full_collection/raw.tsv
!cp /content/data/m_robust/queries.tsv /content/splade/data/pt/dev_queries/raw.tsv

In [None]:
# Get configs: copy the Portuguese-specific YAML configuration files from Drive
# into the matching sub-folders of splade/conf. `{main_dir}` is interpolated by
# IPython; the quotes are needed because the Drive path contains a space.
!cp "{main_dir}Projeto Final/Experimentos/splade/conf/config_splade_pt.yaml" splade/conf/
!cp "{main_dir}Projeto Final/Experimentos/splade/conf/train/config/splade_pt.yaml" splade/conf/train/config
!cp "{main_dir}Projeto Final/Experimentos/splade/conf/train/data/pt.yaml" splade/conf/train/data
!cp "{main_dir}Projeto Final/Experimentos/splade/conf/train/model/splade_bertimbau_base.yaml" splade/conf/train/model/
!cp "{main_dir}Projeto Final/Experimentos/splade/conf/index/pt.yaml" splade/conf/index
!cp "{main_dir}Projeto Final/Experimentos/splade/conf/retrieve_evaluate/pt.yaml" splade/conf/retrieve_evaluate
!cp "{main_dir}Projeto Final/Experimentos/splade/conf/flops/pt.yaml" splade/conf/flops

### Generating/copying validation collection

Now we need to generate a validation collection: it will be based on the same validation data used in https://github.com/naver/splade.

In [None]:
# Looks up the translated passage for every id of the reference validation collection.
def gen_val_collection():
  """Build the Portuguese validation collection from the translated corpus.

  Reads the passage ids (first column) of the reference validation collection
  stored under ``main_dir``, looks each id up in
  ``content/data/m_marco/corpus.tsv`` and writes tab-separated ``id, text`` rows
  to ``content/data/m_marco/val_retrieval/collection/raw.tsv``, in the order of
  the reference collection.

  Ids that are missing from the corpus are skipped. When an id occurs more than
  once in the corpus, its first occurrence is used.

  NOTE(review): the ``content/...`` paths are relative to the working directory,
  and an earlier cell moves the extracted data to ``/content/data`` - confirm
  the corpus path still exists at the point where this function is called.
  """
  df_val_collection = pd.read_csv(f"{main_dir}Projeto Final/msmarco/val_retrieval/collection/raw.tsv", sep='\t', header=None)
  df_corpus = pd.read_csv("content/data/m_marco/corpus.tsv", sep='\t', header=None)

  # Index the corpus once (id -> text of its first occurrence) so every lookup
  # is a dictionary access instead of a scan over the whole corpus DataFrame.
  text_by_id = df_corpus.drop_duplicates(subset=0, keep='first').set_index(0)[1].to_dict()

  # newline='' is what the csv module asks for on files handed to csv.writer;
  # the explicit encoding keeps accented characters independent of the locale.
  with open("content/data/m_marco/val_retrieval/collection/raw.tsv", 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f, delimiter='\t')  # Set the delimiter as a tab

    for doc_id in tqdm.tqdm(df_val_collection[0], total=df_val_collection.shape[0]):
      if doc_id in text_by_id:
        writer.writerow([doc_id, text_by_id[doc_id]])

In [None]:
# Copy the validation collection into the SPLADE data tree.
# NOTE(review): gen_val_collection() is not called in the visible cells, so this
# presumably relies on a raw.tsv extracted from data_m_marco_val_retrieval.zip - confirm.
!cp content/data/m_marco/val_retrieval/collection/raw.tsv splade/data/pt/val_retrieval/collection

### Generating/copying validation queries

In [None]:
def gen_val_queries():
  """Build the Portuguese validation queries from the translated dev queries.

  Reads the query ids (first column) of the reference validation queries stored
  under ``main_dir``, looks each id up in
  ``content/data/m_marco/queries_dev.full.tsv`` and writes tab-separated
  ``id, text`` rows to ``content/data/m_marco/val_retrieval/queries/raw.tsv``,
  in the order of the reference file.

  Ids that are missing from the translated queries are skipped. When an id
  occurs more than once there, its first occurrence is used.

  NOTE(review): the ``content/...`` paths are relative to the working directory,
  and an earlier cell moves the extracted data to ``/content/data`` - confirm
  the queries path still exists at the point where this function is called.
  """
  df_val_queries = pd.read_csv(f"{main_dir}Projeto Final/msmarco/val_retrieval/queries/raw.tsv", sep='\t', header=None)
  df_queries_dev_full = pd.read_csv("content/data/m_marco/queries_dev.full.tsv", sep='\t', header=None)

  # Index the translated queries once (id -> text of its first occurrence) so
  # every lookup is a dictionary access instead of a scan over the DataFrame.
  text_by_id = df_queries_dev_full.drop_duplicates(subset=0, keep='first').set_index(0)[1].to_dict()

  # newline='' is what the csv module asks for on files handed to csv.writer;
  # the explicit encoding keeps accented characters independent of the locale.
  with open("content/data/m_marco/val_retrieval/queries/raw.tsv", 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f, delimiter='\t')  # Set the delimiter as a tab

    for q_id in tqdm.tqdm(df_val_queries[0], total=df_val_queries.shape[0]):
      if q_id in text_by_id:
        writer.writerow([q_id, text_by_id[q_id]])

In [None]:
# Copy the validation queries into the SPLADE data tree.
# NOTE(review): gen_val_queries() is not called in the visible cells, so this
# presumably relies on a raw.tsv extracted from data_m_marco_val_retrieval.zip - confirm.
!cp content/data/m_marco/val_retrieval/queries/raw.tsv splade/data/pt/val_retrieval/queries

### Copying QREL validation data

In [None]:
# Copy the validation qrel file from Drive, unchanged, into the SPLADE data tree.
!cp "{main_dir}Projeto Final/msmarco/val_retrieval/qrel.json" splade/data/pt/val_retrieval

### Converting QREL information to JSON format

In [None]:
# Convert the Robust04 qrels text file into the nested JSON layout
# {query_id: {doc_id: relevance}} and store it as dev_qrel.json.
# Each input line is whitespace-separated; field 0 is the query id, field 2 the
# document id and field 3 the relevance label (field 1 is not used).
qrel = defaultdict(dict)

with open("/content/data/m_robust/qrels.robust04.txt", 'r') as file:
  for line in file:
    fields = line.split()
    if len(fields) < 4:
      # Skip blank or truncated lines instead of failing with an IndexError.
      continue
    q_id = fields[0]
    doc_id = fields[2]
    # Store the relevance as an integer rather than the raw string:
    # pytrec_eval expects integer relevance judgments.
    rel = int(fields[3])
    qrel[q_id][doc_id] = rel

with open('/content/splade/data/pt/dev_qrel.json', 'w') as file:
    json.dump(qrel, file)

## Training

In [None]:
# Set to True to continue from a previous run: the zipped experiments folder is
# copied from Drive, extracted, and moved to /content/splade/experiments.
resume_training = False
if resume_training:
    shutil.copyfile(f"{main_dir}Projeto Final/experiments.zip", "/content/experiments.zip")
    !unzip /content/experiments.zip
    # The archive unpacks into content/splade/experiments relative to the
    # working directory, hence the /content/content prefix.
    !mv /content/content/splade/experiments /content/splade

In [None]:
%%shell
cd splade/
# Put the repository root on PYTHONPATH so `python3 -m splade....` resolves the
# package from this working tree.
export PYTHONPATH=$PYTHONPATH:$(pwd)
# Name of the top-level config to use - presumably resolved under splade/conf,
# where config_splade_pt.yaml was copied earlier; confirm.
export SPLADE_CONFIG_NAME="config_splade_pt.yaml"
# Run training from triplet ids; the overrides place checkpoints, index and
# outputs under experiments/pt.
python3 -m splade.train_from_triplets_ids \
  config.checkpoint_dir=experiments/pt/checkpoint \
  config.index_dir=experiments/pt/index \
  config.out_dir=experiments/pt/out

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
 38% 6483/17269 [05:51<09:15, 19.41it/s][A
 38% 6485/17269 [05:51<09:30, 18.91it/s][A
 38% 6487/17269 [05:51<09:44, 18.45it/s][A
 38% 6489/17269 [05:51<09:52, 18.18it/s][A
 38% 6491/17269 [05:52<10:34, 16.98it/s][A
 38% 6494/17269 [05:52<09:59, 17.96it/s][A
 38% 6496/17269 [05:52<10:01, 17.92it/s][A
 38% 6499/17269 [05:52<09:28, 18.96it/s][A
 38% 6501/17269 [05:52<09:24, 19.08it/s][A
 38% 6503/17269 [05:52<09:37, 18.65it/s][A
 38% 6505/17269 [05:52<09:51, 18.19it/s][A
 38% 6507/17269 [05:52<10:14, 17.53it/s][A
 38% 6509/17269 [05:53<10:01, 17.89it/s][A
 38% 6511/17269 [05:53<09:57, 18.02it/s][A
 38% 6514/17269 [05:53<09:29, 18.89it/s][A
 38% 6516/17269 [05:53<09:38, 18.60it/s][A
 38% 6519/17269 [05:53<08:42, 20.59it/s][A
 38% 6522/17269 [05:53<09:07, 19.62it/s][A
 38% 6524/17269 [05:53<09:13, 19.42it/s][A
 38% 6526/17269 [05:53<09:25, 19.00it/s][A
 38% 6528/17269 [05:54<10:04, 17.76it/s][A
 38% 6530/1



In [None]:
# Keep a copy of the training config next to the results, then zip the whole
# experiments folder (paths inside the archive keep the content/splade/ prefix).
!cp /content/splade/conf/train/config/splade_pt.yaml /content/splade/experiments
!zip -r experiments_ids_384_max.zip /content/splade/experiments

  adding: content/splade/experiments/ (stored 0%)
  adding: content/splade/experiments/pt/ (stored 0%)
  adding: content/splade/experiments/pt/checkpoint/ (stored 0%)
  adding: content/splade/experiments/pt/checkpoint/tensorboard/ (stored 0%)
  adding: content/splade/experiments/pt/checkpoint/tensorboard/events.out.tfevents.1687820661.541f75b1c4d4.7937.0 (deflated 62%)
  adding: content/splade/experiments/pt/checkpoint/validation_perf.txt (deflated 61%)
  adding: content/splade/experiments/pt/checkpoint/model/ (stored 0%)
  adding: content/splade/experiments/pt/checkpoint/model/config.json (deflated 53%)
  adding: content/splade/experiments/pt/checkpoint/model/tokenizer_config.json (deflated 35%)
  adding: content/splade/experiments/pt/checkpoint/model/vocab.txt (deflated 52%)
  adding: content/splade/experiments/pt/checkpoint/model/tokenizer.json (deflated 72%)
  adding: content/splade/experiments/pt/checkpoint/model/pytorch_model.bin (deflated 7%)
  adding: content/splade/experiments

In [None]:
# Copy the experiments archive to the project folder on Google Drive.
# shutil.copyfile returns the destination path, which the cell displays.
shutil.copyfile("/content/experiments_ids_384_max.zip", f"{main_dir}Projeto Final/experiments_ids_384_max.zip",)

'/content/gdrive/MyDrive/Unicamp/IA368-DD/Projeto Final/experiments_ids_384_max.zip'