# SPLADE for Portuguese

Authors: Leonardo Ávila, Monique Monteiro

Inspired by https://github.com/naver/splade

This notebook contains the training code for a SPLADEv2 model for the Portuguese language. Dataset: mMARCO (MS MARCO automatically translated to Portuguese).

In [None]:
# Print the GPU, driver and CUDA details of the current runtime.
!nvidia-smi

Sun Jun 25 22:05:11 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   30C    P0    42W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## Mount Google Drive

In [None]:
# Mount Google Drive under /content/gdrive (a forced remount is requested);
# the Drive-based paths built from main_dir in later cells live under this mount point.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


## Set main_dir

In [None]:
# Whose Google Drive layout to use: "monique" selects her project folder,
# any other value (e.g. "leo") falls back to the default folder below.
user = "leo"
# Root of the project inside the mounted Drive; always ends with a slash so
# that later cells can append relative paths directly.
main_dir = (
    '/content/gdrive/MyDrive/Unicamp-projeto-final/'
    if user == "monique"
    else '/content/gdrive/MyDrive/Unicamp/IA368-DD/'
)

## Libraries installation

In [None]:
%%shell
# Install the evaluation library, the SPLADE fork (quiet mode) and the newest hydra-core.
# NOTE(review): none of these installs is version-pinned, so a rerun may resolve
# different releases than the ones shown in the captured output.
pip install pytrec_eval
pip install git+https://github.com/leobavila/splade.git -q
pip install hydra-core --upgrade

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pytrec_eval
  Downloading pytrec_eval-0.5.tar.gz (15 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pytrec_eval
  Building wheel for pytrec_eval (setup.py) ... [?25l[?25hdone
  Created wheel for pytrec_eval: filename=pytrec_eval-0.5-cp310-cp310-linux_x86_64.whl size=293481 sha256=2838c4d3b3d936999f98d967c0d2f1b49db1b320438c421334e5429e44e7861f
  Stored in directory: /root/.cache/pip/wheels/51/3a/cd/dcc1ddfc763987d5cb237165d8ac249aa98a23ab90f67317a8
Successfully built pytrec_eval
Installing collected packages: pytrec_eval
Successfully installed pytrec_eval-0.5
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.0/4.0 MB[0m [31m32.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m74.7/74.7 kB[0m [31m12.1 MB/s[0m 



## Libraries Import

In [None]:
import os
import csv
import tqdm
import json
import shutil
import pytrec_eval
import pandas as pd
from collections import defaultdict

## Copying data files to the expected data structure

### Clone repository

In [None]:
# Clone the SPLADE fork into ./splade; later cells copy data and configs into
# this checkout and run the training module from inside it.
!git clone https://github.com/leobavila/splade.git

Cloning into 'splade'...
remote: Enumerating objects: 449, done.[K
remote: Counting objects: 100% (202/202), done.[K
remote: Compressing objects: 100% (109/109), done.[K
remote: Total 449 (delta 142), reused 94 (delta 93), pack-reused 247[K
Receiving objects: 100% (449/449), 3.08 MiB | 35.39 MiB/s, done.
Resolving deltas: 100% (210/210), done.


In [None]:
%%shell
# Bring the splade/ checkout up to date with its remote.
cd splade/
git pull

Already up to date.




### Settings

In [None]:
# create directories
# (mkdir -p creates missing parents and does not fail if the directory exists)
!mkdir -p splade/data/pt/triplets
!mkdir -p splade/data/pt/val_retrieval/collection
!mkdir -p splade/data/pt/val_retrieval/queries
!mkdir -p splade/data/pt/full_collection
!mkdir -p splade/data/pt/dev_queries
# NOTE(review): the two paths below are relative ("content/..."), so they are
# created under the current working directory -- presumably /content, which
# would give /content/content/data/...; confirm this is intended.
!mkdir -p "content/data/m_marco/val_retrieval/collection"
!mkdir -p "content/data/m_marco/val_retrieval/queries"

In [None]:
# get datasets
# Copy the archive from Drive, extract it in the current working directory,
# then move the extracted content/data tree to /content/data and remove the
# now-unneeded /content/content directory.
shutil.copyfile(f"{main_dir}Projeto Final/data.zip", "/content/data.zip")
!unzip data.zip
!mv /content/content/data /content/data
!rm -rf /content/content/

# download triplets ids
!wget https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/triples.train.ids.small.tsv

# get data_m_marco_val_retrieval
# NOTE(review): this archive is extracted after /content/content was removed;
# later cells read content/data/m_marco/val_retrieval/... relative to the working
# directory, so the archive presumably recreates that tree -- confirm.
shutil.copyfile(f"{main_dir}Projeto Final/data_m_marco_val_retrieval.zip", "/content/data_m_marco_val_retrieval.zip")
!unzip data_m_marco_val_retrieval.zip

Archive:  data.zip
  inflating: content/data/m_marco/queries_train.tsv  
   creating: content/data/m_marco/.ipynb_checkpoints/
  inflating: content/data/m_marco/triplets.tsv  
  inflating: content/data/m_marco/queries_dev.full.tsv  
  inflating: content/data/m_marco/queries_dev.tsv  
  inflating: content/data/m_marco/corpus.tsv  
   creating: content/data/m_robust/
  inflating: content/data/m_robust/qrels.robust04.txt  
   creating: content/data/m_robust/.ipynb_checkpoints/
  inflating: content/data/m_robust/queries.tsv  
  inflating: content/data/m_robust/corpus.tsv  
--2023-06-25 22:14:48--  https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/triples.train.ids.small.tsv
Resolving huggingface.co (huggingface.co)... 13.224.249.43, 13.224.249.44, 13.224.249.10, ...
Connecting to huggingface.co (huggingface.co)|13.224.249.43|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs.huggingface.co/datasets/unicamp-dl/mmarco/2263e5c34e97

In [None]:
# move file to directories
# (files are copied, not moved: the originals under /content/data stay in place)
# mMARCO corpus, training queries and triplet ids -> splade/data/pt/triplets
!cp /content/data/m_marco/corpus.tsv /content/splade/data/pt/triplets/corpus.tsv
!cp /content/data/m_marco/queries_train.tsv /content/splade/data/pt/triplets/queries_train.tsv
!cp /content/triples.train.ids.small.tsv /content/splade/data/pt/triplets/raw.tsv
# m_robust corpus and queries -> full_collection and dev_queries
!cp /content/data/m_robust/corpus.tsv /content/splade/data/pt/full_collection/raw.tsv
!cp /content/data/m_robust/queries.tsv /content/splade/data/pt/dev_queries/raw.tsv

In [None]:
# get configs
# Copy the Portuguese config files kept on Drive into the matching
# splade/conf sub-directories; {main_dir} is interpolated from the Python variable.
!cp "{main_dir}Projeto Final/Experimentos/splade/conf/config_splade_pt.yaml" splade/conf/
!cp "{main_dir}Projeto Final/Experimentos/splade/conf/train/config/splade_pt.yaml" splade/conf/train/config
!cp "{main_dir}Projeto Final/Experimentos/splade/conf/train/data/pt.yaml" splade/conf/train/data
!cp "{main_dir}Projeto Final/Experimentos/splade/conf/train/model/splade_bertimbau_base.yaml" splade/conf/train/model/
!cp "{main_dir}Projeto Final/Experimentos/splade/conf/index/pt.yaml" splade/conf/index
!cp "{main_dir}Projeto Final/Experimentos/splade/conf/retrieve_evaluate/pt.yaml" splade/conf/retrieve_evaluate
!cp "{main_dir}Projeto Final/Experimentos/splade/conf/flops/pt.yaml" splade/conf/flops

### Generating/copying validation collection

Now we need to generate a validation collection: it will be based on the same validation data used in https://github.com/naver/splade.

In [None]:
# Looks up the respective translated passages.
def gen_val_collection():
  """Write the validation collection TSV with passages taken from the corpus.

  Reads the document ids from the first column of
  ``{main_dir}Projeto Final/msmarco/val_retrieval/collection/raw.tsv`` and, for
  every id that also appears in the first column of
  ``content/data/m_marco/corpus.tsv``, writes a tab-separated ``doc_id, text``
  row to ``content/data/m_marco/val_retrieval/collection/raw.tsv``. The text is
  the second column of the first corpus row with that id. Ids that are missing
  from the corpus are skipped. Output rows follow the order of the input
  validation collection.

  Relies on the notebook-level ``main_dir`` variable and on relative paths, so
  the result depends on the current working directory.

  NOTE(review): no call to this function is visible in this part of the
  notebook; the following cell copies the output file, so it was presumably
  generated beforehand -- confirm.
  """
  df_val_collection = pd.read_csv(f"{main_dir}Projeto Final/msmarco/val_retrieval/collection/raw.tsv", sep='\t', header=None)
  df_corpus = pd.read_csv("content/data/m_marco/corpus.tsv", sep='\t', header=None)

  # Build the id -> text lookup once instead of scanning the whole corpus frame
  # for every validation id. keep="first" preserves the "first matching row wins"
  # behaviour of the previous per-id filter.
  df_corpus_unique = df_corpus.drop_duplicates(subset=0, keep="first")
  text_by_doc_id = dict(zip(df_corpus_unique[0], df_corpus_unique[1]))

  # newline='' is what the csv module requires for files handed to csv.writer.
  with open("content/data/m_marco/val_retrieval/collection/raw.tsv", 'w', newline='') as f:
    writer = csv.writer(f, delimiter='\t')  # Set the delimiter as a tab

    for doc_id in tqdm.tqdm(df_val_collection[0], total=df_val_collection.shape[0]):
      if doc_id in text_by_doc_id:
        writer.writerow([doc_id, text_by_doc_id[doc_id]])

In [None]:
# Copy the validation collection TSV into the splade data layout
# (both paths are relative to the current working directory).
!cp content/data/m_marco/val_retrieval/collection/raw.tsv splade/data/pt/val_retrieval/collection

### Generating/copying validation queries

In [None]:
def gen_val_queries():
  """Write the validation queries TSV with query texts taken from the dev set.

  Reads the query ids from the first column of
  ``{main_dir}Projeto Final/msmarco/val_retrieval/queries/raw.tsv`` and, for
  every id that also appears in the first column of
  ``content/data/m_marco/queries_dev.full.tsv``, writes a tab-separated
  ``q_id, text`` row to ``content/data/m_marco/val_retrieval/queries/raw.tsv``.
  The text is the second column of the first dev-queries row with that id. Ids
  that are missing from the dev queries are skipped. Output rows follow the
  order of the input validation queries.

  Relies on the notebook-level ``main_dir`` variable and on relative paths, so
  the result depends on the current working directory.

  NOTE(review): no call to this function is visible in this part of the
  notebook; the following cell copies the output file, so it was presumably
  generated beforehand -- confirm.
  """
  df_val_queries = pd.read_csv(f"{main_dir}Projeto Final/msmarco/val_retrieval/queries/raw.tsv", sep='\t', header=None)
  df_queries_dev_full = pd.read_csv("content/data/m_marco/queries_dev.full.tsv", sep='\t', header=None)

  # Build the id -> text lookup once instead of filtering the whole dev-queries
  # frame for every validation id. keep="first" preserves the "first matching
  # row wins" behaviour of the previous per-id filter.
  df_queries_unique = df_queries_dev_full.drop_duplicates(subset=0, keep="first")
  text_by_q_id = dict(zip(df_queries_unique[0], df_queries_unique[1]))

  # newline='' is what the csv module requires for files handed to csv.writer.
  with open("content/data/m_marco/val_retrieval/queries/raw.tsv", 'w', newline='') as f:
    writer = csv.writer(f, delimiter='\t')  # Set the delimiter as a tab

    for q_id in tqdm.tqdm(df_val_queries[0], total=df_val_queries.shape[0]):
      if q_id in text_by_q_id:
        writer.writerow([q_id, text_by_q_id[q_id]])

In [None]:
# Copy the validation queries TSV into the splade data layout
# (both paths are relative to the current working directory).
!cp content/data/m_marco/val_retrieval/queries/raw.tsv splade/data/pt/val_retrieval/queries

### Copying QREL validation data

In [None]:
# Copy the validation qrel.json from Drive unchanged into the splade validation folder.
!cp "{main_dir}Projeto Final/msmarco/val_retrieval/qrel.json" splade/data/pt/val_retrieval

### Converts QREL information to JSON format.

In [None]:
# Convert the whitespace-separated qrels file into the nested mapping
# {query_id: {doc_id: relevance}} and store it as JSON.
# Per line, field 0 is the query id, field 2 the document id and field 3 the
# relevance label; field 1 is not used.
qrel = defaultdict(dict)

with open("/content/data/m_robust/qrels.robust04.txt", 'r') as file:
  for line in file:
    fields = line.split()
    if not fields:
      # Blank line (e.g. a trailing newline at the end of the file): nothing to record.
      continue
    q_id = fields[0]
    doc_id = fields[2]
    # Relevance is stored as an integer rather than the raw string:
    # pytrec_eval expects integer relevance judgements in the qrel mapping.
    rel = int(fields[3])
    qrel[q_id][doc_id] = rel

with open('/content/splade/data/pt/dev_qrel.json', 'w') as file:
    json.dump(qrel, file)

## Training

In [None]:
# When True, restore a previously saved experiments archive from Drive into the
# splade checkout before the training cell runs (presumably so that training
# continues from the saved checkpoint -- confirm against the training config).
resume_training = True
if resume_training:
    shutil.copyfile(f"{main_dir}Projeto Final/experiments.zip", "/content/experiments.zip")
    # The archive is extracted in the current working directory; the mv below
    # expects it to expand to /content/content/splade/experiments.
    !unzip /content/experiments.zip
    !mv /content/content/splade/experiments /content/splade

In [None]:
%%shell
# Launch training from inside the checkout: the checkout directory is appended
# to PYTHONPATH so that `python3 -m splade....` resolves, SPLADE_CONFIG_NAME is
# exported to name the Portuguese config file, and the checkpoint, index and
# output directories are passed as command-line overrides (relative to splade/).
cd splade/
export PYTHONPATH=$PYTHONPATH:$(pwd)
export SPLADE_CONFIG_NAME="config_splade_pt.yaml"
python3 -m splade.train_from_triplets_ids \
  config.checkpoint_dir=experiments/pt/checkpoint \
  config.index_dir=experiments/pt/index \
  config.out_dir=experiments/pt/out

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
 35% 6080/17269 [05:20<09:20, 19.97it/s][A
 35% 6083/17269 [05:20<09:24, 19.82it/s][A
 35% 6086/17269 [05:21<09:33, 19.49it/s][A
 35% 6089/17269 [05:21<09:23, 19.84it/s][A
 35% 6091/17269 [05:21<09:32, 19.54it/s][A
 35% 6093/17269 [05:21<09:30, 19.58it/s][A
 35% 6095/17269 [05:21<09:44, 19.10it/s][A
 35% 6098/17269 [05:21<09:19, 19.95it/s][A
 35% 6101/17269 [05:21<09:25, 19.76it/s][A
 35% 6103/17269 [05:21<09:32, 19.51it/s][A
 35% 6105/17269 [05:22<09:52, 18.84it/s][A
 35% 6107/17269 [05:22<10:14, 18.16it/s][A
 35% 6109/17269 [05:22<10:18, 18.04it/s][A
 35% 6111/17269 [05:22<10:19, 18.00it/s][A
 35% 6113/17269 [05:22<10:14, 18.14it/s][A
 35% 6115/17269 [05:22<10:19, 18.00it/s][A
 35% 6118/17269 [05:22<09:37, 19.32it/s][A
 35% 6121/17269 [05:22<09:44, 19.07it/s][A
 35% 6123/17269 [05:23<09:55, 18.73it/s][A
 35% 6125/17269 [05:23<09:51, 18.85it/s][A
 35% 6127/17269 [05:23<09:41, 19.15it/s][A
 35% 6130/1



In [None]:
# Recursively zip the experiments directory into experiments_ids.zip in the
# current working directory; entries are stored with the content/splade/experiments/ prefix.
!zip -r experiments_ids.zip /content/splade/experiments

  adding: content/splade/experiments/ (stored 0%)
  adding: content/splade/experiments/pt/ (stored 0%)
  adding: content/splade/experiments/pt/checkpoint/ (stored 0%)
  adding: content/splade/experiments/pt/checkpoint/training_perf.txt (deflated 55%)
  adding: content/splade/experiments/pt/checkpoint/tensorboard/ (stored 0%)
  adding: content/splade/experiments/pt/checkpoint/tensorboard/events.out.tfevents.1687731900.0f580f3bff85.9183.0 (deflated 62%)
  adding: content/splade/experiments/pt/checkpoint/tensorboard/events.out.tfevents.1687650276.575675fd9b16.9093.0 (deflated 63%)
  adding: content/splade/experiments/pt/checkpoint/tensorboard/events.out.tfevents.1687705289.575675fd9b16.235074.0 (deflated 62%)
  adding: content/splade/experiments/pt/checkpoint/config.yaml (deflated 56%)
  adding: content/splade/experiments/pt/checkpoint/model/ (stored 0%)
  adding: content/splade/experiments/pt/checkpoint/model/pytorch_model.bin (deflated 7%)
  adding: content/splade/experiments/pt/checkpo

In [None]:
# Back up the zipped experiments to the project folder on Google Drive.
# copyfile returns the destination path, which the cell displays as its output.
shutil.copyfile(
    src="/content/experiments_ids.zip",
    dst=f"{main_dir}Projeto Final/experiments_ids.zip",
)

'/content/gdrive/MyDrive/Unicamp/IA368-DD/Projeto Final/experiments_ids.zip'