# InPars - Finetuning

Author: Monique Monteiro (moniquelouise@gmail.com)

In [None]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [70]:
main_dir = "/content/gdrive/MyDrive/Unicamp-aula-9"

## Libraries installation

In [71]:
!pip install transformers -q

In [72]:
!pip install jsonlines -q

In [73]:
!pip install evaluate -q

In [74]:
!pip install trectools -q

In [75]:
import random
import torch
import torch.nn.functional as F
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from statistics import mean, stdev

Random seeds definition, to enable replication of results.

In [76]:
random.seed(123)
np.random.seed(123)
torch.manual_seed(123)

<torch._C.Generator at 0x7fc6781dcef0>

## Dataset processing

In [77]:
import jsonlines

id_to_text = dict()

with jsonlines.open(f"{main_dir}/trec-covid/corpus.jsonl") as reader:
  for item in reader:
    id = item["_id"]
    text = item["title"] + ' ' + item["text"]
    id_to_text[id] = text

In [78]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split

def generate_training_data(dataset_file, model_name):
  dataset = []
  dataset_ids = []
  i = 0

  with open(dataset_file, 'r') as f:
    for line in f:
      data = json.loads(line)
      query = data["query"]
      positive_doc_id = data["positive_doc_id"]
      negative_doc_ids = data["negative_doc_ids"]
      
      #Chooses a random negative document
      negative_doc_id = random.choice(negative_doc_ids)

      #Gets the documents texts
      positive_doc = id_to_text[positive_doc_id]
      negative_doc = id_to_text[negative_doc_id]

      dataset.append((query, positive_doc, negative_doc))
      dataset_ids.append((i, positive_doc_id, negative_doc_id))

      i+=1

  df = pd.DataFrame(dataset, columns=['query', 'pos', 'neg'])

  df_pos = pd.DataFrame()
  df_neg = pd.DataFrame()

  for index, row in df.iterrows():
    if model_name == 'microsoft/MiniLM-L12-H384-uncased':
      df_pos = df_pos.append({"query":row[0], "passage":row[1], "score":1.0}, 
                             ignore_index=True)
      df_neg = df_neg.append({"query":row[0], "passage":row[2], "score":0.0}, 
                             ignore_index=True)
    elif model_name == 'cross-encoder/ms-marco-MiniLM-L-6-v2':
      df_pos = df_pos.append({"query":row[0], "passage":row[1], "score":True}, 
                             ignore_index=True)
      df_neg = df_neg.append({"query":row[0], "passage":row[2], "score":False}, 
                             ignore_index=True)

  print(model_name)
  print(df_pos.head())
  X_train_pos = df_pos.drop("score", axis=1)
  Y_train_pos = df_pos["score"]

  X_train_pos, X_val_pos, Y_train_pos, Y_val_pos = train_test_split(X_train_pos, 
                                                                    Y_train_pos, 
                                                                    test_size=0.1, 
                                                                    random_state=42)

  X_train_neg = df_neg.drop("score", axis=1)
  Y_train_neg = df_neg["score"]

  X_train_neg, X_val_neg, Y_train_neg, Y_val_neg = train_test_split(X_train_neg, 
                                                                    Y_train_neg, 
                                                                    test_size=0.1, 
                                                                    random_state=42)

  X_train = pd.concat([X_train_pos, X_train_neg], axis=0, ignore_index=True)
  Y_train = pd.concat([Y_train_pos, Y_train_neg], axis=0, ignore_index=True)
  X_val = pd.concat([X_val_pos, X_val_neg], axis=0, ignore_index=True)
  Y_val = pd.concat([Y_val_pos, Y_val_neg], axis=0, ignore_index=True)

  return X_train, Y_train, X_val, Y_val


### Training Loop

In [79]:
from tqdm import tqdm 

def evaluate(model, dataloader, set_name, model_name):
    losses = []
    correct = 0
    model.eval()
    with torch.no_grad():
        for batch in tqdm(dataloader, mininterval=0.5, desc=set_name, disable=False):
            outputs = model(**batch.to(device))
            loss_val = outputs.loss
            losses.append(loss_val.cpu().item())
            preds = outputs.logits.argmax(dim=1)
            
            if model_name == 'cross-encoder/ms-marco-MiniLM-L-6-v2':
              preds  = (outputs.logits.view(-1)>0).float()
            
            correct += (preds == batch['labels']).sum().item()

    print(f'{set_name} loss: {mean(losses):0.3f}; {set_name} accuracy: {correct / len(dataloader.dataset):0.3f}')
    return correct / len(dataloader.dataset)
     

In [80]:
from torch import nn
from torch import optim
from tqdm.auto import tqdm
from transformers import get_linear_schedule_with_warmup

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [81]:
from transformers.optimization import get_constant_schedule
import os
import shutil

def train(model_name, epochs = 5, lr=5e-5):
  model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)
  print('Parameters', model.num_parameters())

  optimizer = optim.AdamW(model.parameters(), lr)
  num_training_steps = epochs * len(dataloader_train)

  num_warmup_steps = int(num_training_steps * 0.1)
  scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps, 
                                              num_training_steps)
  #scheduler = get_constant_schedule(optimizer)

  evaluate(model=model, dataloader=dataloader_valid, set_name='Valid', 
           model_name=model_name)
  best_acc = 0

  # Training loop
  for epoch in tqdm(range(epochs), desc='Epochs'):
      model.train()
      train_losses = []
      for batch in tqdm(dataloader_train, mininterval=0.5, desc='Train', 
                        disable=False):
          optimizer.zero_grad()
          outputs = model(**batch.to(device))
          loss = outputs.loss
          loss.backward()
          optimizer.step()
          scheduler.step()
          train_losses.append(loss.cpu().item())

      print(f'Epoch: {epoch + 1} Training loss: {mean(train_losses):0.2f}')
      acc = evaluate(model=model, dataloader=dataloader_valid, set_name='Valid', 
                     model_name=model_name)
      #Saves the best checkpoint
      if acc > best_acc:
        best_acc = acc
        if os.path.exists(f'{MODELS_PATH}/best_checkpoint'):
          shutil.rmtree(f'{MODELS_PATH}/best_checkpoint')
        model.save_pretrained(f'{MODELS_PATH}/best_checkpoint')
  
  return model

In [82]:
MODELS_PATH = '/content/gdrive/MyDrive/Unicamp-aula-9'

### Reranking

Evaluation on TREC-COVID

In [83]:
id_to_query = dict()

with jsonlines.open(f"{main_dir}/trec-covid/queries.jsonl") as reader:
  for item in reader:
    id = item["_id"]
    text = item["text"]
    id_to_query[id] = text

In [84]:
import jsonlines

id_to_doc = dict()

with jsonlines.open(f"{main_dir}/trec-covid/corpus.jsonl") as reader:
  for item in reader:
    id = item["_id"]
    text = item["title"] + ' ' + item["text"]
    id_to_doc[id] = text

In [85]:
!head {main_dir}/trec-covid/run.trec-covid.bm25tuned.txt

1 Q0 dv9m19yk 1 4.158100 Anserini
1 Q0 kgifmjvb 2 3.338900 Anserini
1 Q0 wmfcey6f 3 3.338899 Anserini
1 Q0 safr9z37 4 3.220100 Anserini
1 Q0 0paafp5j 5 3.207300 Anserini
1 Q0 96zsd27n 6 3.207299 Anserini
1 Q0 4dtk1kyh 7 3.184800 Anserini
1 Q0 lhd0jn0z 8 2.903200 Anserini
1 Q0 55dihml5 9 2.899800 Anserini
1 Q0 qtx0d5f8 10 2.888800 Anserini


In [86]:
import pickle
import os

def tokenize_test_queries_and_passages():
  tokenized_queries = None
  tokenized_passages = None

  if os.path.exists(f"{main_dir}/trec-covid/tok_queries_test.pickle"):
    with open(f"{main_dir}/trec-covid/tok_queries_test.pickle", "rb") as f:
      print("Loading test queries...")
      tokenized_queries = pickle.load(f) 

  if os.path.exists(f"{main_dir}/trec-covid/tok_passages_test.pickle"):
    with open(f"{main_dir}/trec-covid/tok_passages_test.pickle", "rb") as f:
      print("Loading test passages...")
      tokenized_passages = pickle.load(f) 

  query_ids = []
  queries = []
  passage_ids = []
  passages = []

  with open(f'{main_dir}/trec-covid/run.trec-covid.bm25tuned.txt') as f:
    for line in f:
        fields = line.strip().split()
        query_id = fields[0]
        query_ids.append(query_id)
        passage_id = fields[2]
        passage_ids.append(passage_id)
        
        if not tokenized_queries:
          query_text = id_to_query[query_id]
          queries.append(query_text)

        if not tokenized_passages:
          passage_text = id_to_doc[passage_id]
          passages.append(passage_text)

  if not tokenized_queries:
    tokenized_queries = tokenizer(queries, max_length=max_length_query, truncation=True)

    with open(f"{main_dir}/trec-covid/tok_queries_test.pickle", 'wb') as f:
      pickle.dump(tokenized_queries, f)

  if not tokenized_passages:
    tokenized_passages = tokenizer(passages, max_length=max_length_passage, truncation=True)

    with open(f"{main_dir}/trec-covid/tok_passages_test.pickle", 'wb') as f:
      pickle.dump(tokenized_passages, f)

  return tokenized_queries, tokenized_passages, query_ids, passage_ids

In [87]:

def evaluate_test_dataset(model, dataloader, set_name, use_logits=False):
    scores = []
    model.eval()
    with torch.no_grad():
        for batch in tqdm(dataloader, mininterval=0.5, desc=set_name, disable=False):
            outputs = model(**batch.to(device))
            if use_logits:
              # Usa os logits brutos
              pos_score = outputs.logits[:,1]
            else:
              # Usa os logits normalizados pelo softmax (por default)
              pos_score = torch.softmax(outputs.logits,1)[:,1]
            scores = scores + pos_score.tolist()
    return scores

In [88]:
def evaluate_ndcg_10(scores, model_name, eval_desc, query_ids, passage_ids):
  zipped_results = []

  #Por alguma razão misteriosa, o zip do Python não funcionou, deixou a lista 
  #vazia ou impossível de ser iterada.
  for i, query_id in enumerate(query_ids):
    zipped_results.append((query_id, passage_ids[i], scores[i]))

  #Quebra a lista em sublistas por query
  prev_query_id = -1
  sublists = []
  current_list = []

  for query_id, passage_id, score in zipped_results:
    if query_id != prev_query_id:
      if len(current_list) > 0:
        sublists.append(current_list)
        current_list = []
    current_list.append((query_id, passage_id, score))
    prev_query_id = query_id

  if len(current_list) > 0:
    sublists.append(current_list)

  # Ordena cada sublista
  sorted_list = []

  for sublist in sublists:
    sorted_sublist = sorted(sublist, key=lambda x: x[2], reverse=True)
    sorted_list += sorted_sublist

  # Gera o arquivo de run no formato TREC
  trec_run_file = f"{main_dir}/trec-covid/run.trec-covid.bert_reranked_{model_name.replace('/', '_')}_{eval_desc}.trec"
  with open(trec_run_file, "w") as f:
    for i, (query_id, passage_id, score) in enumerate(sorted_list):
      f.write(f'{query_id}\t{passage_id}\t{i+1}\t{score}\tbert_reranked_{model_name}\n')

  return trec_run_file

In [89]:
import pandas as pd

qrel = pd.read_csv(f"{main_dir}/trec-covid/test.tsv", sep="\t", header=None, 
                   skiprows=1, names=["query", "docid", "rel"])
qrel["q0"] = "q0"
qrel = qrel.to_dict(orient="list")

In [90]:
!head {main_dir}/trec-covid/test.tsv

query-id	corpus-id	score
1	005b2j4b	2
1	00fmeepz	1
1	g7dhmyyo	2
1	0194oljo	1
1	021q9884	1
1	02f0opkr	1
1	047xpt2c	0
1	04ftw7k9	0
1	pl9ht0d0	0


In [91]:
from evaluate import load

def eval_ndcg10(run):
  trec_eval = load("trec_eval")
  results = trec_eval.compute(predictions=[run], references=[qrel])
  return results['NDCG@10'] 

## Finetuning with cross-encoder/ms-marco-MiniLM-L-6-v2

In [92]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils import data
from transformers import BatchEncoding

In [93]:
model_name = "cross-encoder/ms-marco-MiniLM-L-6-v2"


In [94]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [129]:
class MSMARCODataset(data.Dataset):
    def __init__(self, tokenizer, query, passages, targets, max_lenght=356, 
                 return_token_type_ids=False):
        self.tokenizer = tokenizer
        self.query = query
        self.passages = passages
        self.targets = targets
        self.max_lenght = max_lenght
        self.return_token_type_ids = return_token_type_ids
    
    def __len__(self):
        return len(self.query)

    def __getitem__(self, idx):
        instruction_token = self.tokenizer(self.query[idx],self.passages[idx],
                                           max_length=self.max_lenght, 
                                           truncation=True,
                                           padding="max_length", 
                                           return_tensors='pt',
                                           return_token_type_ids=self.return_token_type_ids)


        #token_type_ids
        if self.return_token_type_ids:
          return {'input_ids':torch.squeeze(instruction_token['input_ids']).long().to(device),\
                  'token_type_ids': torch.squeeze(instruction_token['token_type_ids']).long().to(device),\
                  'attention_mask':torch.squeeze(instruction_token['attention_mask']).long().to(device), \
                  'labels':torch.tensor(self.targets[idx], dtype=torch.float16)}
        else:
          return {'input_ids':torch.squeeze(instruction_token['input_ids']).long().to(device),\
                  'attention_mask':torch.squeeze(instruction_token['attention_mask']).long().to(device), \
                  'labels':torch.tensor(self.targets[idx], dtype=torch.float16)}
              
def collate_fn(batch):
    return BatchEncoding(tokenizer.pad(batch, return_tensors='pt'))
    

In [96]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [97]:
import pickle
import os

query_ids = []
queries = []
passage_ids = []
passages = []

with open(f'{main_dir}/trec-covid/run.trec-covid.bm25tuned.txt') as f:
  for line in f:
      fields = line.strip().split()
      query_id = fields[0]
      query_ids.append(query_id)
      passage_id = fields[2]
      passage_ids.append(passage_id)

      query_text = id_to_query[query_id]
      queries.append(query_text)

      passage_text = id_to_doc[passage_id]
      passages.append(passage_text)


In [130]:
dataset_test = MSMARCODataset(tokenizer, queries, passages, [1]*len(queries), return_token_type_ids=True)

In [131]:
dataloader_test = data.DataLoader(dataset_test, batch_size=32, shuffle=False, collate_fn=collate_fn)

In [132]:

def evaluate_test_dataset2(model, dataloader, set_name):
  scores = []
  model.eval()
  with torch.no_grad():
    for batch in tqdm(dataloader, mininterval=0.5, desc=set_name, disable=False):
      outputs = model(**batch.to(device))
      # Usa os logits brutos
      pos_score = outputs.logits[:,0]
      scores = scores + pos_score.tolist()
  return scores

## Finetuning with more datasets

In [133]:
import json
import glob

# Set the path of the directory containing JSONL files
directory_path = f"{main_dir}/trec-covid/datasets/"
print(directory_path)

# Define an empty list to store the concatenated data
data = []

# Loop through all the JSONL files in the directory
for filename in glob.glob(directory_path + "*.jsonl"):
  print(filename)
  with open(filename, "r") as file:
      # Read each line in the file and append to the data list
      for line in file:
          data.append(json.loads(line))

# Write the concatenated data to a new JSONL file
with open("concatenated.jsonl", "w") as outfile:
    for item in data:
        # Write each item as a JSON object on a separate line
        outfile.write(json.dumps(item) + "\n")


/content/gdrive/MyDrive/Unicamp-aula-9/trec-covid/datasets/
/content/gdrive/MyDrive/Unicamp-aula-9/trec-covid/datasets/monique_monteiro_1000_queries.jsonl
/content/gdrive/MyDrive/Unicamp-aula-9/trec-covid/datasets/eduseiti_100_queries_expansion_20230501_01.jsonl
/content/gdrive/MyDrive/Unicamp-aula-9/trec-covid/datasets/gustavo_1k_cohere.jsonl
/content/gdrive/MyDrive/Unicamp-aula-9/trec-covid/datasets/hugo_padovani_query_generation.jsonl
/content/gdrive/MyDrive/Unicamp-aula-9/trec-covid/datasets/juliatessler_1000_queries.jsonl
/content/gdrive/MyDrive/Unicamp-aula-9/trec-covid/datasets/leandro_carisio_01.jsonl
/content/gdrive/MyDrive/Unicamp-aula-9/trec-covid/datasets/leonardo_avila_queries_v1.jsonl
/content/gdrive/MyDrive/Unicamp-aula-9/trec-covid/datasets/leonardo_pacheco_1k_generated_queries_20230502.jsonl
/content/gdrive/MyDrive/Unicamp-aula-9/trec-covid/datasets/manoel_1k_generated_queries_20230430.jsonl
/content/gdrive/MyDrive/Unicamp-aula-9/trec-covid/datasets/manoel_2k_generated

In [105]:
!head concatenated.jsonl

{"query": "What is the most suitable protein for a diagnostic approach for Salmonella Enteritidis and why?", "positive_doc_id": "m1cmkkw3", "negative_doc_ids": ["0o3mryu1", "qpq7i1ya", "j5mkparg", "auoo0dm5", "dqfvrerw"]}
{"query": "What is Cryptosporidium parvum and why is it a major cause of disease in both humans and animals?", "positive_doc_id": "ukbl0svm", "negative_doc_ids": ["k3u2nvpe", "gc11fyms", "xoq9qblv", "20o4ufa3", "3huo5nf0"]}
{"query": "What is the role of the renin-angiotensin-aldosterone system in the context of SARS-CoV-2 infection?", "positive_doc_id": "12o4zey2", "negative_doc_ids": ["6gd6nwpu", "dt4t2wos", "8zwfkken", "sv7xpi4f", "6pf73z08"]}
{"query": "What are the functions of individual endolysosomal proteases in cellular processes such as autophagy and lipoprotein particle degradation?", "positive_doc_id": "eqv6a7tj", "negative_doc_ids": ["gmrty2uu", "uzn214j6", "032utjfh", "efet3ozc", "0muwl6oc"]}
{"query": "What is the prevalence of olfactory dysfunction in 

In [134]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [135]:
X_train, Y_train, X_val, Y_val = generate_training_data("concatenated.jsonl", model_name)

cross-encoder/ms-marco-MiniLM-L-6-v2
                                               query  \
0  What is the most suitable protein for a diagno...   
1  What is Cryptosporidium parvum and why is it a...   
2  What is the role of the renin-angiotensin-aldo...   
3  What are the functions of individual endolysos...   
4  What is the prevalence of olfactory dysfunctio...   

                                             passage  score  
0  Rapid identification of novel antigens of Salm...   True  
1  Cryptosporidium and host resistance: historica...   True  
2  Understanding the Renin-Angiotensin-Aldosteron...   True  
3  Specific functions of lysosomal proteases in e...   True  
4  Olfactory and rhinological evaluations in SARS...   True  


In [136]:
train_queries = list(X_train["query"])
train_passages = list(X_train["passage"])
val_queries = list(X_val["query"])
val_passages = list(X_val["passage"])

In [137]:
len(train_queries)

31808

In [138]:
len(val_queries)

3536

In [139]:
dataset_train = MSMARCODataset(tokenizer, train_queries, train_passages, Y_train)
assert len(dataset_train[0]['input_ids']) > 0
assert len(dataset_train[1]['attention_mask']) > 0

In [140]:
dataset_val = MSMARCODataset(tokenizer, val_queries, val_passages, Y_val)
assert len(dataset_val[0]['input_ids']) > 0
assert len(dataset_val[1]['attention_mask']) > 0

In [141]:
from torch.utils import data

In [142]:

dataloader_train = data.DataLoader(dataset_train, batch_size=32, shuffle=True, collate_fn=collate_fn)
dataloader_valid = data.DataLoader(dataset_val, batch_size=32, shuffle=False, collate_fn=collate_fn)


In [None]:
model = train(model_name, 10, lr=3e-5)

In [143]:
 #model = AutoModelForSequenceClassification.from_pretrained(f'{MODELS_PATH}/best_checkpoint').to(device)
 model = AutoModelForSequenceClassification.from_pretrained(f'{MODELS_PATH}/best_checkpoint-cross-encoder').to(device)

In [117]:
evaluate(model=model, dataloader=dataloader_valid, set_name='Valid', model_name=model_name)

Valid:   0%|          | 0/111 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Valid loss: 0.030; Valid accuracy: 0.889


0.8891402714932126

In [144]:
logit_scores = evaluate_test_dataset2(model=model, dataloader=dataloader_test, set_name='Test')

Test:   0%|          | 0/1563 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [145]:
trec_file_logits = evaluate_ndcg_10(logit_scores, model_name, "logits", query_ids, passage_ids)

In [146]:
import pandas as pd

run_trec_file_logits = pd.read_csv(trec_file_logits, sep="\t", header=None, 
                   skiprows=1, names=["query", "docid", "rank", "score", "system"])
run_trec_file_logits["q0"] = "q0"
run_trec_file_logits = run_trec_file_logits.to_dict(orient="list")

In [147]:
eval_ndcg10(run_trec_file_logits)

0.7022291382541727