### Calculating cross-encoder relevance scores

In [1]:
import pandas as pd
import ast
path_my = "data/adaptive_rag_2wikimultihopqa/test.csv"
data_my = pd.read_csv(path_my)

In [2]:
corpus = list(data_my['retrieved_contexts_oner_qa'].values)
retrieved_docs = [ast.literal_eval(doc) for doc in corpus]
queries = list(data_my['question_text'].values)

In [3]:
from transformers import BertTokenizer, BertForSequenceClassification
cross_encoder_tokenizer =  BertTokenizer.from_pretrained('bert-base-uncased')
cross_encoder_model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

  from .autonotebook import tqdm as notebook_tqdm
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
import torch
import torch.nn.functional as F
from transformers import BertTokenizer, BertForSequenceClassification
from tqdm import tqdm
import numpy as np

# Load tokenizer and model
cross_encoder_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
cross_encoder_model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

def cross_encode_batch(queries, docs, model, tokenizer):
    """
    Compute relevance scores in batch for multiple (query, document) pairs.

    Args:
        queries: List of queries (one query per document in docs).
        docs: List of documents.
        model: Cross-encoder model.
        tokenizer: Tokenizer for encoding input.
    
    Returns:
        List of relevance scores.
    """
    # Tokenize in batch
    inputs = tokenizer(
        queries, docs, 
        return_tensors="pt", 
        truncation=True, 
        padding=True
    )

    # Move to GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Perform inference
    with torch.no_grad():
        logits = model(**inputs).logits  # Shape: (batch_size, 2)

    # Compute relevance scores (softmax on class 1)
    relevance_scores = F.softmax(logits, dim=1)[:, 1].cpu().numpy()
    
    return relevance_scores

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Process all queries and retrieved documents in batch
relevance_scores = []

for query, docs in tqdm(zip(queries, retrieved_docs), total=len(queries)):
    batch_queries = [query] * len(docs)  # Duplicate query for each document
    scores = cross_encode_batch(batch_queries, docs, cross_encoder_model, cross_encoder_tokenizer)
    relevance_scores.append(scores)  # Append batch scores

100%|██████████| 500/500 [00:33<00:00, 14.83it/s]


In [6]:
for i in tqdm(range(len(data_my))):
    
    data_my.loc[i, 'context_relevance_min'] = np.min(relevance_scores[i])
    data_my.loc[i, 'context_relevance_max'] = np.max(relevance_scores[i])
    data_my.loc[i, 'context_relevance_mean'] = np.mean(relevance_scores[i])
    data_my.loc[i, 'context_length'] = len(data_my.loc[i, 'retrieved_contexts_oner_qa'])

100%|██████████| 500/500 [00:00<00:00, 3251.64it/s]


In [7]:
data_my.to_csv(path_my, index = False)

In [112]:
print(np.mean(data_my['context_relevance_mean']))
print(np.std(data_my['context_relevance_mean']))
print(np.min(data_my['context_relevance_mean']))
print(np.max(data_my['context_relevance_mean']))

0.3701108673810959
0.03696990921758264
0.3339402973651886
0.620703935623169
