In [1]:
from datasets import Dataset, load_dataset, load_from_disk
from pandas import read_pickle
import json
from re import compile

In [2]:
# Matches line feeds, tabs and the left-to-right mark (U+200E); every match is replaced by a space.
pattern_newline = compile(r'[\n\t\u200e]')
# Matches runs of one or more consecutive spaces so they can be collapsed into one.
pattern_multiple_spaces = compile(r' +')


def simple_cleaning(query: str) -> str:
    """Normalize the whitespace of ``query``.

    Line feeds, tabs and left-to-right marks are turned into spaces, runs of
    spaces are collapsed into a single space, and leading/trailing whitespace
    is stripped.

    Args:
        query: The raw text to clean.

    Returns:
        The cleaned text.
    """
    spaced = pattern_newline.sub(' ', query)
    collapsed = pattern_multiple_spaces.sub(' ', spaced)
    return collapsed.strip()

In [3]:
def _load_json(path: str):
    """Parse the JSON file at ``path`` and return the decoded object."""
    # Read as UTF-8 explicitly so decoding does not depend on the platform's
    # default locale encoding.
    with open(path, encoding='utf-8') as f:
        return json.load(f)


# One JSON file per split; the 'dev' file is used as the evaluation split.
data_train = _load_json('./ObliQADataset/ObliQA_train.json')
data_eval = _load_json('./ObliQADataset/ObliQA_dev.json')
data_test = _load_json('./ObliQADataset/ObliQA_test.json')

In [4]:
# Number of top-level entries in the train, eval and test splits, in that order.
tuple(map(len, (data_train, data_eval, data_test)))

(22295, 2788, 2786)

### Hard Negatives only for train data

In [5]:
# NOTE(review): read_pickle unpickles arbitrary Python objects; only load this
# file if it was produced by a trusted step of this project.
hard_negatives = read_pickle('./data/retrieved_train_hard_negatives.pkl')

# One row per (question, relevant passage) pair. Every row of a question shares
# the same anchor and the same two hard negatives.
train_set = []
for q in data_train:
    q_id = q['QuestionID']
    passages = q['Passages']
    if not passages:
        # No rows to emit; skip before looking up hard negatives for this question.
        continue
    # These values depend only on the question, so clean them once instead of
    # once per relevant passage.
    anchor = simple_cleaning(q['Question'])
    q_negatives = hard_negatives[q_id]
    negative = simple_cleaning(q_negatives[0]['text'])
    negative_2 = simple_cleaning(q_negatives[1]['text'])
    for rel_doc in passages:
        train_set.append({
            'anchor_id': q_id,
            'anchor': anchor,
            # The passage id is prepended to the passage text before cleaning.
            'positive': simple_cleaning(f"{rel_doc['PassageID']} {rel_doc['Passage']}"),
            'positive_id': f"{rel_doc['DocumentID']}-{rel_doc['PassageID']}",
            'negative': negative,
            'negative_2': negative_2,
        })

train_dataset = Dataset.from_list(train_set)

train_dataset.save_to_disk('./data/train_dataset')

train_dataset

Saving the dataset (0/1 shards):   0%|          | 0/29547 [00:00<?, ? examples/s]

Dataset({
    features: ['anchor_id', 'anchor', 'positive', 'positive_id', 'negative', 'negative_2'],
    num_rows: 29547
})

### Eval & Test datasets

In [6]:
# One row per (question, relevant passage) pair of the evaluation split.
eval_set = []
for q in data_eval:
    q_id = q['QuestionID']
    # The cleaned question is identical for every relevant passage; compute it once.
    anchor = simple_cleaning(q['Question'])
    for rel_doc in q['Passages']:
        eval_set.append({
            'anchor_id': q_id,
            'anchor': anchor,
            # Cleaned with simple_cleaning, exactly like the training positives,
            # so evaluation text is preprocessed the same way as training text.
            'positive': simple_cleaning(f"{rel_doc['PassageID']} {rel_doc['Passage']}"),
            'positive_id': f"{rel_doc['DocumentID']}-{rel_doc['PassageID']}",
        })

eval_dataset = Dataset.from_list(eval_set)

eval_dataset.save_to_disk('./data/eval_dataset')

eval_dataset

Saving the dataset (0/1 shards):   0%|          | 0/3677 [00:00<?, ? examples/s]

Dataset({
    features: ['anchor_id', 'anchor', 'positive', 'positive_id'],
    num_rows: 3677
})

In [7]:
# One row per (question, relevant passage) pair of the test split.
test_set = []
for q in data_test:
    q_id = q['QuestionID']
    # The cleaned question is identical for every relevant passage; compute it once.
    anchor = simple_cleaning(q['Question'])
    for rel_doc in q['Passages']:
        test_set.append({
            'anchor_id': q_id,
            'anchor': anchor,
            # Cleaned with simple_cleaning, exactly like the training positives,
            # so test text is preprocessed the same way as training text.
            'positive': simple_cleaning(f"{rel_doc['PassageID']} {rel_doc['Passage']}"),
            'positive_id': f"{rel_doc['DocumentID']}-{rel_doc['PassageID']}",
        })

test_dataset = Dataset.from_list(test_set)

test_dataset.save_to_disk('./data/test_dataset')

test_dataset

Saving the dataset (0/1 shards):   0%|          | 0/3666 [00:00<?, ? examples/s]

Dataset({
    features: ['anchor_id', 'anchor', 'positive', 'positive_id'],
    num_rows: 3666
})

In [8]:
from sentence_transformers.evaluation import TripletEvaluator, InformationRetrievalEvaluator

In [9]:
# Build a triplet evaluator from the training columns, using only the first
# hard-negative column ('negative'). As the last expression of the cell, the
# evaluator object itself is displayed.
TripletEvaluator(
    anchors=train_dataset['anchor'],
    positives=train_dataset['positive'],
    negatives=train_dataset['negative'],
    show_progress_bar=True,
)

<sentence_transformers.evaluation.TripletEvaluator.TripletEvaluator at 0x76d1e58033e0>