In [28]:
import sys
import os
import random
from itertools import product, chain
from collections import defaultdict

import tqdm
from datasets import Dataset

sys.path.insert(0, '/home/ogalolu/thesis/pre-training-multilingual-document-encoders/clef')
from evaluate import  _get_rerank_dir
from clef_dataloaders.clef_dataloader import load_clef_rerank, load_relevance_assessments, load_clef

# Config

In [64]:
# Arguments passed to load_clef(qlang, dlang, year) when the collection is loaded.
qlang = "en"
dlang = "en"
year = "2002"

# Root directory under which the built dataset is saved. It can be overridden
# with the CLEF_DATASET_ROOT environment variable so the notebook is not tied
# to one machine; the default is the original location.
root_dir = os.environ.get("CLEF_DATASET_ROOT", "/work/ogalolu/datasets")

# Number of negative documents attached to every (query, document) pair;
# None builds plain (query, document) pairs without negatives.
num_negative = 32

# Negative sampling uses the `random` module, so seed it once here to make a
# Restart & Run All reproduce the same negatives.
seed = 42
random.seed(seed)

In [65]:
# Load the collection for the configured languages and year, then report its size.
(
    doc_ids,
    documents,
    query_ids,
    queries,
    relass,
) = load_clef(qlang, dlang, year)
print("The number total documents:", len(documents))

The number total documents: 169477


# Initial Checks

In [66]:
# s1: query ids that have relevance assessments; s2: all loaded query ids.
# Their difference is the set of queries without any assessment.
s1, s2 = {*relass.keys()}, {*query_ids}
print("There are no relevance assesment for the following queries:", s2.difference(s1))

There are no relevance assesment for the following queries: {96, 132, 101, 110, 117, 118, 93, 127}


In [67]:
# Flatten the per-query collections of relevant document ids into one list;
# its length is the number of positive (query, document) pairs.
ids = [doc_id for relevant_ids in relass.values() for doc_id in relevant_ids]
print("Number of available query-document pairs:", len(ids))

Number of available query-document pairs: 821


# Filter Documents

In [68]:
# Very short documents (length below 100) are collected by index so that they
# can be left out when negative examples are sampled.
exclude_set = {
    doc_idx
    for doc_idx, doc_text in enumerate(documents)
    if len(doc_text) < 100
}
print("Number of documents that are excluded from dataset building:", len(exclude_set))

Number of documents that are excluded from dataset building: 2765


# Obtain Training Instances

In [69]:
def find_negative(d_ind_list: list, num_negative: int) -> list:
    """Draw random negative documents for one query.

    Args:
        d_ind_list: Indices into the module-level ``documents`` list of the
            documents that are relevant for the query; these are never drawn.
        num_negative: Number of negative documents to draw.

    Returns:
        A list of ``num_negative`` entries of ``documents``, taken from
        pairwise distinct indices, none of which is in ``d_ind_list`` or in
        the module-level ``exclude_set``.

    Raises:
        ValueError: If fewer than ``num_negative`` candidate documents remain
            after removing the relevant and the excluded ones.
    """
    # Relevant documents must never show up as negatives; excluded (too short)
    # documents are skipped as well.
    blocked = set(d_ind_list)
    candidates = [
        idx
        for idx in range(len(documents))
        if idx not in blocked and idx not in exclude_set
    ]
    if num_negative > len(candidates):
        raise ValueError(
            f"Cannot draw {num_negative} distinct negatives from "
            f"{len(candidates)} candidate documents"
        )

    # random.sample draws without replacement, so the same document index
    # cannot occur twice among the negatives of one instance
    # (random.choices could repeat an index).
    neg_ind_list = random.sample(candidates, k=num_negative)
    assert blocked.isdisjoint(neg_ind_list)

    return [documents[idx] for idx in neg_ind_list]

In [None]:
# Map every id to the position of its first occurrence once, instead of
# scanning query_ids / doc_ids with list.index() for every query and every
# relevant document. setdefault keeps the first position, matching list.index().
query_pos = {}
for pos, q_id in enumerate(query_ids):
    query_pos.setdefault(q_id, pos)
doc_pos = {}
for pos, d_id in enumerate(doc_ids):
    doc_pos.setdefault(d_id, pos)

instance_list = list()
for q_ind, d_ind in tqdm.tqdm(relass.items()):
    query_text = queries[query_pos[q_ind]]
    # Positions of the relevant documents; used both to fetch their text and
    # to keep them out of the sampled negatives.
    d_ind_list = [doc_pos[idx] for idx in d_ind]
    positive_pairs = [(query_text, documents[doc_idx]) for doc_idx in d_ind_list]  # list of tuples

    # If negative examples are to be added, every (query, document) pair gets
    # its own freshly sampled set of negatives, even when the query is the
    # same; the number of negatives is set in the config cell.
    if num_negative is not None:
        for pair in positive_pairs:
            negatives_for_pair = find_negative(d_ind_list, num_negative)
            instance_list.append((*pair, *negatives_for_pair))
    else:
        instance_list.extend(positive_pairs)

# One training instance per available (query, relevant document) pair.
assert len(instance_list) == len(ids)

 38%|█████████████████████████████████████████████████████████████████▉                                                                                                           | 16/42 [00:05<00:09,  2.82it/s]

In [None]:
# Turn the list of instance tuples into a column-oriented dictionary.
# Naming convention: every column is called "article_<n>"; article_1 holds the
# query, article_2 the relevant document, and article_3 onwards the negatives.
instance_dic = defaultdict(list)
for query_text, positive_doc, *negative_docs in instance_list:
    instance_dic["article_1"].append(query_text)
    instance_dic["article_2"].append(positive_doc)

    # Negative columns are only filled when negatives were requested.
    if num_negative is None:
        continue
    for idx, negative_doc in enumerate(negative_docs, start=3):
        instance_dic[f"article_{idx}"].append(negative_doc)

# Create Dataset

In [None]:
dataset = Dataset.from_dict(instance_dic)

# Spot-check one column of the first instance. "article_18" (the 16th
# negative) only exists when at least 16 negatives are used, so fall back to
# the last available column instead of failing with a KeyError for smaller
# settings or when negatives are disabled.
sample_column = "article_18" if "article_18" in dataset.column_names else dataset.column_names[-1]
print(dataset[0][sample_column])

In [None]:
# Hold out 10% of the instances as a test split. A fixed seed makes the split
# reproducible across runs.
# NOTE: `dataset` is rebound from the unsplit dataset to the split result, so
# this cell is not meant to be re-run without re-running the cell that creates
# the dataset.
dataset = dataset.train_test_split(test_size=0.1, seed=42)
print(dataset)

In [None]:
# Write the dataset to <root_dir>/clef_<year>_<num_negative>.
path = os.path.join(
    root_dir,
    f"clef_{year}_{num_negative}",
)
dataset.save_to_disk(path)