In [27]:
import sys
import os
import random
from itertools import product, chain
from collections import defaultdict

import tqdm
import pickle
from datasets import Dataset

sys.path.insert(0, '/home/ogalolu/thesis/pre-training-multilingual-document-encoders/clef')
from evaluate import  _get_rerank_dir
from clef_dataloaders.clef_dataloader import load_clef_rerank, load_relevance_assessments, load_clef

# Config

In [57]:
# Query language, document language and CLEF campaign year passed to `load_clef`.
qlang = "en"
dlang = "en"
# Directory under which the finished dataset is written with `save_to_disk`.
root_dir = "/work/ogalolu/datasets"
year = "2002"
# NOTE: despite the name, this is the path of a pickle *file*; it is unpickled
# into a dict whose per-query values are truncated to the first `topk` entries.
rerank_dir = "/home/ogalolu/thesis/EncoderCLIR/src/saved_dictionary.pkl"

# Number of negatives attached to every (query, relevant document) pair.
# Easy negatives are drawn from outside the per-query top-k list,
# hard negatives from inside it.
num_easy_negative = 2
num_hard_negative = 2
# How many of the leading entries of each per-query ranking are kept.
topk = 100  #10?

# Seed the stdlib RNG so that the negative sampling is reproducible across runs.
seed = 42
random.seed(seed)

In [43]:
# NOTE(review): `loaded_dict` is only created by the cell that follows this one,
# so this inspection cell fails under Restart Kernel -> Run All. Run it after the
# load cell or drop it. Presumably it checks how many entries the pickled dict
# holds for query 96 -- confirm.
len(loaded_dict[96])

166754

In [39]:
# Read the pickled per-query dict from disk.
# NOTE: unpickling executes arbitrary code, so only load files you created yourself.
with open(rerank_dir, 'rb') as pickle_file:
    loaded_dict = pickle.load(pickle_file)

# Keep only the first `topk` entries of every query's list.
qid2topk = {query_id: ranking[:topk] for query_id, ranking in loaded_dict.items()}

# Corpus, queries and relevance assessments for the configured language pair and year.
doc_ids, documents, query_ids, queries, relass = load_clef(qlang, dlang, year)
print("The number total documents:", len(documents))

The number total documents: 169477


# Initial Checks

In [41]:
# Report the query ids that never occur as a key of the relevance assessments.
judged_queries = set(relass.keys())
all_queries = set(query_ids)
print("There are no relevance assesment for the following queries:", all_queries.difference(judged_queries))

There are no relevance assesment for the following queries: {96, 132, 101, 110, 117, 118, 93, 127}


In [42]:
# Flatten the per-query lists of relevant document ids into one list;
# its length is the number of positive (query, document) pairs.
ids = [doc_id for judged_docs in relass.values() for doc_id in judged_docs]
print("Number of available query-document pairs:", len(ids))

Number of available query-document pairs: 821


# Filter Documents

In [44]:
# Some documents are very short
exclude_set = set()
for i, doc in enumerate(documents):
    if len(doc) < 100:
        exclude_set.add(i)
print("Number of documents that are excluded from dataset building:", len(exclude_set))

Number of documents that are excluded from dataset building: 2765


# Obtain Training Instances

In [69]:
def _draw_indices(pool: list, k: int, label: str) -> list:
    """Pick `k` indices from `pool`, without repeats whenever the pool is large enough."""
    if k <= 0:
        return []
    if not pool:
        raise ValueError(f"No candidate documents available to sample {label} negatives from.")
    picked = random.sample(pool, min(k, len(pool)))
    if len(picked) < k:
        # Pool too small for k distinct picks: top up with replacement so that
        # every training instance still gets the same number of negatives.
        picked += random.choices(pool, k=k - len(picked))
    return picked


def find_negative(d_ind_list: list, num_hard_negative: int, num_easy_negative: int, topk_indices: list,
                  corpus=None, excluded=None) -> list:
    """Sample hard and easy negative documents for one query.

    Hard negatives are drawn from the top-k positions that are not relevant.
    Easy negatives are drawn from every other position that is neither
    relevant, nor in the top-k, nor in the exclusion set. Within each group the
    picks are distinct as long as the candidate pool is large enough.

    Args:
        d_ind_list: positions (into the corpus) of the relevant documents.
        num_hard_negative: number of hard negatives to return.
        num_easy_negative: number of easy negatives to return.
        topk_indices: positions (into the corpus) of the pre-ranker's top-k documents.
        corpus: sequence of document texts; defaults to the notebook-level `documents`.
        excluded: positions never used as easy negatives; defaults to the
            notebook-level `exclude_set`.

    Returns:
        A list of document texts: the hard negatives first, then the easy negatives.

    Raises:
        ValueError: if negatives are requested from an empty candidate pool.
    """
    if corpus is None:
        corpus = documents
    if excluded is None:
        excluded = exclude_set

    relevant = set(d_ind_list)
    top_ranked = set(topk_indices)

    # Relevant documents must never appear among the hard negatives.
    # Sorting gives the sampler a sequence with a stable order.
    hard_pool = sorted(top_ranked - relevant)

    # Relevant, top-k and excluded documents must never appear among the easy negatives.
    blocked = relevant | top_ranked | set(excluded)
    easy_pool = [idx for idx in range(len(corpus)) if idx not in blocked]

    hard_neg_ind_list = _draw_indices(hard_pool, num_hard_negative, "hard")
    easy_neg_ind_list = _draw_indices(easy_pool, num_easy_negative, "easy")

    return [corpus[idx] for idx in hard_neg_ind_list + easy_neg_ind_list]

In [70]:
# Position lookups built once up front. `list.index` is a linear scan, and it was
# being repeated for every relevant and every top-k document of every query.
# `setdefault` keeps the FIRST position of an id, which is what `list.index` returns.
# Assumes the ids are hashable -- TODO confirm for `doc_ids`.
doc_pos = {}
for pos, doc_id in enumerate(doc_ids):
    doc_pos.setdefault(doc_id, pos)
query_pos = {}
for pos, query_id in enumerate(query_ids):
    query_pos.setdefault(query_id, pos)

instance_list = list()
for q_ind, d_ind in tqdm.tqdm(relass.items()):
    query_text = queries[query_pos[q_ind]]
    # Relevant documents (according to the human assessors - ground truth).
    d_ind_list = [doc_pos[idx] for idx in d_ind]
    positive_pairs = [(query_text, documents[pos]) for pos in d_ind_list]  # list of tuples

    # If negatives are requested, every positive pair gets its own freshly sampled
    # set of hard and easy negatives; their numbers are set in the config cell.
    if num_hard_negative is not None:
        # Top-k documents (according to the pre-ranker).
        topk_indices = [doc_pos[idx] for idx in qid2topk[q_ind]]

        for pair in positive_pairs:
            temp_negatives = find_negative(d_ind_list, num_hard_negative, num_easy_negative, topk_indices)
            instance_list.append((*pair, *temp_negatives))
    else:
        instance_list.extend(positive_pairs)

# Exactly one training instance per judged (query, relevant document) pair.
assert len(instance_list) == len(ids)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 42/42 [00:15<00:00,  2.77it/s]


In [71]:
# Turn the row tuples into columns. Column names follow the "article_N"
# convention: article_1 holds the query, article_2 the relevant document,
# and article_3 onwards the negatives (when negatives are enabled).
instance_dic = defaultdict(list)
with_negatives = num_hard_negative is not None
for row in instance_list:
    query_text, positive_text, *extra_texts = row
    instance_dic["article_1"].append(query_text)
    instance_dic["article_2"].append(positive_text)

    if not with_negatives:
        continue
    for offset, negative_text in enumerate(extra_texts):
        instance_dic[f"article_{offset + 3}"].append(negative_text)

# Create Dataset

In [72]:
# Build the dataset from the column dict and preview one column of the first row.
# "article_4" only exists when at least two negatives per instance were generated.
dataset = Dataset.from_dict(instance_dic)
first_example = dataset[0]
print(first_example["article_4"])

 THE MELDING AMERICAS; POLICY; U.S. TRYING TO REFOCUS ITS RELATIONS WITH  NEIGHBORS IN POST-COLD WAR ERA; WHILE TRADE AND POLITICAL CONDITIONS ARE  FAVORABLE, CRISES SUCH AS CUBA AND HAITI GET IN THE WAY OF PROGRESS.     When the Senate Foreign Relations Committee was considering the confirmation  of Alexander F. Watson as the State Department's key official on Latin America  last year, all of the questions posed by lawmakers concerned just three  countries -- Cuba, Haiti and Nicaragua -- near flyspecks when laid against the  expanse of the Western Hemisphere.     Watson, assistant secretary of state for Latin American affairs, later  recalled that he joked to South American journalists that the senators had  agreed not to ask about any country with a population of more than 15 million.     With the end of the Cold War, which had skewed U.S. relations with Latin  America for most of the last half-century, the United States is trying to  refocus its approach to its closest neighbors on 

In [73]:
# Hold out 10% of the instances as a test split. The seed is fixed so that the
# same rows end up in train/test on every run.
# NOTE: `dataset` is rebound from the single Dataset to the returned split
# container, so this cell cannot be re-run without first re-creating `dataset`.
split_seed = 42
dataset = dataset.train_test_split(test_size=0.1, seed=split_seed)
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['article_1', 'article_2', 'article_3', 'article_4', 'article_5', 'article_6'],
        num_rows: 738
    })
    test: Dataset({
        features: ['article_1', 'article_2', 'article_3', 'article_4', 'article_5', 'article_6'],
        num_rows: 83
    })
})


In [74]:
# The output folder name encodes the year, top-k and negative counts used for this build.
dataset_name = f"clef_{year}_{topk}_{num_hard_negative}_{num_easy_negative}"
path = os.path.join(root_dir, dataset_name)
dataset.save_to_disk(path)