In [None]:
from datasets import load_dataset
import os

In [None]:
# Load every `.jsonl` file found in the msmarco_squadv2_splitted folder as a single `train` split.
msmarco_squad_dir = "data/train_generic/msmarco_squadv2_splitted"
msmarco_squad_shards = [
    shard for shard in os.listdir(msmarco_squad_dir) if shard.endswith(".jsonl")
]
data = load_dataset(
    "json",
    data_dir=msmarco_squad_dir,
    data_files=msmarco_squad_shards,
    split="train",
    num_proc=32,
    cache_dir="./.cache",
)

In [None]:
data[0]['pos']

In [None]:
# Load every `.jsonl` file found in the structured_data_doc_splitted folder as a single `train` split.
structured_doc_dir = "data/train_generic/structured_data_doc_splitted"
structured_doc_shards = [
    shard for shard in os.listdir(structured_doc_dir) if shard.endswith(".jsonl")
]
data = load_dataset(
    "json",
    data_dir=structured_doc_dir,
    data_files=structured_doc_shards,
    split="train",
    num_proc=32,
    cache_dir="./.cache",
)

In [None]:
data

In [None]:
# Load every `.jsonl` file found in the newssapo_splitted folder as a single `train` split.
newssapo_dir = "data/train_generic/newssapo_splitted"
newssapo_shards = [
    shard for shard in os.listdir(newssapo_dir) if shard.endswith(".jsonl")
]
data = load_dataset(
    "json",
    data_dir=newssapo_dir,
    data_files=newssapo_shards,
    split="train",
    num_proc=32,
    cache_dir="./.cache",
)

In [None]:
data[0]

In [None]:
# Rewrite the .jsonl files of each folder in place, keeping only the `query` and `pos` columns.
data_folders = ["data/train_generic/msmarco_squadv2_splitted"]

for data_folder in data_folders:
    # NOTE(review): `[2:]` drops the first two directory entries (in sorted order, before the
    # .jsonl filter is applied) — presumably they were already processed; confirm before re-running.
    for file in sorted(os.listdir(data_folder))[2:]:
        if file.endswith(".jsonl"):
            data_file = os.path.join(data_folder, file)
            print(data_file)
            dataset = load_dataset("json", data_files=[data_file],
                                   split="train", num_proc=32, cache_dir="./.cache").select_columns(["query", "pos"])
            # The pruned dataset is written back to the same path it was loaded from.
            dataset.to_json(data_file, orient="records", lines=True, force_ascii=False)

In [None]:
import datasets 
from datasets import load_dataset

train_group_size = 1

In [None]:
# Schemas used to parse the training JSON files.
#
# `train_group_size` counts the positive passage plus the sampled negatives
# (`create_batch_data` draws `train_group_size - 1` negatives), so the `neg`
# column is only unnecessary when the group size is 1. The condition previously
# tested `== 2`, which disagreed with `LawsTrainDatasetCrossLingual.__init__`.
if train_group_size == 1:
    context_feat = datasets.Features({
        'query': datasets.Value('string'),
        'pos': datasets.Sequence(datasets.Value('string'))
    })
else:
    context_feat = datasets.Features({
        'query': datasets.Value('string'),
        'pos': datasets.Sequence(datasets.Value('string')),
        'neg': datasets.Sequence(datasets.Value('string'))
    })

# Schema for knowledge-distillation files, which also carry teacher scores.
context_feat_kd = datasets.Features({
    'query': datasets.Value('string'),
    'pos': datasets.Sequence(datasets.Value('string')),
    'neg': datasets.Sequence(datasets.Value('string')),
    'pos_scores': datasets.Sequence(datasets.Value('float')),
    'neg_scores': datasets.Sequence(datasets.Value('float')),
})

In [None]:
# Check that every .json/.jsonl file of each training folder can be parsed with
# the plain schema, falling back to the knowledge-distillation schema.
train_data = ["data/train_generic/newssapo_splitted"]

for data_dir in train_data:
    if not os.path.isdir(data_dir):
        # The path may be a regular file or may not exist at all.
        raise FileNotFoundError(f"{data_dir} is not a directory")

    # Add `parallel_` in `data_dir` to indicate that this dataset is parallel corpus
    flag = 'parallel_' in data_dir
    for file in os.listdir(data_dir):
        if not file.endswith(('.json', '.jsonl')):
            continue

        file_path = os.path.join(data_dir, file)
        try:
            temp_dataset = datasets.load_dataset('json', data_files=file_path, split='train', features=context_feat)
        except Exception:
            # `Exception` rather than a bare `except` so that KeyboardInterrupt /
            # SystemExit still stop the cell instead of triggering a second load.
            temp_dataset = datasets.load_dataset('json', data_files=file_path, split='train', features=context_feat_kd)

In [None]:
import numpy as np
from datasets import Dataset
import json
from tqdm import tqdm
from multiprocessing import Pool
from rank_bm25 import BM25Plus
from BGE_M3.process_data.utils import bm25_tokenizer

def convert_score_to_rank(scores):
    """Turn an array of scores into 1-based ranks (highest score gets rank 1).

    The result has the same layout as `scores`: position `i` holds the rank of
    `scores[i]`.
    """
    descending_order = np.argsort(-scores)
    # Arg-sorting a permutation yields its inverse, i.e. the position of each
    # original element in the descending ordering; +1 makes the ranks 1-based.
    return np.argsort(descending_order) + 1

# Dev queries: a JSON object whose keys become the `id` column and whose values
# become the `text` column of a `datasets.Dataset`.
with open("data/eval/data/dev_queries.json", encoding='utf-8') as dev_queries_file:
    dev_queries = json.load(dev_queries_file)
formatted_dev_queries = {"id": list(dev_queries.keys()), "text": list(dev_queries.values())}
dev_queries_dataset = Dataset.from_dict(formatted_dev_queries)

# Build a BM25+ index over the corpus values. `with` closes each file as soon as
# it is parsed instead of leaving the handle to the garbage collector.
with open("data/eval/data/filtered_corpus.json", encoding='utf-8') as corpus_file:
    corpus = json.load(corpus_file)
bm25 = BM25Plus(corpus=list(corpus.values()),
                tokenizer=bm25_tokenizer,
                k1=0.4,
                b=0.6)

In [None]:
def calculate_ranks(sample):
    """Compute BM25 ranks for one dataset row.

    Reads the row's 'text' field, scores it with the module-level `bm25`
    index and returns the 1-based ranks under the key "ranks" so the function
    can be passed to `Dataset.map`.
    """
    query_tokens = bm25_tokenizer(sample['text'])
    bm25_scores = bm25.get_scores(query_tokens)
    return {"ranks": convert_score_to_rank(bm25_scores)}

dev_queries_ranks = dev_queries_dataset.map(calculate_ranks, num_proc=32)

In [None]:
import pickle
from BGE_M3.process_data.utils import bm25_tokenizer
from rank_bm25 import BM25Plus
from joblib import Parallel, delayed
from multiprocessing import Pool
from tqdm import tqdm
import numpy as np
import os
import json

print("Loading BM25 model ...")
# NOTE: `pickle.load` can execute arbitrary code contained in the file — only
# load BM25 dumps produced by a trusted run of this project.
with open("saved_models/bm25result_1.25_0.9", 'rb') as bm25result_file:
    bm25 = pickle.load(bm25result_file)

print("Loading dev queries and corpus ...")
# `with` closes each JSON file right after parsing (the handles used to leak).
with open("data/eval/law/filtered_corpus.json", encoding='utf-8') as corpus_file:
    corpus = json.load(corpus_file)
with open("data/eval/law/dev_queries.json", encoding='utf-8') as dev_queries_file:
    dev_queries = json.load(dev_queries_file)

In [None]:
def convert_score_to_rank(scores):
    """Map each score to its 1-based rank, where the largest score ranks first."""
    # Indices that would sort the scores from highest to lowest.
    best_first = np.argsort(-scores)
    # The inverse permutation gives, for every element, its place in that order.
    places = np.argsort(best_first)
    return places + 1

def calculate_ranks(sample):
    """Return the 1-based BM25 rank of every indexed document for the query text `sample`.

    Uses the module-level `bm25` index and `bm25_tokenizer`.
    """
    query_tokens = bm25_tokenizer(sample)
    bm25_scores = bm25.get_scores(query_tokens)
    return convert_score_to_rank(bm25_scores)
print("Calculate bm25 ranks ...")

# Imported here because the import cell above does not bring in
# `concurrent.futures`; without it this cell raises NameError on a fresh kernel.
from concurrent.futures import ProcessPoolExecutor

# One row of ranks per dev query, in the iteration order of `dev_queries`.
with ProcessPoolExecutor(max_workers=32) as executor:
    sparse_ranks = np.array(list(executor.map(calculate_ranks, dev_queries.values())))

In [None]:
import torch
import numpy as np
from sentence_transformers.util import semantic_search

# Random stand-in embeddings used only to exercise `semantic_search`
# (9992 queries and 240000 passages, 1024 dimensions each).
queries_embeddings = torch.rand(9992, 1024)
corpus_embeddings = torch.rand(240000, 1024)

# queries_embeddings = np.random.default_rng().standard_normal(size=(9992, 1024), dtype='float32')
# corpus_embeddings = np.random.default_rng().standard_normal(size=(240000, 1024), dtype='float32')

# Keep the 100 best-scoring passages for every query.
results_dense_search = semantic_search(queries_embeddings, corpus_embeddings,
                                       top_k=100)

In [None]:
from BGE_M3.src.utils import BGEM3FlagModel

# Load the project's BGE-M3 wrapper on CPU in full precision.
# NOTE(review): `model_name_or_path` is an absolute path into one user's Hugging Face
# cache, so this cell only runs on that machine — consider a configurable path.
model = BGEM3FlagModel(
    model_name_or_path="/home/admin_mcn/.cache/huggingface/hub/models--BAAI--bge-m3/snapshots/babcf60cae0a1f438d7ade582983d4ba462303c2",
    pooling_method="cls",
    use_fp16=False,
    device="cpu"
)

# Five sample queries and six sample passages used as a smoke test for `encode`.
sentences_1 = ["What is panda?", "What is BGE M3?", "Defination of BM25", "What is a codebase (code base)?", "What is a large language model (LLM)?"]
sentences_2 = ["BGE M3 is an embedding model supporting dense retrieval, lexical matching and multi-vector interaction.", 
               "BM25 is a bag-of-words retrieval function that ranks a set of documents based on the query terms appearing in each document",
               "BM25 is a ranking function used by search engines to estimate the relevance of documents to a given search query.",
               "The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China.",
               "A codebase is the complete body of source code for a given software program, component or application system.",
               "Large language models (LLMs) are machine learning models that can comprehend and generate human language text."]

query_embeddings = model.encode(sentences_1)
passage_embeddings = model.encode(sentences_2)

In [None]:
import numpy as np
import torch

scores1 = torch.rand(9992, 224000).cpu().numpy()
scores2 = torch.rand(9992, 224000).cpu().numpy()

In [None]:
from BGE_M3.eval.utils import ranks_from_scores_parallel

ranks1 = ranks_from_scores_parallel(scores1, num_workers=32)
ranks2 = ranks_from_scores_parallel(scores2, num_workers=32)

In [None]:
import numpy as np
from BGE_M3.src.utils import rrf_from_scores
from joblib import Parallel, delayed

def parallel_calculate_rrf(scores1, scores2, k=60, num_workers=4, chunk_size=100):
    """Compute RRF scores for two score matrices by processing row chunks with joblib.

    Args:
        scores1: 2-D array of shape (Nq, Np); only its `.shape` and row slicing are used here.
        scores2: second score matrix, sliced with the same row ranges as `scores1`.
        k: constant forwarded unchanged to `rrf_from_scores`.
        num_workers: number of joblib workers (`n_jobs`).
        chunk_size: number of query rows handed to each `rrf_from_scores` call
            (defaults to 100, the value that used to be hard-coded).

    Returns:
        A float64 array of shape (Nq, Np) holding the per-chunk results stacked
        in row order.
        # assumes `rrf_from_scores` returns one (rows_in_chunk, Np) array per chunk — TODO confirm
    """
    Nq, Np = scores1.shape

    # Split the queries into chunks
    print("splitting chunks")
    chunks = [(scores1[i:i + chunk_size], scores2[i:i + chunk_size])
              for i in range(0, Nq, chunk_size)]

    # Process each chunk in parallel with joblib
    print("processing chunks")
    with Parallel(n_jobs=num_workers, verbose=10) as parallel:
        results = parallel(delayed(rrf_from_scores)(scores1_chunk, scores2_chunk, k)
                           for scores1_chunk, scores2_chunk in chunks)

    # Copy each chunk's result into its row range of the output matrix
    print("concatenating results")
    rrf_scores = np.zeros((Nq, Np))
    for i, chunk in enumerate(results):
        rrf_scores[i*chunk_size:(i+1)*chunk_size] = chunk

    return rrf_scores

# Example usage
# The score matrices come from the earlier cell that builds `scores1` / `scores2`.
scores1_test = scores1
scores2_test = scores2

# k=10 overrides the function's default of 60.
k = 10
num_workers = 8  # Adjust the number of workers according to your system's capabilities
rrf_scores = parallel_calculate_rrf(scores1_test, scores2_test, k, num_workers)

In [None]:
from BGE_M3.src.utils import rrf_from_scores_chunk, rrf_from_scores_parallel

In [None]:
scores_chunk = rrf_from_scores_chunk(
    scores1, scores2, k=10, bm25_weight=0.1, chunk_size=100
)

In [None]:
scores = rrf_from_scores_parallel(scores1, scores2, 10, 0.1,
                                  tool="joblib", num_workers=16)

In [None]:
from huggingface_hub import HfApi
# NOTE(review): every call below modifies remote Hugging Face Hub repositories
# (one deletion, two uploads); re-running this cell repeats those operations.
api = HfApi()
# Remove a single file from the `law_translated` dataset repository.
api.delete_file(
    path_in_repo="law_translated-10-of-112.json",
    repo_id="nntoan209/law_translated",
    repo_type="dataset"
)

# Upload a training checkpoint folder to a model repository, skipping paths
# that match `global_step25638*`.
api.upload_folder(
    repo_id="nntoan209/bgem3-vi-e3",
    repo_type="model",
    folder_path="saved_models/vi/bgem3_vi_20240618/checkpoint-25638",
    ignore_patterns="global_step25638*"
)

# Upload the whole cross-lingual data folder to a dataset repository.
api.upload_folder(
    repo_id="nntoan209/TrainData-CrossLingual",
    repo_type="dataset",
    folder_path="data/cross_lingual"
)

In [None]:
from torch.utils.data import Dataset
import datasets
import math
from pprint import pprint
import random
from dataclasses import dataclass
from transformers import DataCollatorWithPadding
from easydict import EasyDict

class LawsTrainDatasetCrossLingual(Dataset):
    """Dataset to yield a batch of data at one time. All samples in the same batch comes from the same task.

    Every ``.json``/``.jsonl`` file found in the ``args.train_data`` directories is
    loaded as one group (files shorter than ``args.small_threshold`` are merged
    per directory). ``refresh_epoch`` pre-computes batches as slices of row indices
    taken from a single group, and ``__getitem__`` hands them out sequentially,
    ignoring the index it is called with.
    """
    def __init__(self, args: EasyDict, batch_size: int, seed: int, process_index: int=0, num_processes: int=1):
        """Load all training files and build the batch plan for the first epoch.

        Args:
            args: configuration object. This class reads ``small_threshold``,
                ``drop_threshold``, ``train_group_size``, ``train_data``,
                ``cache_path``, ``knowledge_distillation``,
                ``max_example_num_per_dataset``, ``shuffle_ratio``,
                ``query_instruction_for_retrieval`` and
                ``passage_instruction_for_retrieval``.
            batch_size: per-process batch size used when ``get_file_batch_size``
                has no file-specific override.
            seed: seed of the NumPy generator that shuffles groups, rows and batches.
            process_index: index of this process; selects its slice of every batch.
            num_processes: number of processes sharing each pre-computed batch.

        Raises:
            FileNotFoundError: if an entry of ``args.train_data`` is not a directory.
        """
        train_datasets = []
        each_data_inxs = []
        batch_size_inxs = []
        pqloss_flag = []
        cur_all_num = 0

        SMALL_THRESHOLD = args.small_threshold
        DROP_THRESHOLD = args.drop_threshold

        # With a group size of 1 no negatives are sampled, so `neg` is not required.
        if args.train_group_size == 1:
            context_feat = datasets.Features({
                'query': datasets.Value('string'),
                'pos': datasets.Sequence(datasets.Value('string'))
            })
        else:
            context_feat = datasets.Features({
                'query': datasets.Value('string'),
                'pos': datasets.Sequence(datasets.Value('string')),
                'neg': datasets.Sequence(datasets.Value('string'))
            })
        context_feat_kd = datasets.Features({
            'query': datasets.Value('string'),
            'pos': datasets.Sequence(datasets.Value('string')),
            'neg': datasets.Sequence(datasets.Value('string')),
            'pos_scores': datasets.Sequence(datasets.Value('float')),
            'neg_scores': datasets.Sequence(datasets.Value('float')),
        })
        assert isinstance(args.train_data, list) and len(args.train_data) >= 1

        self.print_batch_size(batch_size=batch_size, train_group_size=args.train_group_size)

        for data_dir in args.train_data:
            if not os.path.isdir(data_dir):
                # The path may be a regular file or may not exist at all.
                raise FileNotFoundError(f"{data_dir} is not a directory")

            small_datasets = []
            small_batch_size = math.inf

            # Add `parallel_` in `data_dir` to indicate that this dataset is parallel corpus
            flag = 'parallel_' in data_dir
            for file in os.listdir(data_dir):
                if not file.endswith(('.json', '.jsonl')):
                    continue

                file_path = os.path.join(data_dir, file)
                print(f'loading data from {file_path} ...')
                try:
                    temp_dataset = datasets.load_dataset('json', data_files=file_path, split='train', cache_dir=args.cache_path, features=context_feat)
                except Exception:
                    # The file did not fit the plain schema: retry with the
                    # knowledge-distillation schema. `Exception` (not a bare
                    # `except`) lets KeyboardInterrupt / SystemExit propagate.
                    temp_dataset = datasets.load_dataset('json', data_files=file_path, split='train', cache_dir=args.cache_path, features=context_feat_kd)
                    if not args.knowledge_distillation:
                        temp_dataset = temp_dataset.remove_columns(['pos_scores', 'neg_scores'])

                if len(temp_dataset) == 0:
                    continue
                elif len(temp_dataset) < SMALL_THRESHOLD:
                    # Small files are pooled and use the smallest of their batch sizes.
                    small_datasets.append(temp_dataset)
                    small_batch_size = min(small_batch_size, self.get_file_batch_size(file, batch_size, train_group_size=args.train_group_size))
                else:
                    if args.max_example_num_per_dataset is not None and len(temp_dataset) > args.max_example_num_per_dataset:
                        # Sampling from the range directly avoids building a full index list.
                        temp_dataset = temp_dataset.select(
                            random.sample(range(len(temp_dataset)), args.max_example_num_per_dataset))
                    train_datasets.append(temp_dataset)
                    # Row indices are offsets into the final concatenated dataset.
                    each_data_inxs.append(np.arange(len(temp_dataset)) + cur_all_num)
                    cur_all_num += len(temp_dataset)
                    batch_size_inxs.append(self.get_file_batch_size(file, batch_size, train_group_size=args.train_group_size))
                    pqloss_flag.append(flag)

            if len(small_datasets) > 0:
                small_dataset = datasets.concatenate_datasets(small_datasets)
                # The pooled small files are dropped entirely when still below DROP_THRESHOLD.
                if len(small_dataset) >= DROP_THRESHOLD:
                    train_datasets.append(small_dataset)
                    each_data_inxs.append(np.arange(len(small_dataset)) + cur_all_num)
                    cur_all_num += len(small_dataset)
                    batch_size_inxs.append(small_batch_size)
                    pqloss_flag.append(flag)

        self.dataset = datasets.concatenate_datasets(train_datasets)
        self.each_data_inxs = each_data_inxs
        self.datasets_inxs = np.arange(len(each_data_inxs))
        self.batch_size_inxs = batch_size_inxs
        self.pqloss_flag = pqloss_flag

        self.process_index = process_index
        self.num_processes = num_processes
        self.args = args
        self.shuffle_ratio = args.shuffle_ratio

        self.deterministic_generator = np.random.default_rng(seed)
        self.step = 0
        self.refresh_epoch()

    def print_batch_size(self, batch_size: int, train_group_size: int):
        """Print the batch size `get_file_batch_size` yields for each length bucket."""
        length_list = ['0-512', '512-1024', '1024-2048', '2048-inf']
        batch_size_dict = {
            k: self.get_file_batch_size(f"len-{k}.jsonl", batch_size, train_group_size) for k in length_list
        }
        batch_size_list = [
            f'{length}: {batch_size_dict[length]}' for length in length_list
        ]
        print("=========================")
        print("Batch Size Dict:")
        pprint(batch_size_list)
        print("=========================")

    @staticmethod
    def get_file_batch_size(file: str, batch_size: int, train_group_size: int):
        """Return the batch size to use for `file`.

        For group sizes 3 and 2 a file whose name contains one of the
        `len-<bucket>.jsonl` markers gets a hard-coded batch size; every other
        file, and every other group size, falls back to `batch_size`.
        """
        if train_group_size == 3:
            # 24GB
            if 'len-0-512.jsonl' in file:
                return 60
            elif 'len-512-1024.jsonl' in file:
                return 44
            elif 'len-1024-2048.jsonl' in file:
                return 24
            elif 'len-2048-inf.jsonl' in file:
                return 24
            else:
                return batch_size
        elif train_group_size == 2:
            # 24GB
            if 'len-0-512.jsonl' in file:
                return 88
            elif 'len-512-1024.jsonl' in file:
                return 58
            elif 'len-1024-2048.jsonl' in file:
                return 24
            elif 'len-2048-inf.jsonl' in file:
                return 16
            else:
                return batch_size
        else:
            return batch_size

    def refresh_epoch(self):
        """Reshuffle groups and rows, rebuild `self.batch_datas` and reset `self.step`."""
        print(f'---------------------------*Rank {self.process_index}: refresh data---------------------------')
        self.deterministic_generator.shuffle(self.datasets_inxs)
        # Dynamically adjust batch size
        batch_datas = []
        for dataset_inx in self.datasets_inxs:
            self.deterministic_generator.shuffle(self.each_data_inxs[dataset_inx])
            # A stored batch holds the rows of all processes; each one later takes its slice.
            cur_batch_size = self.batch_size_inxs[dataset_inx]*self.num_processes
            flag = self.pqloss_flag[dataset_inx]
            for start_index in range(0, len(self.each_data_inxs[dataset_inx]), cur_batch_size):
                # judge the last batch's length
                # (a tail with fewer than two rows per process is dropped)
                if len(self.each_data_inxs[dataset_inx]) - start_index < 2 * self.num_processes:
                    break
                batch_datas.append((self.each_data_inxs[dataset_inx][start_index:start_index+cur_batch_size], flag))
        self.deterministic_generator.shuffle(batch_datas)
        self.batch_datas = batch_datas
        self.step = 0

    def __getitem__(self, _):
        """Return the next pre-computed batch for this process; the index is ignored.

        Returns:
            Tuple ``(queries, passages, teacher_scores, pqloss_flag)``.
        """
        batch_indices, pqloss_flag = self.batch_datas[self.step]
        cur_batch_size = int(len(batch_indices) / self.num_processes)
        batch_indices = batch_indices[self.process_index * cur_batch_size: (self.process_index + 1) * cur_batch_size]
        batch_data = self.dataset[batch_indices]
        self.step += 1
        queries, passages, teacher_scores = self.create_batch_data(batch_raw_data=batch_data)
        # print('rank, step, flag, query, passage:', dist.get_rank(), self.step, pqloss_flag, queries, passages)
        return queries, passages, teacher_scores, pqloss_flag

    def shuffle_text(self, text):
        """With probability `self.shuffle_ratio`, split a text longer than 100 characters
        into chunks of `len(text)//3 + 1` characters, shuffle them and join them with spaces;
        otherwise return the text unchanged."""
        if self.shuffle_ratio > 0 and len(text) > 100 and random.random() < self.shuffle_ratio:
            split_text = []
            chunk_size = len(text)//3 + 1
            for i in range(0, len(text), chunk_size):
                split_text.append(text[i:i+chunk_size])
            random.shuffle(split_text)
            return " ".join(split_text)
        else:
            return text

    def create_batch_data(self, batch_raw_data):
        """Build the query list, passage list and optional teacher scores for one batch.

        Args:
            batch_raw_data: column-oriented batch with 'query' and 'pos', plus 'neg'
                when ``train_group_size > 1`` and optionally 'pos_scores' / 'neg_scores'.

        Returns:
            ``(queries, passages, teacher_scores)``. For every query, `passages`
            receives one randomly chosen positive followed by
            ``train_group_size - 1`` negatives; `teacher_scores` is ``None`` when
            no scores were collected.
        """
        queries, passages = [], []
        teacher_scores = []
        for i in range(len(batch_raw_data['query'])):
            queries.append(batch_raw_data['query'][i])

            pos_inx = random.choice(list(range(len(batch_raw_data['pos'][i]))))
            passages.append(self.shuffle_text(batch_raw_data['pos'][i][pos_inx]))
            if 'pos_scores' in batch_raw_data and batch_raw_data['pos_scores'][i] is not None:
                teacher_scores.append(batch_raw_data['pos_scores'][i][pos_inx])

            if self.args.train_group_size > 1:
                neg_inx_set = list(range(len(batch_raw_data['neg'][i])))
                if len(batch_raw_data['neg'][i]) < self.args.train_group_size - 1:
                    # Too few negatives: repeat the index list so enough can be drawn.
                    num = math.ceil((self.args.train_group_size - 1) / len(batch_raw_data['neg'][i]))
                    neg_inxs = random.sample(neg_inx_set * num, self.args.train_group_size - 1)
                else:
                    neg_inxs = random.sample(neg_inx_set, self.args.train_group_size - 1)

                if 'neg_scores' in batch_raw_data and batch_raw_data['neg_scores'][i] is not None:
                    # Order the sampled negatives by teacher score, highest first.
                    neg_scores = [(x, batch_raw_data['neg_scores'][i][x]) for x in neg_inxs]
                    neg_scores = sorted(neg_scores, key=lambda x:x[1], reverse=True)
                    neg_inxs = [x[0] for x in neg_scores]
                    teacher_scores.extend([x[1] for x in neg_scores])

                negs = [batch_raw_data['neg'][i][x] for x in neg_inxs]
                passages.extend(negs)

            if len(teacher_scores) > 0 and len(passages) > 0:
                assert len(teacher_scores) == len(passages)

        if self.args.query_instruction_for_retrieval is not None:
            queries = [self.args.query_instruction_for_retrieval+q for q in queries]
        if self.args.passage_instruction_for_retrieval is not None:
            passages = [self.args.passage_instruction_for_retrieval+p for p in passages]

        if len(teacher_scores) == 0:
            teacher_scores = None
        return queries, passages, teacher_scores

    def __len__(self):
        """Number of pre-computed batches multiplied by the number of processes."""
        return len(self.batch_datas) * self.num_processes


@dataclass
class EmbedCollatorCrossLingual(DataCollatorWithPadding):
    """
    Wrapper that does conversion from List[Tuple[encode_qry, encode_psg]] to List[qry], List[psg]
    and pass batch separately to the actual collator.
    Abstract out data detail for the model.

    Queries and passages arrive as ids and are looked up in `merged_queries_vi` /
    `merged_corpus_vi` to obtain their text.
    """
    query_max_len: int = 32
    passage_max_len: int = 128
    # id -> text lookup tables; only the `_vi` tables are read by `__call__` at the moment.
    merged_queries_vi: dict = None
    merged_corpus_vi: dict = None
    merged_queries_en: dict = None
    merged_corpus_en: dict = None
    # True (default): return the looked-up raw texts, which is what this collator
    # has always returned because of an early debugging `return`.
    # False: return the tokenized batch, teacher scores and direction flag.
    return_text: bool = True

    def __call__(self, features):
        """Collate dataset items into one batch.

        Args:
            features: list of tuples ``(queries, passages[, teacher_scores[, flag]])``
                as produced by the training dataset.

        Returns:
            ``{"query": [...], "passage": [...]}`` with raw texts when
            `return_text` is True; otherwise a dict with the tokenized "query"
            and "passage", "teacher_scores" and "bi_directions".
        """
        query = [f[0] for f in features]
        passage = [f[1] for f in features]

        teacher_scores = None
        if len(features[0]) > 2:
            teacher_scores = [f[2] for f in features]
            if teacher_scores[0] is None:
                teacher_scores = None
            else:
                teacher_scores = torch.FloatTensor(teacher_scores)

        flag = None
        if len(features[0]) == 4:
            flag = [f[3] for f in features][0]

        # Flatten one level with a comprehension; `sum(list_of_lists, [])` is quadratic.
        if isinstance(query[0], list):
            query = [q for group in query for q in group]
        if isinstance(passage[0], list):
            passage = [p for group in passage for p in group]

        # random between vietnamese and english
        # query = [self.merged_queries_vi[q] if random.random() <= 0.5 else self.merged_queries_en[q] for q in query]
        # passage = [self.merged_corpus_vi[p] if random.random() <= 0.5 else self.merged_corpus_en[p] for p in passage]
        query = [self.merged_queries_vi[q] for q in query]
        passage = [self.merged_corpus_vi[p] for p in passage]

        if self.return_text:
            # Inspection mode: skip tokenization, whose result would be discarded.
            return {"query": query, "passage": passage}

        q_collated = self.tokenizer(
            query,
            # padding='max_length',     # used for adjusting the batch size in `get_file_batch_size()`
            padding=True,
            truncation=True,
            max_length=self.query_max_len,
            return_tensors="pt",
        )
        d_collated = self.tokenizer(
            passage,
            # padding='max_length',     # used for adjusting the batch size in `get_file_batch_size()`
            padding=True,
            truncation=True,
            max_length=self.passage_max_len,
            return_tensors="pt",
        )
        if teacher_scores is not None:
            # One row of scores per query.
            teacher_scores = teacher_scores.reshape((len(q_collated['input_ids']), -1))
        return {"query": q_collated, "passage": d_collated, "teacher_scores": teacher_scores, "bi_directions": flag}

In [None]:
from transformers import AutoTokenizer
import os
import json
import numpy as np
tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-m3")

# Configuration consumed by LawsTrainDatasetCrossLingual, plus the paths of the
# id -> text lookup tables used by the collator.
data_args = EasyDict()
data_args.small_threshold = 0
data_args.drop_threshold = 0
data_args.train_group_size = 2
data_args.train_data = ["data/cross_lingual/merged_train_data_splitted"]
data_args.knowledge_distillation = False
data_args.cache_path = "./.cache"
data_args.max_example_num_per_dataset = None
data_args.shuffle_ratio = 0
data_args.query_instruction_for_retrieval = None
data_args.passage_instruction_for_retrieval = None
data_args.merged_queries_vi = "data/cross_lingual/merged_queries_vi.json"
data_args.merged_queries_en = "data/cross_lingual/merged_queries_en.json"
data_args.merged_corpus_vi = "data/cross_lingual/merged_corpus_vi.json"
data_args.merged_corpus_en = "data/cross_lingual/merged_corpus_en.json"


def _load_json(path):
    """Parse a UTF-8 JSON file, closing the handle as soon as it has been read."""
    with open(path, encoding='utf-8') as json_file:
        return json.load(json_file)


merged_queries_vi = _load_json(data_args.merged_queries_vi)
merged_corpus_vi = _load_json(data_args.merged_corpus_vi)
merged_queries_en = _load_json(data_args.merged_queries_en)
merged_corpus_en = _load_json(data_args.merged_corpus_en)

train_dataset = LawsTrainDatasetCrossLingual(args=data_args,
                                            batch_size=1,
                                            seed=1234,
                                            num_processes=1,
                                            process_index=0)

data_collator = EmbedCollatorCrossLingual(
        tokenizer,
        merged_queries_vi=merged_queries_vi,
        merged_corpus_vi=merged_corpus_vi,
        merged_queries_en=merged_queries_en,
        merged_corpus_en=merged_corpus_en,
        query_max_len=128,
        passage_max_len=2048
    )

In [None]:
from torch.utils.data import DataLoader

data_loader = DataLoader(train_dataset, collate_fn=data_collator, drop_last=True)

In [None]:
len(train_dataset)

In [None]:
for i, batch in enumerate(data_loader):
    a = batch
    break

In [None]:
len(a['query']), len(a['passage'])

In [None]:
a

### Sequence length

In [None]:
from datasets import load_dataset, Dataset
import json
import numpy as np
import os
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-m3")

In [None]:
# Load the corpus ({id: text}) and expose it as a Dataset with `id` and `text` columns.
# NOTE(review): the variables are named `tvpl_*` but the file read is the zalo_qa corpus.
with open("data/eval/zalo_qa/filtered_corpus.json", encoding='utf-8') as corpus_file:
    tvpl_corpus = json.load(corpus_file)
tvpl_corpus_dict = {"id": list(tvpl_corpus.keys()), "text": list(tvpl_corpus.values())}
tvpl_corpus_dataset = Dataset.from_dict(tvpl_corpus_dict)
tvpl_corpus_dataset

In [None]:
tvpl_corpus_dataset = tvpl_corpus_dataset.map(lambda x: {"length": len(tokenizer(x['text'])['input_ids'])}, num_proc=32)

In [None]:
len(tvpl_corpus_dataset.filter(lambda x: 1024 <= x['length'] < 2048, num_proc=32)) / len(tvpl_corpus_dataset) * 100

### Training and Evaluation samples

In [6]:
import json
import os

In [3]:
# Count training samples per source dataset. The dataset name is the part of the
# query id before the first underscore (e.g. 'squadv2_4778' -> 'squadv2').
train_sample_count = {}

# Explicit UTF-8, like every other file read in this notebook, so the result
# does not depend on the platform's default encoding.
with open("../data/cross_lingual/merged_train_data_ids.jsonl", encoding='utf-8') as fIn:
    for line in fIn:
        data = json.loads(line)
        dataset_name = data['query'].split('_')[0]

        train_sample_count[dataset_name] = train_sample_count.get(dataset_name, 0) + 1

In [5]:
for k, v in train_sample_count.items():
    print(k, v)

tvpl 165347
zalolegal 2556
msmarco 457361
squadv2 60942


In [7]:
# Print the number of entries in each evaluation set's dev_rel_docs.json.
eval_dataset_names = ["law", "zalo_legal", "zalo_qa"]
for name in eval_dataset_names:
    # `with` closes the file on every iteration instead of leaking the handle.
    with open(f"../data/eval/{name}/dev_rel_docs.json", encoding='utf-8') as rel_docs_file:
        dev_rel_docs = json.load(rel_docs_file)
    print(name, len(dev_rel_docs))

law 9992
zalo_legal 640
zalo_qa 4399


In [8]:
# Print the number of entries in each evaluation set's filtered_corpus.json.
for name in ["law", "zalo_legal", "zalo_qa"]:
    # `with` closes the file on every iteration instead of leaking the handle.
    with open(f"../data/eval/{name}/filtered_corpus.json", encoding='utf-8') as corpus_file:
        filtered_corpus = json.load(corpus_file)
    print(name, len(filtered_corpus))

law 224008
zalo_legal 61060
zalo_qa 15957


### Number of relevant documents

In [5]:
import json
import os
from datasets import load_dataset
import matplotlib.pyplot as plt

In [2]:
# For every training sample, record how many positive passages it has, grouped
# by source dataset (the prefix of the query id before the first underscore).
num_rel_docs = {
    "tvpl": [],
    "zalolegal": [],
    "msmarco": [],
    "squadv2": []
}

with open("../data/cross_lingual/merged_train_data_ids.jsonl", encoding='utf-8') as fIn:
    for line in fIn:
        data = json.loads(line)
        dataset_name = data['query'].split('_')[0]

        # `setdefault` keeps the four pre-declared keys and also accepts a
        # prefix that is not listed above instead of failing with KeyError.
        num_rel_docs.setdefault(dataset_name, []).append(len(data['pos']))

In [38]:
from collections import Counter
c = Counter(num_rel_docs['squadv2'])
c

Counter({1: 60942})

In [37]:
515 + 112 + 16 + 7

650

In [25]:
# Imported here so this cell does not rely on the import made in another cell.
from collections import Counter

with open("../data/eval/zalo_qa/dev_rel_docs.json", encoding='utf-8') as rel_docs_file:
    dev_rel_docs = json.load(rel_docs_file)
# Length of each entry of dev_rel_docs, then the histogram of those lengths.
l = [len(rel_docs) for rel_docs in dev_rel_docs.values()]
Counter(l)

Counter({1: 3460, 2: 682, 3: 175, 4: 52, 5: 19, 6: 7, 7: 4})

In [76]:
import json
from datasets import load_dataset

train_ids = load_dataset("json", data_files="../data/cross_lingual/merged_train_data_ids.jsonl", split="train")
train_ids[630042]

{'query': 'squadv2_4778',
 'pos': ['squadv2_768'],
 'neg': ['squadv2_702',
  'squadv2_767',
  'squadv2_696',
  'squadv2_766',
  'squadv2_712']}

In [26]:
# Load the Vietnamese / English query and corpus lookup tables. Each file is
# closed as soon as it has been parsed (the handles used to leak).
with open("../data/cross_lingual/merged_queries_vi.json", encoding='utf-8') as json_file:
    queries_vi = json.load(json_file)
with open("../data/cross_lingual/merged_queries_en.json", encoding='utf-8') as json_file:
    queries_en = json.load(json_file)
with open("../data/cross_lingual/merged_corpus_vi.json", encoding='utf-8') as json_file:
    corpus_vi = json.load(json_file)
with open("../data/cross_lingual/merged_corpus_en.json", encoding='utf-8') as json_file:
    corpus_en = json.load(json_file)

KeyboardInterrupt: 

In [79]:
qid = 'squadv2_4778'
pid = 'squadv2_768'
npid = 'squadv2_702'
queries_vi[qid], corpus_vi[pid], corpus_vi[npid]

('Kanye West đã bán được tổng cộng bao nhiêu bài hát kỹ thuật số ở Mỹ?',
 'Sáu album phòng thu solo đầu tiên của West, tất cả đều đạt đĩa bạch kim, đã nhận được nhiều giải thưởng và sự hoan nghênh của giới phê bình. Tất cả các album của anh đều thành công về mặt thương mại, trong đó Yeezus, album solo thứ sáu của anh, trở thành album quán quân thứ năm liên tiếp tại Mỹ khi phát hành. West đã có sáu bài hát vượt quá 3 triệu lượt bán kỹ thuật số tính đến tháng 12 năm 2012, với "Gold Digger" bán được 3.086.000, "Stronger" bán được 4.402.000, "Heartless" bán được 3.742.000, "E.T." bán được hơn 4.000.000, "Love Lockdown" bán được hơn 3.000.000 và "Niggas in Paris" bán được hơn 3.000.000, giúp anh đứng thứ ba về tổng doanh số bán kỹ thuật số trong thập kỷ qua. Anh đã bán được hơn 30 triệu bài hát kỹ thuật số tại Hoa Kỳ, khiến anh trở thành một trong những nghệ sĩ kỹ thuật số bán chạy nhất mọi thời đại.',
 'West đã dành phần lớn thời gian cuối những năm 1990 để sản xuất đĩa hát cho một số nghệ