### MSMARCO

In [52]:
import gzip
from tqdm import tqdm
import json
import pandas as pd
import numpy as np

msmarco_triplets_filepath = "../data/original/msmarco/msmarco-hard-negatives.jsonl.gz"
# A candidate is accepted as a negative only if its cross-encoder score is at
# least this far below the lowest-scoring positive of the same query.
ce_score_margin = 3
# Upper bound on the number of new negatives taken from each retrieval system.
num_negs_per_system = 3

# queries = pd.read_csv("../data/original/msmarco/queries_translated.train.tsv", sep='\t',
#                       header=None, names=['id', 'text'], index_col='id',
#                       encoding='utf-8')['text']

# queries_en = pd.read_csv("../data/original/msmarco/queries.train.tsv", sep='\t',
#                       header=None, names=['id', 'text'], index_col='id',
#                       encoding='utf-8')['text']

# corpus = pd.read_csv("../data/original/msmarco/collection_translated.tsv",
#                      sep='\t', header=None, names=['id', 'text'], index_col='id', encoding='utf-8')['text']

# corpus_en = pd.read_csv("../data/original/msmarco/collection.tsv",
#                      sep='\t', header=None, names=['id', 'text'], index_col='id', encoding='utf-8')['text']

# 0-based line numbers of the records in the triplets file that are kept.
with open("../data/original/msmarco/idx.npy", 'rb') as f:
    filtered_idx = np.load(f)

# `value in ndarray` scans the whole array on every call; a set of plain Python
# numbers answers the same membership question in constant time per line.
filtered_idx_lookup = set(np.asarray(filtered_idx).ravel().tolist())

msmarco_train_data = []
with gzip.open(msmarco_triplets_filepath, 'rt', encoding='utf8') as fIn:
    # total is only used to size the progress bar (presumably the line count of the file).
    for current_idx, line in enumerate(tqdm(fIn, total=502939)):
        if current_idx not in filtered_idx_lookup:
            continue

        data = json.loads(line)

        # Without positives there is no score threshold to compare against
        # (min() of an empty list raises), so the record is skipped.
        if not data['pos']:
            continue

        # Get the positive passage ids
        pos_pids = [item['pid'] for item in data['pos']]
        pos_min_ce_score = min(item['ce-score'] for item in data['pos'])
        ce_score_threshold = pos_min_ce_score - ce_score_margin

        # Get the hard negatives. A list keeps first-seen order and stays
        # JSON-serialisable; the companion set handles de-duplication.
        neg_pids = []
        seen_neg_pids = set()
        for system_negs in data['neg'].values():
            negs_added = 0
            for item in system_negs:
                # Too close to the positives: likely a false negative.
                if item['ce-score'] > ce_score_threshold:
                    continue

                neg_pid = item['pid']
                if neg_pid in seen_neg_pids:
                    continue

                seen_neg_pids.add(neg_pid)
                neg_pids.append(neg_pid)
                negs_added += 1
                if negs_added >= num_negs_per_system:
                    break

        if neg_pids:
            msmarco_train_data.append({
                #    'query': queries[int(data['qid'])],
                #    'pos': [corpus[int(pos_pid)] for pos_pid in pos_pids],
                #    'neg': [corpus[int(neg_pid)] for neg_pid in neg_pids],
                'query': data['qid'],
                'pos': pos_pids,
                'neg': neg_pids,
            })

  0%|          | 99/502939 [00:00<02:53, 2895.21it/s]


('nghệ thuật tự do là gì?',
 'what are the liberal arts?',
 'nghệ thuật tự do. 1. khóa học giảng dạy tại một trường cao đẳng nhằm cung cấp kiến \u200b\u200bthức chung và bao gồm nghệ thuật, nhân văn, khoa học tự nhiên và khoa học xã hội, trái ngược với các môn học chuyên môn hoặc kỹ thuật.',
 'liberal arts. 1. the academic course of instruction at a college intended to provide general knowledge and comprising the arts, humanities, natural sciences, and social sciences, as opposed to professional or technical subjects.')

In [91]:
# Pick the `idx`-th kept record (1-based, counted over lines whose number is in
# `filtered_idx`) and pull out its first positive plus the `n_idx`-th BM25 negative.
idx = 1
c = 0
n_idx = 2

with gzip.open(msmarco_triplets_filepath, 'rt', encoding='utf8') as fIn:
    for current_idx, line in enumerate(tqdm(fIn, total=502939)):
        if current_idx not in filtered_idx:
            continue

        c += 1
        data = json.loads(line)
        if c == idx:
            break

first_positive = data['pos'][0]
chosen_negative = data['neg']['bm25'][n_idx]

qid = data['qid']
pid, p_ce_score = first_positive['pid'], first_positive['ce-score']
npid, n_ce_score = chosen_negative['pid'], chosen_negative['ce-score']

  0%|          | 0/502939 [00:00<?, ?it/s]


In [92]:
# Print the sampled query, its positive passage and the selected negative,
# each passage followed by its cross-encoder score.
# NOTE(review): `queries` and `corpus` are only assigned in commented-out code
# above and in a later cell, so this cell relies on one of those having been run.
query_text = queries[int(qid)]
print(query_text)

positive_passage = corpus[int(pid)]
print(positive_passage)
print("positive ce score: ", p_ce_score)

negative_passage = corpus[int(npid)]
print(negative_passage)
print("negative ce score: ", n_ce_score)

nghệ thuật tự do là gì?
nghệ thuật tự do. 1. khóa học giảng dạy tại một trường cao đẳng nhằm cung cấp kiến ​​thức chung và bao gồm nghệ thuật, nhân văn, khoa học tự nhiên và khoa học xã hội, trái ngược với các môn học chuyên môn hoặc kỹ thuật.
positive ce score:  10.257502555847168
Nghiên cứu tự do là gì? Nghiên cứu tự do, còn được gọi là nghệ thuật tự do, bao gồm sự khám phá rộng rãi về khoa học xã hội, khoa học tự nhiên, nhân văn và nghệ thuật. Nếu bạn quan tâm đến một nền giáo dục đa dạng về nhân văn, giao tiếp và tư duy, hãy đọc tiếp để tìm hiểu về các khả năng giáo dục và nghề nghiệp trong các nghiên cứu tự do.
negative ce score:  8.866464614868164


In [86]:
# Show the raw triplet record currently bound to `data` (qid, positives, per-system negatives).
data

{'qid': '571018',
 'pos': [{'pid': '7349777', 'ce-score': 10.257502555847168}],
 'neg': {'bm25': [{'pid': '6948601',
    'bm25-score': 26.068258,
    'ce-score': 3.581289291381836},
   {'pid': '5129919', 'bm25-score': 26.007563, 'ce-score': 8.257364273071289},
   {'pid': '6717931', 'bm25-score': 25.934547, 'ce-score': 8.866464614868164},
   {'pid': '1065943', 'bm25-score': 25.258688, 'ce-score': 5.258519172668457},
   {'pid': '1626276', 'bm25-score': 25.153275, 'ce-score': 4.19371223449707},
   {'pid': '981824', 'bm25-score': 24.716213, 'ce-score': 8.563857078552246},
   {'pid': '6449111', 'bm25-score': 24.62115, 'ce-score': 4.907355785369873},
   {'pid': '1028927', 'bm25-score': 24.54112, 'ce-score': 7.617893695831299},
   {'pid': '2524942', 'bm25-score': 24.355902, 'ce-score': 1.5268436670303345},
   {'pid': '5810175',
    'bm25-score': 24.315733,
    'ce-score': -0.6152520179748535},
   {'pid': '6236527', 'bm25-score': 24.3093, 'ce-score': -2.9456772804260254},
   {'pid': '7179545',

### Only positive

In [None]:
import gzip
from tqdm import tqdm
import json
import pandas as pd
import numpy as np

def _strip_invisible(text):
    """Drop zero-width spaces and turn non-breaking spaces into ordinary spaces."""
    return text.replace("\u200b", "").replace(u'\xa0', u' ')


# id -> translated query text
queries = pd.read_csv("../data/msmarco/queries_translated.train.tsv", sep='\t',
                      header=None, names=["id", "text"], index_col="id",
                      encoding='utf-8')['text']

# id -> translated passage text
corpus = pd.read_csv("../data/msmarco/collection_translated.tsv", sep='\t',
                     header=None, names=["id", "text"], index_col="id",
                     encoding='utf-8')['text']

# Line numbers of the records to keep. No other cell binds `idx` to a collection
# (elsewhere it is only an int counter), so it is loaded here explicitly.
# NOTE(review): this is the idx.npy location used by the other MSMARCO cells;
# the remaining inputs of this cell live under ../data/msmarco - confirm the path.
with open("../data/original/msmarco/idx.npy", "rb") as f:
    idx = set(np.load(f).ravel().tolist())

msmarco_train_data = []

with gzip.open("../data/msmarco/msmarco-hard-negatives.jsonl.gz", 'rt', encoding='utf8') as fIn:
    # total is only used to size the progress bar.
    for index, line in enumerate(tqdm(fIn, total=502939)):
        if index not in idx:
            continue

        sample = json.loads(line)

        # Only the query and its positives are kept; negatives are not used here.
        msmarco_train_data.append({
            "query": _strip_invisible(queries[int(sample["qid"])]),
            "pos": [_strip_invisible(corpus[int(pos["pid"])]) for pos in sample["pos"]],
        })

### SQuADv2

In [None]:
import json
import pandas as pd
from tqdm import tqdm

# Load the SQuADv2 inputs. Each file is opened in a `with` block so the handle
# is closed as soon as it has been parsed instead of being left to the GC.
with open('../data/original/squadv2/queries.json', encoding='utf-8') as f:
    queries = json.load(f)  # looked up by query id when building samples

with open('../data/original/squadv2/collections.json', encoding='utf-8') as f:
    corpus = json.load(f)  # looked up by passage id when building samples

with open('../data/original/squadv2/hard_negatives_biencoder_top5.json', encoding='utf-8') as f:
    rel_docs = json.load(f)  # query id -> {'pos': [...], 'neg': [...]}

In [None]:
# Turn the id-based relevance mapping into text samples: one dict per query
# holding the query text and the texts of its positive and negative passages.
squadv2_train_data = []

for query_id, doc_ids in tqdm(rel_docs.items()):
    squadv2_train_data.append({
        'query': queries[query_id],
        'pos': [corpus[doc_id] for doc_id in doc_ids['pos']],
        'neg': [corpus[doc_id] for doc_id in doc_ids['neg']],
    })

In [None]:
# Spot-check one assembled sample (query text with its positive/negative passages).
squadv2_train_data[10]

### Save data

In [None]:
import os
import random

# A fixed seed makes the shuffled order, and therefore the written file,
# identical across kernel restarts.
SHUFFLE_SEED = 42
merged_output_path = "../data/sft/msmarco_squadv2_splitted/msmarco_squadv2_len-0-512.jsonl"

# Concatenate both training sets and shuffle them with a private RNG so the
# global `random` state is left untouched.
merge_data = msmarco_train_data + squadv2_train_data
random.Random(SHUFFLE_SEED).shuffle(merge_data)

# Create the target directory on first run instead of failing in open().
os.makedirs(os.path.dirname(merged_output_path), exist_ok=True)

# One JSON object per line; ensure_ascii=False keeps non-ASCII text readable.
with open(merged_output_path, "w", encoding="utf-8") as fOut:
    for sample in merge_data:
        fOut.write(json.dumps(sample, ensure_ascii=False) + '\n')

In [None]:
from huggingface_hub import HfApi
# Hugging Face Hub client; the upload call in the following cell goes through it.
api = HfApi()
# Disabled single-file upload of the merged training set to the
# `nntoan209/GenericTraining` dataset repository.
# api.upload_file(
#     path_or_fileobj="../data/final/train/generic/msmarco_squadv2_train_data_len-0-512.jsonl",
#     path_in_repo="train_generic/msmarco_squadv2_train_data_len-0-512.jsonl",
#     repo_id="nntoan209/GenericTraining",
#     repo_type="dataset"
# )

In [None]:
# Upload the contents of ../data/eval/law to the root of the
# `nntoan209/LawEval` dataset repository.
# NOTE(review): this is a network side effect on every run and presumably needs
# valid Hub credentials - confirm before re-executing the notebook end to end.
api.upload_folder(
    folder_path="../data/eval/law",
    path_in_repo=".",
    repo_id="nntoan209/LawEval",
    repo_type="dataset"
)

### MSMARCO Translation

In [None]:
import json
import gzip
import tarfile
import numpy as np
import pandas as pd
from tqdm import tqdm

In [None]:
# with tarfile.open("../data/original/msmarco/queries.tar.gz", "r:gz") as tar:
#     tar.extractall(path="../data/original/msmarco")
# with tarfile.open("../data/original/msmarco/collection.tar.gz", "r:gz") as tar:
#     tar.extractall(path="../data/original/msmarco")

In [None]:
def _iter_id_text_rows(path):
    """Yield ``(int_id, text)`` for every non-blank line of a two-column TSV file.

    Only the first tab separates the id from the text, so a text that itself
    contains a tab, or a row whose text is empty, no longer breaks the
    two-value unpacking. Surrounding whitespace of the line is stripped.
    """
    with open(path, "r", encoding="utf8") as fIn:
        for line in fIn:
            row = line.strip()
            if not row:
                continue
            raw_id, _, text = row.partition("\t")
            yield int(raw_id), text


# Translated queries, keyed by integer query id.
queries_vi = dict(_iter_id_text_rows("../data/original/msmarco/queries_translated.train.tsv"))

# English originals, restricted to the ids that have a translation.
queries_en = {
    qid: query
    for qid, query in _iter_id_text_rows("../data/original/msmarco/queries.train.tsv")
    if qid in queries_vi
}

In [None]:
# Compare the sizes of the translated and English query maps.
len(queries_vi), len(queries_en)

In [None]:
# Write both query maps as pretty-printed UTF-8 JSON.
# The integer ids become string keys in the output, as JSON object keys must be strings.
for out_path, query_map in (
    ("../data/eval/msmarco/queries.json", queries_vi),
    ("../data/eval/msmarco/queries_en.json", queries_en),
):
    with open(out_path, "w", encoding="utf-8") as fOut:
        json.dump(query_map, fOut, ensure_ascii=False, indent=2)

In [None]:
msmarco_triplets_filepath = "../data/original/msmarco/msmarco-hard-negatives.jsonl.gz"
# A candidate is accepted as a negative only if its cross-encoder score is at
# least this far below the lowest-scoring positive of the same query.
ce_score_margin = 3
# Upper bound on the number of new negatives taken from each retrieval system.
num_negs_per_system = 3

# 0-based line numbers of the records in the triplets file that are kept.
with open("../data/original/msmarco/idx.npy", 'rb') as f:
    filtered_idx = np.load(f)

# `value in ndarray` scans the whole array on every call; a set of plain Python
# numbers answers the same membership question in constant time per line.
filtered_idx_lookup = set(np.asarray(filtered_idx).ravel().tolist())

msmarco_train_data = []
with gzip.open(msmarco_triplets_filepath, 'rt', encoding='utf8') as fIn:
    # total is only used to size the progress bar (presumably the line count of the file).
    for current_idx, line in enumerate(tqdm(fIn, total=502939)):
        if current_idx not in filtered_idx_lookup:
            continue

        data = json.loads(line)

        # Without positives there is no score threshold to compare against
        # (min() of an empty list raises), so the record is skipped.
        if not data['pos']:
            continue

        # Get the positive passage ids
        pos_pids = [item['pid'] for item in data['pos']]
        pos_min_ce_score = min(item['ce-score'] for item in data['pos'])
        ce_score_threshold = pos_min_ce_score - ce_score_margin

        # Get the hard negatives. A list keeps first-seen order, so the file
        # written from these records is the same on every run; the companion
        # set handles de-duplication.
        neg_pids = []
        seen_neg_pids = set()
        for system_negs in data['neg'].values():
            negs_added = 0
            for item in system_negs:
                # Too close to the positives: likely a false negative.
                if item['ce-score'] > ce_score_threshold:
                    continue

                neg_pid = item['pid']
                if neg_pid in seen_neg_pids:
                    continue

                seen_neg_pids.add(neg_pid)
                neg_pids.append(neg_pid)
                negs_added += 1
                if negs_added >= num_negs_per_system:
                    break

        if neg_pids:
            msmarco_train_data.append({'query': str(data['qid']),
                                       'pos': pos_pids,
                                       'neg': neg_pids})

In [None]:
# Every passage id referenced by any training sample, positives and negatives alike.
all_pids = {
    passage_id
    for sample in msmarco_train_data
    for field in ('pos', 'neg')
    for passage_id in sample[field]
}

In [None]:
collection_vi = {}
collection_en = {}

# Translated passages: keep only the ones referenced by the training samples.
with open("../data/original/msmarco/collection_translated.tsv", "r", encoding='utf-8') as fIn:
    for line in fIn:
        row = line.strip()
        if not row:
            continue
        # Split on the first tab only, so a passage containing a tab or an
        # empty passage does not break the id/text unpacking.
        pid, _, passage = row.partition("\t")
        # `all_pids` is matched against the id in its textual form.
        if str(pid) in all_pids:
            collection_vi[int(pid)] = passage

# English originals, restricted to the passages kept above.
with open("../data/original/msmarco/collection.tsv", "r", encoding='utf-8') as fIn:
    for line in fIn:
        row = line.strip()
        if not row:
            continue
        pid, _, passage = row.partition("\t")
        pid = int(pid)
        if pid in collection_vi:
            collection_en[pid] = passage

In [None]:
# Write both passage maps as pretty-printed UTF-8 JSON.
for out_path, passage_map in (
    ("../data/eval/msmarco/collections.json", collection_vi),
    ("../data/eval/msmarco/collections_en.json", collection_en),
):
    with open(out_path, "w", encoding="utf-8") as fOut:
        json.dump(passage_map, fOut, ensure_ascii=False, indent=2)

In [None]:
# Write the id-based training samples as JSON Lines.
with open("../data/sft/msmarco_train_data_minedHN.jsonl", "w", encoding="utf-8") as fOut:
    for record in msmarco_train_data:
        # Negatives may be held in a set, which json cannot encode; the record
        # itself is updated to carry a list.
        record['neg'] = list(record['neg'])
        print(json.dumps(record, ensure_ascii=False), file=fOut)

### SQuADv2 Translation

In [1]:

from datasets import load_dataset
import json

def _read_json(path):
    """Parse a UTF-8 encoded JSON file and close its handle before returning."""
    with open(path, encoding='utf-8') as f:
        return json.load(f)


# original_dataset = load_dataset("parquet", data_files="../data/original/squadv2/train-00000-of-00001.parquet", split="train")
# English SQuADv2 articles: train split followed by dev split.
original_dataset = (
    _read_json("../data/original/squadv2/train-v2.0.json")['data']
    + _read_json("../data/original/squadv2/dev-v2.0.json")['data']
)
collection_vi = _read_json("../data/original/squadv2/collections.json")
queries_vi = _read_json("../data/original/squadv2/queries.json")
# Read with an explicit encoding like every other input, so the result does not
# depend on the platform's default.
hard_neg = _read_json("../data/original/squadv2/hard_negatives_biencoder_top5.json")
# squadv2_translated_train = json.load(open("../data/original/squadv2/train-v2.0-translated.json", encoding='utf-8'))['data']
# squadv2_translated_dev = json.load(open("../data/original/squadv2/dev-v2.0-translated.json", encoding='utf-8'))['data']
# Translated articles, in the same train-then-dev order as `original_dataset`.
squadv2_translated = (
    _read_json("../data/original/squadv2/train-v2.0-translated.json")['data']
    + _read_json("../data/original/squadv2/dev-v2.0-translated.json")['data']
)

In [10]:
# Titles of all translated articles, in file order.
title_vi = []
for article in squadv2_translated:
    title_vi.append(article['title'])
len(title_vi)

310

In [11]:
# English articles whose title also occurs among the translated articles.
original_dataset_align = list(
    filter(lambda article: article['title'] in title_vi, original_dataset)
)
len(original_dataset_align)

310

In [18]:
# Number of translated queries loaded from queries.json.
len(queries_vi)

60942

In [33]:
# English questions of the aligned articles, skipping the ones flagged
# `is_impossible`, keyed by their running position as a string.
answerable_questions = [
    qa['question']
    for article in original_dataset_align
    for paragraph in article['paragraphs']
    for qa in paragraph['qas']
    if not qa['is_impossible']
]
queries_en = {str(position): question for position, question in enumerate(answerable_questions)}
# Total number of questions collected.
idx = len(queries_en)

In [36]:
# Write both SQuADv2 query maps as pretty-printed UTF-8 JSON.
for out_path, query_map in (
    ("../data/eval/squadv2/queries.json", queries_vi),
    ("../data/eval/squadv2/queries_en.json", queries_en),
):
    with open(out_path, "w", encoding='utf-8') as f:
        json.dump(query_map, f, ensure_ascii=False, indent=2)

In [38]:
# Number of translated passages loaded from collections.json.
len(collection_vi)

13317

In [43]:
# English paragraph contexts of the aligned articles, keyed by their running
# position as a string.
english_contexts = [
    paragraph['context']
    for article in original_dataset_align
    for paragraph in article['paragraphs']
]
collection_en = {str(position): context for position, context in enumerate(english_contexts)}
# Total number of contexts collected.
idx = len(collection_en)

In [44]:
# Number of English contexts collected; compare with the size of `collection_vi`.
len(collection_en)

13317

In [54]:
# Write both SQuADv2 passage maps as pretty-printed UTF-8 JSON.
for out_path, passage_map in (
    ("../data/eval/squadv2/collections.json", collection_vi),
    ("../data/eval/squadv2/collections_en.json", collection_en),
):
    with open(out_path, "w", encoding='utf-8') as f:
        json.dump(passage_map, f, ensure_ascii=False, indent=2)

In [3]:
# Flatten the hard-negative mapping into one id-only record per query.
squadv2_train_data_ids = [
    {
        "query": query_id,
        "pos": candidates['pos'],
        "neg": candidates['neg'],
    }
    for query_id, candidates in hard_neg.items()
]

In [6]:
# Write the id-only records one JSON object per line.
# NOTE(review): the content is JSON Lines although the file name ends in .json.
with open("../data/sft/squadv2_train_data_minedHN_ids.json", "w", encoding='utf-8') as f:
    for record in squadv2_train_data_ids:
        print(json.dumps(record, ensure_ascii=False), file=f)