### SFT finetuning

In [None]:
import os
import json
from tqdm import tqdm
from datasets import load_dataset, concatenate_datasets

In [None]:
# Build one lookup table from passage id to passage text across the three
# source corpora and persist it as JSON.
# Keys are normalised with str() so the in-memory dict agrees with the later
# cells, which look passages up via original_corpus[str(item['oid'])].
# json.dump stringifies int keys anyway, so the written file keeps the same keys.
original_corpus = {}

zalo_corpus = load_dataset("parquet", data_dir="../data/tvpl_new/dedup",
                            data_files="zalo_legal_corpus.parquet",
                            split="train",
                            num_proc=16)

# Zalo passages are indexed as title followed by body text.
for item in tqdm(zalo_corpus):
    original_corpus[str(item['oid'])] = item['title'] + " " + item['text']

sft_corpus = load_dataset("parquet", data_dir="../data/tvpl_new/dedup",
                    data_files=["sft_train.parquet", "sft_test.parquet"],
                    split="train",
                    num_proc=16)

for item in tqdm(sft_corpus):
    original_corpus[str(item['oid'])] = item['text']

tvpl_corpus = load_dataset("parquet", data_dir="../data/tvpl_new/dedup",
                            data_files=["tvpl_dataset.parquet"],
                            split="train",
                            num_proc=16)

# Each TVPL row carries its passages under 'child_data'.
for law in tqdm(tvpl_corpus):
    for item in law['child_data']:
        original_corpus[str(item['oid'])] = item['text']

with open("../data/final/corpus/original_corpus.json", "w", encoding="utf-8") as fOut:
    json.dump(original_corpus, fOut, indent=2)

In [None]:
# Load the deduplicated corpus and key every surviving passage by its cluster id.
filtered_corpus = load_dataset(
    "parquet",
    data_dir="../data/tvpl_new/dedup",
    data_files="filtered_corpus.parquet",
    split="train",
    num_proc=16,
)

# cluster id (as string) -> passage text, resolved through the passage's oid.
filtered_corpus_indexed = {
    str(row['__cluster__']): original_corpus[str(row['oid'])]
    for row in tqdm(filtered_corpus)
}

with open("../data/final/corpus/filtered_corpus.json", "w", encoding="utf-8") as fOut:
    json.dump(filtered_corpus_indexed, fOut, indent=2)

In [None]:
train_split = load_dataset(
    "parquet",
    data_dir="../data/tvpl_new/dedup/newtraintestdivide",
    data_files="train.parquet",
    split="train",
    num_proc=16,
)

# One training record per row: the question plus its single positive passage text.
train_data = [
    {"query": row['question'], "pos": [original_corpus[str(row['oid'])]]}
    for row in tqdm(train_split)
]

# JSON Lines output with non-ASCII characters left unescaped.
with open("../data/final/train/train_data.jsonl", "w", encoding="utf-8") as fOut:
    fOut.writelines(json.dumps(record, ensure_ascii=False) + '\n' for record in train_data)


In [None]:
test_split = load_dataset(
    "parquet",
    data_dir="../data/tvpl_new/dedup/newtraintestdivide",
    data_files="test.parquet",
    split="train",
    num_proc=16,
)

dev_queries = {}
dev_rel_docs = {}

# Queries are numbered in dataset order; each maps to one relevant cluster id.
for position, row in enumerate(tqdm(test_split)):
    query_id = f"query_{position}"
    dev_queries[query_id] = row['question']
    dev_rel_docs[query_id] = str(row['__cluster__'])

for path, payload in (
    ("../data/final/test/dev_queries.json", dev_queries),
    ("../data/final/test/dev_rel_docs.json", dev_rel_docs),
):
    with open(path, "w", encoding="utf-8") as fOut:
        json.dump(payload, fOut, indent=2)


### Generic pre-training

In [None]:
from datasets import load_dataset
import json
import pandas as pd
import numpy as np
import os
import random

In [None]:
# Load the structured document dataset from parquet; the trailing expression
# displays the dataset summary in the notebook.
structured_data_doc = load_dataset("parquet", data_files="../data/tvpl_new/structured_data_doc.parquet",
                                   split="train", num_proc=32)
structured_data_doc

In [None]:
### Create a temporary dataset, each item is a chunk in sample['data']
def _chunks_to_rows(batch):
    """Expand a one-document batch into one output row per chunk ``full_text``."""
    document_chunks = batch['data'][0]
    return {"chunk_text": [chunk['full_text'] for chunk in document_chunks]}


# batch_size=1 means each call sees a single document; a batched map may return
# a different number of rows than it received, which is what flattens the chunks.
temp_dataset = structured_data_doc.map(
    _chunks_to_rows,
    batched=True,
    batch_size=1,
    remove_columns=structured_data_doc.column_names,
    num_proc=16,
)

In [None]:
temp_dataset[0]

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-m3")


def _token_count(text):
    """Return the number of input ids the tokenizer produces for *text*."""
    return len(tokenizer(text)["input_ids"])


# chunk_size is measured in tokenizer ids because length_function counts them.
text_splitter = RecursiveCharacterTextSplitter(
    separators=[".", ";", ",", " ", ""],
    chunk_size=256,
    chunk_overlap=0,
    length_function=_token_count,
    is_separator_regex=False,
    keep_separator=False,
)

In [None]:
from string import punctuation

def process_dataset(sample):
    """Sample a (query, positive) pair of sub-chunks from one document chunk.

    ``sample['chunk_text']`` is split with the module-level ``text_splitter``.
    Two distinct pieces are drawn at random and re-drawn until both are at
    least 64 tokens long according to ``tokenizer``.

    Returns:
        ``{"query": str, "pos": [str]}`` on success, or
        ``{"query": None, "pos": None}`` when fewer than two pieces exist or no
        acceptable pair was drawn within the retry budget.
    """
    min_tokens = 64
    max_retries = 500
    rejected = {"query": None, "pos": None}

    texts = text_splitter.split_text(sample['chunk_text'])

    # random.sample(texts, 2) raises ValueError for a population smaller than
    # two, so an empty split must be rejected as well as a single piece.
    if len(texts) < 2:
        return rejected

    def long_enough(text):
        return len(tokenizer(text)['input_ids']) >= min_tokens

    # sample query and positive
    query, pos = random.sample(texts, 2)
    retry = 0
    while not (long_enough(query) and long_enough(pos)):
        if retry > max_retries:
            return rejected
        query, pos = random.sample(texts, 2)
        retry += 1

    # remove leading punctuation
    query = query.lstrip(punctuation + " ")
    pos = pos.lstrip(punctuation + " ")

    return {"query": query, "pos": [pos]}
    

In [None]:
def filter_func(sample):
    """Keep samples that have a query and a positive, each at most 8192 tokens long."""
    query, positives = sample['query'], sample['pos']
    if query is None or positives is None:
        return False
    max_tokens = 8192
    return all(
        len(tokenizer(text)['input_ids']) <= max_tokens
        for text in (query, positives[0])
    )


# Draw one (query, positive) pair per chunk, then drop the rejected or oversized ones.
final_dataset = temp_dataset.map(
    process_dataset,
    remove_columns="chunk_text",
    num_proc=32,
)
final_dataset = final_dataset.filter(filter_func, num_proc=32)

In [None]:
final_dataset

In [None]:
# Write the filtered (query, pos) pairs as JSON Lines for generic pre-training.
final_dataset.to_json("../data/final/train/generic/structured_data_doc.jsonl", orient="records", lines=True, num_proc=16)

### New SFT data

In [None]:
from datasets import load_dataset
from tqdm import tqdm
from collections import Counter
import json

train_data = load_dataset("parquet", data_files="../data/original/tvpl/dedup/tvpl_sft_resplit/train.parquet",
                          split="train", num_proc=16)

test_data = load_dataset("parquet", data_files="../data/original/tvpl/dedup/tvpl_sft_resplit/test.parquet",
                         split="train", num_proc=16)

# Context managers close the JSON files deterministically instead of leaving
# the handles to the garbage collector.
with open("../data/eval/law/filtered_corpus.json", encoding='utf-8') as fIn:
    filtered_corpus = json.load(fIn)
# filtered_corpus = load_dataset("parquet", data_files="../data/original/tvpl/dedup/merged_corpus/filtered_corpus.parquet", split="train")
with open("../data/eval/law/reindexed_corpus.json", encoding='utf-8') as fIn:
    reindexed_corpus = json.load(fIn)

In [None]:
train_data[20]

In [None]:
filtered_corpus['41938']

In [None]:
reindexed_corpus['61437']

In [None]:
def _strip_invisible(text):
    """Remove non-breaking spaces (U+00A0) and zero-width spaces (U+200B) from *text*.

    The previous ``replace(u'\\xa0' or '\\u200b', '')`` only removed U+00A0,
    because ``or`` returns its first truthy operand.
    """
    return text.replace('\xa0', '').replace('\u200b', '')


# NOTE(review): both loops index each item by field name, so they expect
# record-style inputs (e.g. the parquet datasets) rather than the id -> text
# dicts produced by json.load — confirm which objects are bound when this runs.
filtered_corpus_indexed = {}
for item in tqdm(filtered_corpus):
    filtered_corpus_indexed[str(item['__cluster__'])] = _strip_invisible(item['text'])

with open("../data/eval/law/filtered_corpus.json", "w", encoding="utf-8") as fOut:
    json.dump(filtered_corpus_indexed, fOut, indent=2)

reindexed_corpus_indexed = {}
for item in tqdm(reindexed_corpus):
    reindexed_corpus_indexed[str(item['oid'])] = _strip_invisible(item['text'])

with open("../data/eval/law/reindexed_corpus.json", "w", encoding="utf-8") as fOut:
    json.dump(reindexed_corpus_indexed, fOut, indent=2)

In [None]:
# Deduplicate test questions: each distinct question gets one query id and the
# union of the cluster ids from all of its rows.
question_to_id = {}
clusters_by_id = {}

for row in tqdm(test_data):
    question = row['question']
    query_id = question_to_id.get(question)
    if query_id is None:
        # Ids are assigned in order of first appearance.
        query_id = f"query_{len(question_to_id)}"
        question_to_id[question] = query_id
        clusters_by_id[query_id] = set()
    clusters_by_id[query_id].update(str(cluster) for cluster in row['__context_cluster__'])

# Final shape: query id -> question text, query id -> list of cluster ids.
dev_queries = {query_id: question for question, query_id in question_to_id.items()}
dev_rel_docs = {query_id: list(clusters) for query_id, clusters in clusters_by_id.items()}

In [None]:
# Persist the evaluation queries (query id -> question text).
with open("../data/eval/law/dev_queries.json", "w", encoding="utf-8") as fOut:
    json.dump(dev_queries, fOut, indent=2)

# Persist the relevance labels (query id -> list of cluster ids).
with open("../data/eval/law/dev_rel_docs.json", "w", encoding="utf-8") as fOut:
    json.dump(dev_rel_docs, fOut, indent=2)

In [None]:
# Collect the context of every training row once per cluster id that is
# missing from the filtered corpus; the trailing expression displays the result.
flag = 0
a = [
    row['context']
    for row in tqdm(train_data)
    for cluster in row["__context_cluster__"]
    if str(cluster) not in filtered_corpus
]
a

In [None]:
### Clean the train data
def map_func(sample):
    """Drop the contexts of *sample* whose cluster id is absent from ``filtered_corpus``.

    ``sample['contextoid']`` and ``sample['__context_cluster__']`` are parallel
    lists; both are filtered together and written back into *sample*, which is
    returned.
    """
    oids = sample['contextoid']
    clusters = sample['__context_cluster__']
    assert len(clusters) == len(oids)

    kept_pairs = [
        (oid, cluster)
        for oid, cluster in zip(oids, clusters)
        if str(cluster) in filtered_corpus
    ]
    sample['contextoid'] = [oid for oid, _ in kept_pairs]
    sample['__context_cluster__'] = [cluster for _, cluster in kept_pairs]
    return sample

# Filter each sample's contexts with map_func, then discard samples left with
# no context at all; the trailing expression displays the resulting dataset.
train_data_clean = train_data.map(map_func, num_proc=16).filter(lambda x: len(x['__context_cluster__']) >= 1, num_proc=16)
train_data_clean

In [None]:
# question -> [set of context oids, set of cluster ids], merged over duplicate questions.
train_data_dict = {}

for row in tqdm(train_data_clean):
    merged = train_data_dict.setdefault(row['question'], [set(), set()])
    merged[0].update(row['contextoid'])
    merged[1].update(row['__context_cluster__'])

# Resolve each oid to its passage text to form the positives.
train_data_json = [
    {
        "query": question,
        "pos": [reindexed_corpus[str(context_id)] for context_id in oids],
        "__context_cluster__": list(clusters),
    }
    for question, (oids, clusters) in train_data_dict.items()
]

In [None]:
train_data_json[15:20]

In [None]:
# One JSON object per line, non-ASCII characters kept as-is.
with open("../data/sft/train_data.jsonl", "w", encoding="utf-8") as fOut:
    fOut.writelines(json.dumps(record, ensure_ascii=False) + '\n' for record in train_data_json)

In [None]:
from datasets import load_dataset
import json

# Load the training file that already contains mined hard negatives.
ds = load_dataset("json", data_files="../data/sft/train_data_minedHN_new.jsonl", split="train", cache_dir="../.cache")

In [None]:
from tqdm import tqdm

# Reverse lookup: passage text -> cluster id (string keys of the JSON corpus).
invert_filtered_corpus = {v: k for k, v in filtered_corpus.items()}

# Sanity check on the mined hard negatives: print every sample in which a
# negative is also a positive, either as identical text or as a passage that
# belongs to one of the sample's positive clusters.
# NOTE(review): assumes ds rows line up index-by-index with train_data_json — confirm.
# tqdm wraps ds (not the enumerate object) so the progress bar knows the total.
for i, item in enumerate(tqdm(ds)):
    # Compare cluster ids as strings: invert_filtered_corpus yields string keys,
    # while '__context_cluster__' may hold raw ids, and a str-vs-int membership
    # test would never match.
    positive_clusters = {str(cluster) for cluster in train_data_json[i]['__context_cluster__']}
    for neg in item['neg']:
        if neg in item['pos'] or invert_filtered_corpus[neg] in positive_clusters:
            print(item)
            break

### Query, Answer, Context

In [None]:
from datasets import load_dataset
import json
from tqdm import tqdm

test_data = load_dataset("parquet", data_files="../data/original/tvpl/dedup/tvpl_sft_resplit/test.parquet",
                         split="train", num_proc=16)

# Context manager so the file handle is closed as soon as the JSON is parsed.
with open("../data/eval/law/filtered_corpus.json", encoding='utf-8') as fIn:
    filtered_corpus = json.load(fIn)

In [None]:
test_data[0]

In [None]:
# Collect, for every distinct test question, the set of its long answers.
dev_queries = {}
dev_queries_answer = {}

index = 0
for item in tqdm(test_data):
    question = item['question']
    if question not in dev_queries:
        dev_queries[question] = f"query_{index}"
        dev_queries_answer[f"query_{index}"] = set()
        index += 1
    # Add the answer exactly once per row. The old code looped over the row's
    # cluster ids here, which re-added the same answer and skipped it entirely
    # for a repeated question whose cluster list was empty.
    dev_queries_answer[dev_queries[question]].add(item['long_answer'])

# Final shape: query id -> question text, query id -> list of answers.
dev_queries = {v: k for k,v in dev_queries.items()}
dev_queries_answer = {k: list(v) for k, v in dev_queries_answer.items()}

In [None]:
# Persist the answers (query id -> list of long answers).
with open("../data/eval/law/dev_answers.json", "w", encoding="utf-8") as fOut:
    json.dump(dev_queries_answer, fOut, indent=2)

In [None]:
from huggingface_hub import HfApi
# Upload the answers file to the LawEval dataset repository on the Hugging Face Hub.
api = HfApi()
api.upload_file(
    path_or_fileobj="../data/eval/law/dev_answers.json",
    repo_id="nntoan209/LawEval",
    path_in_repo="dev_answers.json",
    repo_type="dataset"
)

In [None]:
from huggingface_hub import HfApi
# Upload a saved checkpoint folder to a model repository, skipping everything
# under global_step29610/ via ignore_patterns.
api = HfApi()
api.upload_folder(
    folder_path="../saved_models/bgem3_sft_20240601/checkpoint-29610",
    repo_id="nntoan209/bgem3-sft-msmarco-squadv2-1gpu-e3",
    repo_type="model",
    ignore_patterns="global_step29610/*"
)

In [None]:
from huggingface_hub import HfApi
# Upload the split SFT data folder to the LawFinetuneV2 dataset repository.
api = HfApi()
api.upload_folder(
    folder_path="../data/sft/splitted",
    repo_id="nntoan209/LawFinetuneV2",
    repo_type="dataset"
)

In [None]:
from datasets import load_dataset
import os

# Rewrite, in place, every .jsonl split that contains samples without positives.
for file in os.listdir("../data/sft/tvpl_splitted"):
    if not file.endswith(".jsonl"):
        continue
    print(file)
    ds = load_dataset("json", data_files=f"../data/sft/tvpl_splitted/{file}", split="train")
    filtered_ds = ds.filter(lambda x: len(x['pos']) >= 1, num_proc=32)

    # Only touch the file when the filter actually removed something.
    removed = len(ds) - len(filtered_ds)
    if removed > 0:
        print(f"Filtered {removed} samples")
        filtered_ds.to_json(f"../data/sft/tvpl_splitted/{file}", orient="records", lines=True, num_proc=32)

### Zalo legal 2021

In [None]:
import json
import numpy as np
from datasets import load_dataset
from tqdm import tqdm

# Context managers close the JSON files deterministically instead of leaking the handles.
with open("../data/original/zalo_legal_2021/train_question_answer.json", encoding="utf-8") as fIn:
    # Only the last 640 items are used here; they become the dev queries below.
    train_question_answer = json.load(fIn)['items'][-640:]
with open("../data/eval/law/filtered_corpus.json", encoding='utf-8') as fIn:
    filtered_corpus = json.load(fIn)
zalo_legal_corpus = load_dataset("parquet", data_files="../data/original/tvpl/dedup/data_remapped/zalo_legal_corpus", split="train")
# Keep only the reindexed passages that originate from the Zalo legal corpus.
reindexed_corpus = load_dataset("parquet", data_files="../data/original/tvpl/dedup/merged_corpus/reindexed_corpus.parquet", split="train").filter(lambda x: x['dataset'] == "zalo_legal_corpus", num_proc=16)

In [None]:
zalo_legal_corpus[0]

In [None]:
# cluster id -> list of "<law_id>_<article_id>" ids of the Zalo articles in that cluster.
filtered_corpus_to_zalo_corpus = {str(cluster_id): [] for cluster_id in filtered_corpus}

for row in tqdm(zalo_legal_corpus):
    zalo_id = row['law_id'] + "_" + row['article_id']
    bucket = filtered_corpus_to_zalo_corpus.get(str(row['__cluster__']))
    if bucket is not None:
        bucket.append(zalo_id)

# Deduplicate each bucket; clusters without any Zalo article get a placeholder.
for cluster_id, zalo_ids in filtered_corpus_to_zalo_corpus.items():
    unique_ids = list(set(zalo_ids))
    filtered_corpus_to_zalo_corpus[cluster_id] = unique_ids if unique_ids else ["NOT FOUND"]

len(filtered_corpus_to_zalo_corpus)

In [None]:
# Persist the cluster id -> Zalo article ids mapping.
with open("../data/eval/zalo_legal/filtered_corpus_to_zalo_corpus.json", "w", encoding="utf-8") as fOut:
    json.dump(filtered_corpus_to_zalo_corpus, fOut, indent=2, ensure_ascii=False)

In [None]:
corpus = {}
dev_queries = {}
dev_rel_docs = {}

In [None]:
# oid -> ["<law_id>_<article_id>", cleaned text] for every Zalo legal passage.
zalo_legal_corpus_indexed = {}
for item in tqdm(zalo_legal_corpus):
    article_id = item['article_id']
    law_id = item['law_id']

    # Strip both U+00A0 and U+200B. The previous replace(u'\xa0' or '\u200b', '')
    # only removed U+00A0 because `or` returns its first truthy operand.
    cleaned_text = item['text'].replace('\xa0', '').replace('\u200b', '')
    zalo_legal_corpus_indexed[str(item['oid'])] = [f"{law_id}_{article_id}", cleaned_text]

# Build the evaluation corpus from the passages present in the reindexed corpus.
for item in tqdm(reindexed_corpus):
    zalo_id, passage_text = zalo_legal_corpus_indexed[str(item['oid'])]
    corpus[zalo_id] = passage_text

In [None]:
len(corpus)

In [None]:
# Persist the Zalo legal evaluation corpus (article id -> text).
with open("../data/eval/zalo_legal/filtered_corpus.json", "w", encoding="utf-8") as fOut:
    json.dump(corpus, fOut, indent=2, ensure_ascii=False)

In [None]:
# Each question maps to the "<law_id>_<article_id>" ids of its relevant articles.
for entry in tqdm(train_question_answer):
    question_id = entry['question_id']
    dev_queries[question_id] = entry['question']
    dev_rel_docs[question_id] = [
        f"{article['law_id']}_{article['article_id']}"
        for article in entry['relevant_articles']
    ]

In [None]:
# Persist the Zalo legal dev queries and their relevance labels.
with open("../data/eval/zalo_legal/dev_queries.json", "w", encoding="utf-8") as fOut:
    json.dump(dev_queries, fOut, indent=2, ensure_ascii=False)
with open("../data/eval/zalo_legal/dev_rel_docs.json", "w", encoding="utf-8") as fOut:
    json.dump(dev_rel_docs, fOut, indent=2, ensure_ascii=False)

### Zalo QA

In [None]:
import json

# Load the Zalo QA training file; the trailing expression previews its first record.
with open("../data/train_generic/zaloqa/train.json", encoding='utf-8') as f:
    data = json.load(f)
    
data[0]

In [None]:
corpus = {}
dev_queries = {}
dev_rel_docs = {}

In [None]:
# Assign a sequential "doc_<n>" id to each distinct passage text, in order of first appearance.
corpus_index = 0

for passage in (record['text'] for record in data):
    if passage in corpus:
        continue
    corpus[passage] = f"doc_{corpus_index}"
    corpus_index += 1

In [None]:
# Only records with a truthy label contribute: each distinct question gets a
# query id and accumulates the doc ids of its labelled passages.
query_index = 0
for record in data:
    if not record['label']:
        continue
    question = record['question']
    if question in dev_queries:
        dev_rel_docs[dev_queries[question]].append(corpus[record['text']])
    else:
        new_id = f"query_{query_index}"
        dev_queries[question] = new_id
        dev_rel_docs[new_id] = [corpus[record['text']]]
        query_index += 1

# Flip the lookup tables into their final id -> value orientation and
# deduplicate the relevant doc ids.
dev_queries = {query_id: question for question, query_id in dev_queries.items()}
dev_rel_docs = {query_id: list(set(doc_ids)) for query_id, doc_ids in dev_rel_docs.items()}
corpus = {doc_id: passage for passage, doc_id in corpus.items()}

In [None]:
len(dev_queries)

In [None]:
len(dev_rel_docs)

In [None]:
len(corpus)

In [None]:
l = [item['text'] for item in data]
len(set(l))

In [None]:
i = 0
print(dev_queries[f"query_{i}"])
list(corpus[i] for i in dev_rel_docs[f"query_{i}"])

In [None]:
# Persist the Zalo QA evaluation set: queries, relevance labels and corpus.
with open("../data/eval/zalo_qa/dev_queries.json", "w", encoding="utf-8") as fOut:
    json.dump(dev_queries, fOut, ensure_ascii=False, indent=2)
with open("../data/eval/zalo_qa/dev_rel_docs.json", "w", encoding="utf-8") as fOut:
    json.dump(dev_rel_docs, fOut, indent=2)
with open("../data/eval/zalo_qa/filtered_corpus.json", "w", encoding="utf-8") as fOut:
    json.dump(corpus, fOut, ensure_ascii=False, indent=2)

### Translate queries

In [None]:
import os
import json
from datasets import load_dataset

In [None]:
train_data = load_dataset("json", data_files="../data/sft/train_data.jsonl",
                          split="train", num_proc=16)

# Context managers close the JSON files deterministically instead of leaking the handles.
with open("../data/eval/law/dev_queries.json", encoding="utf-8") as fIn:
    dev_queries = json.load(fIn)
with open("../data/eval/zalo_qa/dev_queries.json", encoding="utf-8") as fIn:
    zalo_qa_queries = json.load(fIn)

In [None]:
queries_to_translate = {}

In [None]:
# Gather every query under a prefixed key: train_<n>, dev_<id>, zaloqa_<id>.
for position, record in enumerate(train_data):
    queries_to_translate[f"train_{position}"] = record["query"]

queries_to_translate.update({f"dev_{query_id}": text for query_id, text in dev_queries.items()})
queries_to_translate.update({f"zaloqa_{query_id}": text for query_id, text in zalo_qa_queries.items()})

In [None]:
# Persist the combined query set that is sent for translation.
with open("../data/eval/queries_to_translate.json", "w", encoding="utf-8") as fOut:
    json.dump(queries_to_translate, fOut, indent=2, ensure_ascii=False)

In [None]:
from huggingface_hub import HfApi

# Upload the query file to the law_queries_translated dataset repository.
api = HfApi()
api.upload_file(
    path_or_fileobj="../data/eval/queries_to_translate.json",
    repo_id="nntoan209/law_queries_translated",
    path_in_repo="queries_to_translate.json",
    repo_type="dataset"
)

In [None]:
from datasets import load_dataset
import json
import os

# Read the translated queries with an explicit encoding and a context manager,
# so decoding does not depend on the locale and the handle is closed promptly.
with open("../data/law_translated/queries/queries_en.json", encoding="utf-8") as fIn:
    queries_en = json.load(fIn)

In [None]:
dev_queries_en = {}
train_queries_en = {}

In [None]:
# Route each translated query by key prefix; keys with any other prefix are ignored.
for key, translated in queries_en.items():
    if key.startswith("train"):
        # "train_<n>" -> "<n>"
        train_queries_en[key.split("_")[1]] = translated
    elif key.startswith("dev"):
        # "dev_<id>" -> "<id>" (everything after the first underscore)
        dev_queries_en[key.partition("_")[2]] = translated

In [None]:
# sort the train_queries_en and dev_queries_en by keys:
# dicts preserve insertion order, so rebuilding from the sorted items is enough.
train_queries_en = dict(sorted(train_queries_en.items(), key=lambda entry: int(entry[0])))
dev_queries_en = dict(sorted(dev_queries_en.items(), key=lambda entry: int(entry[0].split("_")[-1])))

In [None]:
import json
# Persist the translated train and dev queries.
with open("../data/eval/law/train_queries_en.json", "w", encoding="utf-8") as fOut:
    json.dump(train_queries_en, fOut, indent=2, ensure_ascii=False)
with open("../data/eval/law/dev_queries_en.json", "w", encoding="utf-8") as fOut:
    json.dump(dev_queries_en, fOut, indent=2, ensure_ascii=False)

In [6]:
import json
from tqdm import tqdm

# Context managers close the files promptly; the explicit encoding keeps
# decoding independent of the locale.
with open("../data/law_translated/queries/queries_vi.json", encoding='utf-8') as fIn:
    all_queries_vi = json.load(fIn)
with open("../data/law_translated/queries/queries_en.json", encoding='utf-8') as fIn:
    all_queries_en = json.load(fIn)
# Reverse lookup: query text -> query key.
all_queries_vi_invert = {v: k for k, v in all_queries_vi.items()}

In [5]:
tvpl_queries = {}
tvpl_queries_en = {}

In [8]:
# For every mined training sample, recover its query key from the query text
# and collect the original and translated query under that key.
with open("../data/sft/train_data_minedHN_v3.jsonl", encoding="utf-8") as f:
    for raw_line in tqdm(f):
        query_text = json.loads(raw_line)['query']
        query_idx = all_queries_vi_invert[query_text]

        tvpl_queries[query_idx] = query_text
        tvpl_queries_en[query_idx] = all_queries_en[query_idx]

165347it [00:00, 205591.12it/s]


In [14]:
with open("../data/eval/law/train_queries.json", "w", encoding='utf-8') as f:
    json.dump(tvpl_queries, f, indent=2, ensure_ascii=False)

# ensure_ascii=False writes non-ASCII characters verbatim, so the encoding is
# set explicitly instead of depending on the platform's locale default.
with open("../data/eval/law/train_queries_en.json", "w", encoding='utf-8') as f:
    json.dump(tvpl_queries_en, f, indent=2, ensure_ascii=False)

### Translate positive passages

In [None]:
from datasets import load_dataset
from tqdm import tqdm
import json

train_data = load_dataset("parquet", data_files="../data/original/tvpl/dedup/tvpl_sft_resplit/train.parquet",
                          split="train", num_proc=16)

# Context managers close the JSON files deterministically instead of leaking the handles.
with open("../data/eval/law/filtered_corpus.json", encoding='utf-8') as fIn:
    filtered_corpus = json.load(fIn)
# Reverse lookup: passage text -> cluster id.
filtered_corpus_invert = {v: k for k, v in filtered_corpus.items()}
with open("../data/eval/law/reindexed_corpus.json", encoding='utf-8') as fIn:
    reindexed_corpus = json.load(fIn)

In [None]:
### Clean the train data
def map_func(sample):
    """Keep only the (contextoid, cluster) pairs whose cluster is in ``filtered_corpus``.

    The two parallel lists of *sample* are filtered in lockstep and written
    back; the same *sample* mapping is returned.
    """
    assert len(sample['__context_cluster__']) == len(sample['contextoid'])

    surviving_oids, surviving_clusters = [], []
    for oid, cluster in zip(sample['contextoid'], sample['__context_cluster__']):
        # Skip the __context_cluster__ that is not in the filtered corpus
        if str(cluster) not in filtered_corpus:
            continue
        surviving_oids.append(oid)
        surviving_clusters.append(cluster)

    sample['contextoid'] = surviving_oids
    sample['__context_cluster__'] = surviving_clusters
    return sample

# Filter each sample's contexts with map_func, then remove the samples that
# have no positive passages left; the trailing expression displays the dataset.
train_data_clean = train_data.map(map_func, num_proc=16).filter(lambda x: len(x['__context_cluster__']) >= 1, num_proc=16)
train_data_clean

In [None]:
# question -> [context oids, cluster ids]; the two lists stay parallel and
# duplicates are kept so they can be zipped together afterwards.
train_data_dict = {}

for row in tqdm(train_data_clean):
    merged = train_data_dict.setdefault(row['question'], [[], []])
    merged[0].extend(row['contextoid'])
    merged[1].extend(row['__context_cluster__'])

In [None]:
# Replace each positive by an id: "cluster_<id>" when the passage text equals
# its cluster's text in the filtered corpus, otherwise "oid_<id>".
for question, (oids, clusters) in tqdm(train_data_dict.items()):
    tagged_ids = [
        f"cluster_{cluster}"
        if reindexed_corpus[str(oid)] == filtered_corpus[str(cluster)]
        else f"oid_{oid}"
        for oid, cluster in zip(oids, clusters)
    ]

    # Deduplicate both lists before storing them back.
    train_data_dict[question] = (list(set(tagged_ids)), list(set(clusters)))

In [None]:
train_data_dict[list(train_data_dict.keys())[5]]

In [None]:
# Flatten the per-question mapping into training records.
train_data_json = [
    {"query": question, "pos": positives, "__context_cluster__": clusters}
    for question, (positives, clusters) in train_data_dict.items()
]

In [None]:
# One JSON object per line, non-ASCII characters kept as-is.
with open("../data/sft/train_data.jsonl", "w", encoding="utf-8") as fOut:
    fOut.writelines(json.dumps(record, ensure_ascii=False) + '\n' for record in train_data_json)

In [None]:
# Every positive tagged "oid_<id>" needs its passage text looked up by that id.
additional_passages_to_translate = {
    pid: reindexed_corpus[pid.split("_")[1]]
    for entry in train_data_dict.values()
    for pid in entry[0]
    if pid.startswith("oid_")
}

In [None]:
additional_passages_to_translate['oid_61426']

In [None]:
# Persist the extra passages ("oid_<id>" -> text) that still need translation.
with open("../data/law_translated/additional_passages_to_translate.json", "w", encoding="utf-8") as fOut:
    json.dump(additional_passages_to_translate, fOut, indent=2, ensure_ascii=False)

In [None]:
from huggingface_hub import HfApi

# Upload the passages file to the additional_law_translated dataset repository.
api = HfApi()
api.upload_file(
    path_or_fileobj="../data/law_translated/additional_passages_to_translate.json",
    repo_id="nntoan209/additional_law_translated",
    path_in_repo="additional_passages_to_translate.json",
    repo_type="dataset"
)

In [None]:
from datasets import load_dataset
import json
import os

# Load every file in the translated corpus directory whose name starts with "additional".
ds = load_dataset("json", data_dir="../data/law_translated/corpus",
                  data_files=[file for file in os.listdir("../data/law_translated/corpus") if file.startswith("additional")],
                  split="train", num_proc=16)

# The numeric suffix of each 'id' (text after the last underscore) becomes the sort key.
ds_to_sort = ds.map(lambda x: {"doc_id": int(x['id'].split("_")[-1])}, num_proc=16).sort("doc_id")

In [None]:
# doc id (as string) -> translated text, in ascending doc id order.
reindexed_corpus_en = {str(row['doc_id']): row['text'] for row in ds_to_sort}

In [None]:
# Persist the translated passages keyed by doc id.
with open("../data/eval/law/reindexed_corpus_en.json", "w", encoding="utf-8") as fOut:
    json.dump(reindexed_corpus_en, fOut, indent=2, ensure_ascii=False)

### Zalo QA corpus translate

In [None]:
import json

# Context manager so the file handle is closed as soon as the JSON is parsed;
# the trailing expression previews the first document.
with open("../data/eval/zalo_qa/filtered_corpus.json", encoding="utf-8") as fIn:
    zaloqa_corpus = json.load(fIn)
zaloqa_corpus['doc_0']

In [None]:
len(zaloqa_corpus)

In [None]:
from huggingface_hub import HfApi

# Upload the Zalo QA corpus to the zaloqa_corpus_translated dataset repository.
api = HfApi()
api.upload_file(
    path_or_fileobj="../data/eval/zalo_qa/filtered_corpus.json",
    repo_id="nntoan209/zaloqa_corpus_translated",
    path_in_repo="filtered_corpus.json",
    repo_type="dataset"
)

In [None]:
from datasets import load_dataset
import os

# Load every file in the translated corpus directory whose name starts with "zaloqa_corpus".
ds = load_dataset("json", data_dir="../data/law_translated/corpus",
                  data_files=[file for file in os.listdir("../data/law_translated/corpus") if file.startswith("zaloqa_corpus")],
                  split="train", num_proc=16)

# The numeric suffix of each 'id' (text after the last underscore) becomes the sort key.
ds_to_sort = ds.map(lambda x: {"doc_id": int(x['id'].split("_")[-1])}, num_proc=16).sort("doc_id")

In [None]:
dev_queries_en = {}
filtered_corpus_en = {}

In [None]:
# Original document id -> translated text, inserted in sorted doc id order.
filtered_corpus_en.update((row['id'], row['text']) for row in ds_to_sort)

In [None]:
import json
# ensure_ascii=False writes non-ASCII characters verbatim, so the encoding is
# set explicitly instead of depending on the platform's locale default.
with open("../data/eval/zalo_qa/filtered_corpus_en.json", "w", encoding="utf-8") as f:
    json.dump(filtered_corpus_en, f, indent=2, ensure_ascii=False)

In [None]:
# Load all translated queries; keys carry a dataset prefix that is filtered on below.
with open("../data/law_translated/queries/queries_en.json") as f:
    queries_en = json.load(f)

In [None]:
# Keep only the Zalo QA queries, keyed by everything after the "zaloqa_" prefix.
for key, translated in queries_en.items():
    if not key.startswith("zaloqa_"):
        continue
    dev_queries_en[key.partition("_")[2]] = translated

In [None]:
import json
# ensure_ascii=False writes non-ASCII characters verbatim, so the encoding is
# set explicitly instead of depending on the platform's locale default.
with open("../data/eval/zalo_qa/dev_queries_en.json", "w", encoding="utf-8") as f:
    json.dump(dev_queries_en, f, indent=2, ensure_ascii=False)

### Translate Zalo Legal

In [None]:
import json
import os

# Context managers close the JSON files deterministically instead of leaking the handles.
with open("../data/original/zalo_legal_2021/train_question_answer.json", encoding="utf-8") as fIn:
    zalolegal_train_data = json.load(fIn)['items']
with open("../data/eval/zalo_legal/filtered_corpus.json", encoding="utf-8") as fIn:
    zalolegal_corpus = json.load(fIn)

In [None]:
from huggingface_hub import HfApi
api = HfApi()

In [None]:
# Upload the Zalo legal corpus to the zalolegal_corpus_translated dataset repository.
api.upload_file(
    path_or_fileobj="../data/eval/zalo_legal/filtered_corpus.json",
    path_in_repo="filtered_corpus.json",
    repo_id="nntoan209/zalolegal_corpus_translated",
    repo_type="dataset"
)

In [None]:
# question id -> question text for every Zalo legal training item.
zalolegal_queries = {entry['question_id']: entry['question'] for entry in zalolegal_train_data}

In [None]:
# Persist the Zalo legal queries that are sent for translation.
with open("../data/law_translated/zalolegal_queries_vi.json", "w", encoding="utf-8") as fOut:
    json.dump(zalolegal_queries, fOut, indent=2, ensure_ascii=False)

In [None]:
# Upload the query file to the zalolegal_queries_translated dataset repository.
api.upload_file(
    path_or_fileobj="../data/law_translated/zalolegal_queries_vi.json",
    path_in_repo="zalolegal_queries_vi.json",
    repo_id="nntoan209/zalolegal_queries_translated",
    repo_type="dataset"
)

In [None]:
import json

# Context managers close the files promptly; the explicit encoding keeps
# decoding independent of the locale.
with open("../data/law_translated/queries/zalolegal_queries_translated-1-of-1.json", encoding="utf-8") as fIn:
    # The file holds a list of {'id', 'text'} records; index them by id.
    all_queries_en = {item['id']: item['text'] for item in json.load(fIn)}
with open("../data/eval/zalo_legal/dev_queries.json", encoding="utf-8") as fIn:
    dev_queries_vi = json.load(fIn)

In [None]:
# Translated text for each dev query id, in the same key order as dev_queries_vi.
dev_queries_en = {query_id: all_queries_en[query_id] for query_id in dev_queries_vi}

In [None]:
# ensure_ascii=False writes non-ASCII characters verbatim, so the encoding is
# set explicitly instead of depending on the platform's locale default.
with open("../data/eval/zalo_legal/dev_queries_en.json", "w", encoding="utf-8") as f:
    json.dump(dev_queries_en, f, indent=2, ensure_ascii=False)

In [None]:
from datasets import load_dataset
import json
import os

# Context manager so the file handle is closed as soon as the JSON is parsed.
with open("../data/eval/zalo_legal/filtered_corpus.json", encoding="utf-8") as fIn:
    filtered_corpus = json.load(fIn)
# Load every file in the translated corpus directory whose name starts with "zalolegal_corpus".
ds = load_dataset("json", data_dir="../data/law_translated/corpus",
                  data_files=[file for file in os.listdir("../data/law_translated/corpus") if file.startswith("zalolegal_corpus")],
                  split="train", num_proc=8)
# Index the translated passages by id for direct lookup.
ds = {item['id']: item['text'] for item in ds}

In [None]:
# Translated text for each corpus id, in the same key order as filtered_corpus.
filtered_corpus_en = {doc_id: ds[doc_id] for doc_id in filtered_corpus}

In [None]:
# ensure_ascii=False writes non-ASCII characters verbatim, so the encoding is
# set explicitly instead of depending on the platform's locale default.
with open("../data/eval/zalo_legal/filtered_corpus_en.json", "w", encoding="utf-8") as f:
    json.dump(filtered_corpus_en, f, indent=2, ensure_ascii=False)

In [None]:
from huggingface_hub import HfApi
# Upload the whole Zalo legal evaluation folder to the ZaloLegal-CrossLingual dataset repository.
api = HfApi()
api.upload_folder(
    repo_id="nntoan209/ZaloLegal-CrossLingual",
    repo_type="dataset",
    folder_path="../data/eval/zalo_legal",
)

In [1]:
import json

# Context managers close the files promptly; the explicit encoding keeps
# decoding independent of the locale.
with open("../data/original/zalo_legal_2021/train_question_answer.json", encoding="utf-8") as fIn:
    # Everything except the last 640 items is used as training data.
    zalolegal_train = json.load(fIn)['items'][:-640]
with open("../data/law_translated/queries/zalolegal_queries_translated-1-of-1.json", encoding="utf-8") as fIn:
    # The file holds a list of {'id', 'text'} records; index them by id.
    all_queries_en = {item['id']: item['text'] for item in json.load(fIn)}

In [2]:
train_queries = {}
train_queries_en = {}
zalolegal_train_data = []

In [3]:
# Per training question: record the original and translated query, and build a
# training sample whose positives are "<law_id>_<article_id>" ids.
for entry in zalolegal_train:
    question_id = entry['question_id']
    question = entry['question']

    train_queries[question_id] = question
    train_queries_en[question_id] = all_queries_en[question_id]

    zalolegal_train_data.append({
        "query": question,
        "pos": [f"{doc['law_id']}_{doc['article_id']}" for doc in entry['relevant_articles']],
    })

In [4]:
# ensure_ascii=False writes non-ASCII characters verbatim, so the encoding is
# set explicitly; with a non-UTF-8 locale default the write could fail or
# produce a file that UTF-8 readers cannot decode.
with open("../data/sft/zalolegal_train_data.jsonl", "w", encoding="utf-8") as f:
    for item in zalolegal_train_data:
        f.write(json.dumps(item, ensure_ascii=False) + '\n')

In [8]:
# Both files are written with ensure_ascii=False, and train_queries.json is
# read back elsewhere in this notebook with encoding='utf-8', so write them
# as UTF-8 explicitly rather than with the locale default.
with open("../data/eval/zalo_legal/train_queries.json", "w", encoding="utf-8") as f:
    json.dump(train_queries, f, indent=2, ensure_ascii=False)
with open("../data/eval/zalo_legal/train_queries_en.json", "w", encoding="utf-8") as f:
    json.dump(train_queries_en, f, indent=2, ensure_ascii=False)

### Convert TVPL and Zalo Legal train data to all ids

In [8]:
import json

# Context manager so the queries file handle is closed after parsing.
with open("../data/eval/law/train_queries.json", encoding='utf-8') as fIn:
    tvpl_train_queries = json.load(fIn)
# Reverse lookup: query text -> query id.
tvpl_train_queries_invert = {v: k for k, v in tvpl_train_queries.items()}

# Rewrite each mined sample so the query is referenced by id and the positives
# use corpus ids.
tvpl_train_data = []
with open("../data/sft/train_data_minedHN_v3.jsonl", encoding='utf-8') as fIn:
    for line in fIn:
        item = json.loads(line)
        query_id = tvpl_train_queries_invert[item['query']]
        new_pos = []
        for p in item['pos']:
            if p.startswith("oid_"):
                # "oid_<id>" positives keep their prefixed id.
                new_pos.append(p)
            elif p.startswith("cluster_"):
                # "cluster_<id>" positives are reduced to the bare cluster id.
                new_pos.append(p.split("_")[-1])
            # Positives with any other prefix are dropped, as before.

        tvpl_train_data.append({"query": query_id,
                                "pos": new_pos,
                                "neg": item['neg']})

In [11]:
# ensure_ascii=False writes non-ASCII characters verbatim, so the encoding is
# set explicitly instead of depending on the platform's locale default.
with open("../data/sft/train_data_minedHN_v3_ids.jsonl", "w", encoding="utf-8") as f:
    for item in tvpl_train_data:
        f.write(json.dumps(item, ensure_ascii=False) + '\n')

In [12]:
# Context manager so the queries file handle is closed after parsing.
with open("../data/eval/zalo_legal/train_queries.json", encoding='utf-8') as fIn:
    zalolegal_train_queries = json.load(fIn)
# Reverse lookup: query text -> query id.
zalolegal_train_queries_invert = {v: k for k, v in zalolegal_train_queries.items()}

# Rewrite each mined sample so the query is referenced by id; positives and
# negatives are copied through unchanged.
zalolegal_train_data = []
with open("../data/sft/zalolegal_train_data_minedHN.jsonl", encoding='utf-8') as fIn:
    for line in fIn:
        item = json.loads(line)
        query_id = zalolegal_train_queries_invert[item['query']]
        zalolegal_train_data.append({"query": query_id,
                                     "pos": item['pos'],
                                     "neg": item['neg']})

In [15]:
# ensure_ascii=False writes non-ASCII characters verbatim, so the encoding is
# set explicitly instead of depending on the platform's locale default.
with open("../data/sft/zalolegal_train_data_minedHN_ids.jsonl", "w", encoding="utf-8") as f:
    for item in zalolegal_train_data:
        f.write(json.dumps(item, ensure_ascii=False) + '\n')