In [1]:
from config import Param

# Build the experiment configuration once; the cells below read `args`.
param = Param()
args = param.args

# data_sampler._read_data branches on `task_name`, so mirror the dataset name into it.
args.task_name = args.dataname

# rel_per_task: 8 relations per task for FewRel, 4 for every other dataset.
args.rel_per_task = 8 if args.dataname == "FewRel" else 4

In [2]:
# Print the resolved configuration so the settings used are recorded in the cell output.
print(args)

Namespace(gpu=0, dataname='TACRED', task_name='TACRED', device='cuda', batch_size=64, num_tasks=10, rel_per_task=4, pattern='entity_marker', max_length=192, encoder_output_size=768, vocab_size=30522, marker_size=4, num_workers=0, save_checkpoint='./checkpoint/', classifier_lr=0.01, encoder_lr=0.001, prompt_pool_lr=0.001, sgd_momentum=0.1, gmm_num_components=1, pull_constraint_coeff=0.1, classifier_epochs=10, encoder_epochs=10, prompt_pool_epochs=10, replay_s_e_e=256, replay_epochs=100, seed=2021, max_grad_norm=10, data_path='./datasets', bert_path='bert-base-uncased', cov_mat=True, max_num_models=10, sample_freq=5, prompt_length=1, prompt_embed_dim=768, prompt_pool_size=80, prompt_top_k=8, prompt_init='uniform', prompt_key_init='uniform', drop_p=0.1, gradient_accumulation_steps=4, total_round=6, drop_out=0.5, use_gpu=True, hidden_size=768, rank_lora=8, bge_model='BAAI/bge-m3', description_path='/kaggle/input/data-relation/datasets/description/ngoc.json', type_similar='colbert')


In [3]:
import pickle
import random
import json, os
from transformers import BertTokenizer
import numpy as np  


def get_tokenizer(args):
    """Load the BERT tokenizer at ``args.bert_path`` with the entity markers registered.

    The four markers ``[E11]``, ``[E12]``, ``[E21]`` and ``[E22]`` are passed
    to ``BertTokenizer.from_pretrained`` as additional special tokens.

    Args:
        args: Config object; only ``args.bert_path`` is read.

    Returns:
        Whatever ``BertTokenizer.from_pretrained`` returns.
    """
    entity_markers = ["[E11]", "[E12]", "[E21]", "[E22]"]
    return BertTokenizer.from_pretrained(
        args.bert_path,
        additional_special_tokens=entity_markers,
    )


class data_sampler(object):
    """Iterator that serves a relation dataset as a sequence of tasks.

    Relations are visited in a shuffled order, ``args.rel_per_task`` at a
    time. Each ``next()`` call returns the train/valid/test samples of the
    current relations plus the test data and relation names accumulated over
    all tasks served so far.
    """

    def __init__(self, args, seed=None):
        """Resolve file paths, load relations and samples, and reset the iterator.

        Args:
            args: Namespace-like config. ``dataname``, ``data_path``, ``seed``
                and ``rel_per_task`` are read here; ``set_path`` writes
                ``data_file``, ``relation_file`` and the ``num_of_*`` fields
                onto it.
            seed: Seed for the ``random`` module. ``None`` leaves the global
                RNG state untouched before shuffling.
        """
        self.set_path(args)
        self.args = args

        # Tokenized splits are cached at datasets/_process_path/<dataname>-<args.seed>.pkl
        # (relative to the working directory).
        # NOTE(review): the cache is keyed on args.seed while the shuffling in
        # _read_data uses the `seed` argument; if the two differ, a stale cache
        # is loaded silently. Callers should keep them equal.
        file_name = "{}.pkl".format("-".join([str(x) for x in [args.dataname, args.seed]]))
        mid_dir = os.path.join("datasets", "_process_path")
        os.makedirs(mid_dir, exist_ok=True)
        self.save_data_path = os.path.join(mid_dir, file_name)

        # import tokenizer
        self.tokenizer = get_tokenizer(args)

        # read relation data
        self.id2rel, self.rel2id = self._read_relations(args.relation_file)

        # random sampling of the relation order
        self.set_seed(seed)

        # regenerate data (or load it from the cache)
        self.training_dataset, self.valid_dataset, self.test_dataset = self._read_data(self.args.data_file)

        # generate the task number
        self.batch = 0
        self.task_length = len(self.id2rel) // self.args.rel_per_task

        # record relations
        self.seen_relations = []
        self.history_test_data = {}

        # Reload the relation list from the dataset-specific file; every
        # dataset other than FewRel falls back to the TACRED list.
        if args.dataname in ["FewRel"]:
            relation_file = os.path.join(args.data_path, "id2rel.json")
        else:
            relation_file = os.path.join(args.data_path, "id2rel_tacred.json")
        self.id2rel, self.rel2id = self._read_relations(relation_file)

    def set_path(self, args):
        """Write dataset-specific file paths and split sizes onto ``args``.

        Only ``"FewRel"`` and ``"TACRED"`` are recognised; for any other
        ``args.dataname`` the namespace is left unchanged.
        """
        use_marker = ""
        if args.dataname in ["FewRel"]:
            data_name = "data_with{}_marker.json".format(use_marker)
            relation_name = "id2rel.json"
            num_of_relation = 80
        elif args.dataname in ["TACRED"]:
            data_name = "data_with{}_marker_tacred.json".format(use_marker)
            relation_name = "id2rel_tacred.json"
            num_of_relation = 40
        else:
            return

        args.data_file = os.path.join(args.data_path, data_name)
        args.relation_file = os.path.join(args.data_path, relation_name)
        args.num_of_relation = num_of_relation
        args.num_of_train = 420
        args.num_of_val = 140
        args.num_of_test = 140

    def set_seed(self, seed):
        """Store ``seed`` and redraw the shuffled relation order.

        When ``seed`` is not ``None`` the global ``random`` module is seeded
        first, so the resulting order is reproducible.
        """
        self.seed = seed
        if self.seed is not None:
            random.seed(self.seed)
        self.shuffle_index = list(range(len(self.id2rel)))
        random.shuffle(self.shuffle_index)
        self.shuffle_index = np.argsort(self.shuffle_index)

    def __iter__(self):
        return self

    def __next__(self):
        """Return the data of the next task.

        Returns:
            Tuple ``(train, valid, test, current_relations, history_test,
            seen_relations)``. The first three map relation name to that
            relation's samples for the current task; the last two are the
            sampler's own accumulating objects (not copies).

        Raises:
            StopIteration: once ``task_length`` tasks have been served.
        """
        if self.batch == self.task_length:
            raise StopIteration()

        start = self.args.rel_per_task * self.batch
        indexs = self.shuffle_index[start : start + self.args.rel_per_task]
        self.batch += 1

        current_relations = []
        cur_training_data = {}
        cur_valid_data = {}
        cur_test_data = {}

        for index in indexs:
            relation = self.id2rel[index]
            current_relations.append(relation)
            self.seen_relations.append(relation)
            cur_training_data[relation] = self.training_dataset[index]
            cur_valid_data[relation] = self.valid_dataset[index]
            cur_test_data[relation] = self.test_dataset[index]
            self.history_test_data[relation] = self.test_dataset[index]

        return cur_training_data, cur_valid_data, cur_test_data, current_relations, self.history_test_data, self.seen_relations

    def _read_data(self, file):
        """Load the tokenized train/val/test splits, building them if not cached.

        If ``self.save_data_path`` exists it is unpickled and returned as is.
        Otherwise ``file`` (JSON mapping relation name to a list of samples)
        is read, each relation's samples are shuffled and tokenized, the
        splits are assigned, and the result is pickled to the cache path.

        Returns:
            ``(train, val, test)``: three lists indexed by relation id, each
            entry a list of dicts with keys ``relation``, ``text``, ``tokens``.
        """
        if os.path.isfile(self.save_data_path):
            # SECURITY: pickle.load can execute arbitrary code; this file must
            # only ever be a cache written by this class on a trusted machine.
            with open(self.save_data_path, "rb") as f:
                train_dataset, val_dataset, test_dataset = pickle.load(f)
            return train_dataset, val_dataset, test_dataset

        with open(file, "r", encoding="utf-8") as f:
            data = json.load(f)

        train_dataset = [[] for _ in range(self.args.num_of_relation)]
        val_dataset = [[] for _ in range(self.args.num_of_relation)]
        test_dataset = [[] for _ in range(self.args.num_of_relation)]

        for relation, rel_samples in data.items():
            # Re-seed per relation so each relation's shuffle is independent
            # of the order in which relations appear in the file.
            if self.seed is not None:
                random.seed(self.seed)
            random.shuffle(rel_samples)

            test_quota = len(rel_samples) // 5
            num_test = 0
            num_train = 0
            for i, sample in enumerate(rel_samples):
                text = " ".join(sample["tokens"])
                tokenized_sample = {}
                tokenized_sample["relation"] = self.rel2id[sample["relation"]]
                tokenized_sample["text"] = text
                tokenized_sample["tokens"] = self.tokenizer.encode(text, padding="max_length", truncation=True, max_length=self.args.max_length)

                rel_id = self.rel2id[relation]
                if self.args.task_name == "FewRel":
                    # Fixed-size split: first num_of_train -> train,
                    # next num_of_val -> val, remainder -> test.
                    if i < self.args.num_of_train:
                        train_dataset[rel_id].append(tokenized_sample)
                    elif i < self.args.num_of_train + self.args.num_of_val:
                        val_dataset[rel_id].append(tokenized_sample)
                    else:
                        test_dataset[rel_id].append(tokenized_sample)
                else:
                    # First fifth of the samples goes to test, the rest to
                    # train (capped at 320); no validation split is built.
                    # `<= 40` admits up to 41 test samples; kept unchanged so
                    # existing splits and caches stay reproducible.
                    if i < test_quota and num_test <= 40:
                        num_test += 1
                        test_dataset[rel_id].append(tokenized_sample)
                    else:
                        num_train += 1
                        train_dataset[rel_id].append(tokenized_sample)
                        if num_train >= 320:
                            break

        with open(self.save_data_path, "wb") as f:
            pickle.dump((train_dataset, val_dataset, test_dataset), f)
        return train_dataset, val_dataset, test_dataset

    def _read_relations(self, file):
        """Read the relation list from ``file`` and build the reverse lookup.

        Returns:
            ``(id2rel, rel2id)``: the parsed JSON content and a dict mapping
            each of its elements to its position.
        """
        with open(file, "r", encoding="utf-8") as f:
            id2rel = json.load(f)
        rel2id = {name: idx for idx, name in enumerate(id2rel)}
        return id2rel, rel2id


In [4]:
import json

# NOTE(review): absolute, machine-specific path; point this at the local copy of the dataset.
file = "/home/luungoc/Thesis - 2023.2/Thesis_NgocLT/datasets/data_with_marker_tacred.json"
# Read inside a context manager so the handle is closed as soon as parsing is done.
with open(file, "r", encoding="utf-8") as data_fp:
    data = json.load(data_fp)

In [5]:
import random

# Seed Python's global `random` module from the configured seed and echo the value used.
random.seed(args.seed)
print(args.seed)

2021


In [13]:
# Walk through every task once and keep five training sentences
# (samples 5..9) per relation as prototype examples.
# The sampler's cache file is keyed on args.seed, so the shuffle seed is taken
# from the same place instead of a separate hard-coded literal.
data = data_sampler(args, seed=args.seed)
list_data = []

for training_data, valid_data, test_data, current_relations, historic_test_data, seen_relations in data:
    for relation in current_relations:
        list_data.append({
            'relation': relation,
            'text': [item['text'] for item in training_data[relation][5:10]]
        })
    # One line per task: the relations that task introduces.
    print(current_relations)
    
    

['per:cities_of_residence', 'per:other_family', 'org:founded', 'per:origin']
['per:cause_of_death', 'org:dissolved', 'per:employee_of', 'org:member_of']
['per:parents', 'per:alternate_names', 'org:top_members/employees', 'per:siblings']
['per:stateorprovinces_of_residence', 'org:alternate_names', 'org:country_of_headquarters', 'per:country_of_birth']
['per:children', 'per:date_of_birth', 'org:founded_by', 'per:countries_of_residence']
['per:schools_attended', 'org:subsidiaries', 'org:members', 'org:political/religious_affiliation']
['org:stateorprovince_of_headquarters', 'per:charges', 'per:stateorprovince_of_birth', 'per:title']
['per:stateorprovince_of_death', 'org:number_of_employees/members', 'per:city_of_death', 'per:spouse']
['org:website', 'per:age', 'per:city_of_birth', 'per:date_of_death']
['org:shareholders', 'org:parents', 'org:city_of_headquarters', 'per:religion']


In [15]:
# Write through a context manager so the file is flushed and closed; UTF-8 is
# set explicitly because ensure_ascii=False emits non-ASCII characters verbatim.
with open('./prototype.json', 'w', encoding='utf-8') as prototype_file:
    json.dump(list_data, prototype_file, ensure_ascii=False)

# Last expression of the cell, so the number of saved entries is actually displayed.
len(list_data)

In [18]:
import json

# NOTE(review): absolute, machine-specific path; adjust to the local checkout.
# The context manager closes the handle once the JSON has been parsed.
with open('/home/luungoc/Thesis - 2023.2/Thesis_NgocLT/datasets/description/all.json', 'r', encoding='utf-8') as description_file:
    x = json.load(description_file)

In [39]:
text = """The relation name "org:stateorprovince_of_headquarters" refers to the specific type of relationship between an organization (E11, E12) and the state or province where that organization's headquarters are located (E21, E22). This relationship is identified from textual data that explicitly or implicitly mentions the geographical location of the main office or central operating facility of an organization. The examples provided illustrate various instances where the headquarters of an organization are linked to a particular state or province, showcasing the diversity in organizational types and the geographical spread of their central operations.

The "org:stateorprovince_of_headquarters" relation is pivotal for understanding the geographic distribution of organizational headquarters, which can offer insights into the regional economic impact, employment patterns, and the strategic decisions behind where companies choose to base their main operations. This relationship helps in mapping out the corporate landscape of different regions, indicating the concentration of industries, the preference of organizations for certain locales due to logistical, economic, or regulatory advantages, and the potential influence of these organizations on local economies and policies.

Understanding this relation is crucial for economic analysis, regional planning, and the study of corporate strategies. It aids in identifying how organizations are distributed across states or provinces, which can be essential for supply chain logistics, marketing strategies, and understanding the regional dynamics of business operations. This relation also has implications for tax policies, infrastructure development, and workforce distribution, offering a lens through which to view the interactions between corporations and the geographical contexts of their headquarters.
"""
# Overwrite the description text of every entry for this relation;
# "Yes" is printed once per entry that gets replaced.
for item in x:
    if item['relation'] != "org:stateorprovince_of_headquarters":
        continue
    print("Yes")
    item['text'] = text

Yes


In [4]:
import json

def save_to_jsonl(data, file_path):
    """
    Save data to a JSONL file.

    Parameters:
        data (list): Objects to write, one JSON document per line.
        file_path (str): Path of the JSONL file; it is overwritten if it exists.

    Returns:
        None
    """
    with open(file_path, "w") as jsonl_file:
        # Serialize each object and terminate it with a newline.
        jsonl_file.writelines(json.dumps(record) + "\n" for record in data)

    print("Dữ liệu đã được lưu vào file JSONL:", file_path)
    
    
    
# NOTE(review): absolute, machine-specific path; adjust to the local checkout.
# The context manager closes the handle once the JSON has been parsed.
with open('/home/luungoc/Thesis - 2023.2/train_bge.json', 'r', encoding='utf-8') as train_file:
    a = json.load(train_file)

# Give every record its own (initially empty) list under the 'neg' key.
for item in a:
    item['neg'] = []

save_to_jsonl(a, './train_retrieval.jsonl')

Dữ liệu đã được lưu vào file JSONL: ./train_retrieval.jsonl


{'query': 'The relation "per:cities_of_residence" identifies and connects individuals (E11-E12) with cities or locations (E21-E22) where they reside or have resided. This relation is crucial for understanding geographical ties or the significance of certain locations in a person\'s life. Analyzing examples provided, we can observe the nature and application of this relation in various contexts.\nThis relation maps individuals to geographical locations where they live or have a significant presence. It\'s not limited to cities but can include towns, regions, or even specific areas like islands or rivers if they\'re central to the individual\'s life or activities. The relation is critical for understanding personal, social, and professional ties to specific places, contributing to a comprehensive view of a person\'s life and actions. It can be explicitly stated, as in an individual being mentioned as living in a place, or implied through significant involvement or influence in the area.\